diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index b0fb983..b6c2b2d 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -74,7 +74,7 @@ jobs:
       - uses: conda-incubator/setup-miniconda@v2
         with:
           auto-update-conda: true
-          python-version: '3.7'
+          python-version: '3.10'
 
       - name: Set preferred device to CPU
         shell: bash -l {0}
@@ -169,7 +169,7 @@ jobs:
       - uses: conda-incubator/setup-miniconda@v2
         with:
          auto-update-conda: true
-          python-version: '3.7'
+          python-version: '3.10'
 
       - name: Set preferred device to GPU
         shell: bash -l {0}
diff --git a/README.md b/README.md
index 624f35a..417738c 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@
 **Features:**
 
 - [x] Image annotation for polygon, rectangle, circle, line and point.
-- [x] Auto-labeling with YOLOv5 and Segment Anything.
+- [x] Auto-labeling with YOLOv8, Segment Anything (SAM, SAM2).
 - [x] Text detection, recognition and KIE (Key Information Extraction) labeling.
 - [x] Multiple languages availables: English, Vietnamese, Chinese.
@@ -49,7 +49,7 @@
 
 ### 2. Install from Pypi
 
-- Requirements: Python >= 3.8, <= 3.12. Recommended: Python 3.12.
+- Requirements: Python 3.10+. Recommended: Python 3.12.
 - Recommended: [Miniconda/Anaconda](https://docs.conda.io/en/latest/miniconda.html).
 - Create environment:
@@ -126,4 +126,4 @@ If you want to contribute to **AnyLabeling**, please read [Contribution Guidelin
 - Labeling UI built with ideas and components from [LabelImg](https://github.com/heartexlabs/labelImg), [LabelMe](https://github.com/wkentaro/labelme).
 - Auto-labeling with [Segment Anything Models](https://segment-anything.com/), [MobileSAM](https://github.com/ChaoningZhang/MobileSAM).
-- Auto-labeling with [YOLOv5](https://github.com/ultralytics/yolov5), [YOLOv8](https://github.com/ultralytics/ultralytics).
+- Auto-labeling with [YOLOv8](https://github.com/ultralytics/ultralytics).
diff --git a/anylabeling/app_info.py b/anylabeling/app_info.py
index ec4ce92..f236fd4 100644
--- a/anylabeling/app_info.py
+++ b/anylabeling/app_info.py
@@ -1,4 +1,4 @@
 __appname__ = "AnyLabeling"
 __appdescription__ = "Effortless data labeling with AI support"
-__version__ = "0.3.3"
+__version__ = "0.4.0"
 __preferred_device__ = "CPU"  # GPU or CPU
diff --git a/anylabeling/configs/auto_labeling/models.yaml b/anylabeling/configs/auto_labeling/models.yaml
index 523265b..2c05408 100644
--- a/anylabeling/configs/auto_labeling/models.yaml
+++ b/anylabeling/configs/auto_labeling/models.yaml
@@ -1,3 +1,15 @@
+- name: "sam2_hiera_tiny_20240803"
+  display_name: Segment Anything 2 (Hiera-Tiny)
+  download_url: https://huggingface.co/vietanhdev/segment-anything-2-onnx-models/resolve/main/sam2_hiera_tiny.zip
+- name: "sam2_hiera_small_20240803"
+  display_name: Segment Anything 2 (Hiera-Small)
+  download_url: https://huggingface.co/vietanhdev/segment-anything-2-onnx-models/resolve/main/sam2_hiera_small.zip
+- name: "sam2_hiera_base_plus_20240803"
+  display_name: Segment Anything 2 (Hiera-Base+)
+  download_url: https://huggingface.co/vietanhdev/segment-anything-2-onnx-models/resolve/main/sam2_hiera_base_plus.zip
+- name: "sam2_hiera_large_20240803"
+  display_name: Segment Anything 2 (Hiera-Large)
+  download_url: https://huggingface.co/vietanhdev/segment-anything-2-onnx-models/resolve/main/sam2_hiera_large.zip
 - name: "mobile_sam_20230629"
   display_name: Segment Anything (MobileSAM)
   download_url: https://huggingface.co/vietanhdev/segment-anything-onnx-models/resolve/main/mobile_sam_20230629.zip
@@ -19,22 +31,6 @@
 - name: "sam_vit_h_4b8939_quant"
   display_name: Segment Anything (ViT-H Quant)
   download_url: https://huggingface.co/vietanhdev/segment-anything-onnx-models/resolve/main/sam_vit_h_4b8939_quant.zip
-- name: "yolov5n-r20230415"
-  display_name: YOLOv5n
-  download_url: https://github.com/vietanhdev/anylabeling-assets/releases/download/v0.4.0/yolov5n-r20230415.zip
-- name: "yolov5s-r20230415"
-  display_name: YOLOv5s
-  download_url: https://github.com/vietanhdev/anylabeling-assets/releases/download/v0.4.0/yolov5s-r20230415.zip
-- name: "yolov5m-r20230415"
-  display_name: YOLOv5m
-  download_url: https://github.com/vietanhdev/anylabeling-assets/releases/download/v0.4.0/yolov5m-r20230415.zip
-- name: "yolov5l-r20230415"
-  display_name: YOLOv5l
-  download_url: https://github.com/vietanhdev/anylabeling-assets/releases/download/v0.4.0/yolov5l-r20230415.zip
-- name: "yolov5x-r20230415"
-  display_name: YOLOv5x
-  download_url: https://github.com/vietanhdev/anylabeling-assets/releases/download/v0.4.0/yolov5x-r20230415.zip
-- name: "yolov8n-r20230415"
   display_name: YOLOv8n
   download_url: https://github.com/vietanhdev/anylabeling-assets/releases/download/v0.4.0/yolov8n-r20230415.zip
 - name: "yolov8s-r20230415"
diff --git a/anylabeling/services/auto_labeling/sam2_onnx.py b/anylabeling/services/auto_labeling/sam2_onnx.py
new file mode 100644
index 0000000..1ad5a1a
--- /dev/null
+++ b/anylabeling/services/auto_labeling/sam2_onnx.py
@@ -0,0 +1,332 @@
+# Code from: https://github.com/vietanhdev/samexporter/blob/main/samexporter/sam2_onnx.py
+import time
+from typing import Any
+
+import cv2
+import numpy as np
+import onnxruntime
+from numpy import ndarray
+
+
+class SegmentAnything2ONNX:
+    """Segmentation model using Segment Anything 2 (SAM2)"""
+
+    def __init__(self, encoder_model_path, decoder_model_path) -> None:
+        self.encoder = SAM2ImageEncoder(encoder_model_path)
+        self.decoder = SAM2ImageDecoder(
+            decoder_model_path, self.encoder.input_shape[2:]
+        )
+
+    def encode(self, cv_image: np.ndarray) -> list[np.ndarray]:
+        original_size = cv_image.shape[:2]
+        high_res_feats_0, high_res_feats_1, image_embed = self.encoder(cv_image)
+        return {
+            "high_res_feats_0": high_res_feats_0,
+            "high_res_feats_1": high_res_feats_1,
+            "image_embedding": image_embed,
+            "original_size": original_size,
+        }
+
+    def predict_masks(self, embedding, prompt) -> list[np.ndarray]:
+        points = []
+        labels = []
+        for mark in prompt:
+            if mark["type"] == "point":
+                points.append(mark["data"])
+                labels.append(mark["label"])
+            elif mark["type"] == "rectangle":
+                points.append([mark["data"][0], mark["data"][1]])  # top left
+                points.append(
+                    [mark["data"][2], mark["data"][3]]
+                )  # bottom right
+                labels.append(2)
+                labels.append(3)
+        points, labels = np.array(points), np.array(labels)
+
+        image_embedding = embedding["image_embedding"]
+        high_res_feats_0 = embedding["high_res_feats_0"]
+        high_res_feats_1 = embedding["high_res_feats_1"]
+        original_size = embedding["original_size"]
+        self.decoder.set_image_size(original_size)
+        masks, _ = self.decoder(
+            image_embedding,
+            high_res_feats_0,
+            high_res_feats_1,
+            points,
+            labels,
+        )
+
+        return masks
+
+    def transform_masks(self, masks, original_size, transform_matrix):
+        """Transform the masks back to the original image size."""
+        output_masks = []
+        for batch in range(masks.shape[0]):
+            batch_masks = []
+            for mask_id in range(masks.shape[1]):
+                mask = masks[batch, mask_id]
+                mask = cv2.warpAffine(
+                    mask,
+                    transform_matrix[:2],
+                    (original_size[1], original_size[0]),
+                    flags=cv2.INTER_LINEAR,
+                )
+                batch_masks.append(mask)
+            output_masks.append(batch_masks)
+        return np.array(output_masks)
+
+
+class SAM2ImageEncoder:
+    def __init__(self, path: str) -> None:
+        # Initialize model
+        self.session = onnxruntime.InferenceSession(
+            path, providers=onnxruntime.get_available_providers()
+        )
+
+        # Get model info
+        self.get_input_details()
+        self.get_output_details()
+
+    def __call__(
+        self, image: np.ndarray
+    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+        return self.encode_image(image)
+
+    def encode_image(
+        self, image: np.ndarray
+    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+        input_tensor = self.prepare_input(image)
+
+        outputs = self.infer(input_tensor)
+
+        return self.process_output(outputs)
+
+    def prepare_input(self, image: np.ndarray) -> np.ndarray:
+        self.img_height, self.img_width = image.shape[:2]
+
+        input_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        input_img = cv2.resize(input_img, (self.input_width, self.input_height))
+
+        mean = np.array([0.485, 0.456, 0.406])
+        std = np.array([0.229, 0.224, 0.225])
+        input_img = (input_img / 255.0 - mean) / std
+        input_img = input_img.transpose(2, 0, 1)
+        input_tensor = input_img[np.newaxis, :, :, :].astype(np.float32)
+
+        return input_tensor
+
+    def infer(self, input_tensor: np.ndarray) -> list[np.ndarray]:
+        start = time.perf_counter()
+        outputs = self.session.run(
+            self.output_names, {self.input_names[0]: input_tensor}
+        )
+
+        print(f"infer time: {(time.perf_counter() - start) * 1000:.2f} ms")
+        return outputs
+
+    def process_output(
+        self, outputs: list[np.ndarray]
+    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+        return outputs[0], outputs[1], outputs[2]
+
+    def get_input_details(self) -> None:
+        model_inputs = self.session.get_inputs()
+        self.input_names = [
+            model_inputs[i].name for i in range(len(model_inputs))
+        ]
+
+        self.input_shape = model_inputs[0].shape
+        self.input_height = self.input_shape[2]
+        self.input_width = self.input_shape[3]
+
+    def get_output_details(self) -> None:
+        model_outputs = self.session.get_outputs()
+        self.output_names = [
+            model_outputs[i].name for i in range(len(model_outputs))
+        ]
+
+
+class SAM2ImageDecoder:
+    def __init__(
+        self,
+        path: str,
+        encoder_input_size: tuple[int, int],
+        orig_im_size: tuple[int, int] = None,
+        mask_threshold: float = 0.0,
+    ) -> None:
+        # Initialize model
+        self.session = onnxruntime.InferenceSession(
+            path, providers=onnxruntime.get_available_providers()
+        )
+
+        self.orig_im_size = (
+            orig_im_size if orig_im_size is not None else encoder_input_size
+        )
+        self.encoder_input_size = encoder_input_size
+        self.mask_threshold = mask_threshold
+        self.scale_factor = 4
+
+        # Get model info
+        self.get_input_details()
+        self.get_output_details()
+
+    def __call__(
+        self,
+        image_embed: np.ndarray,
+        high_res_feats_0: np.ndarray,
+        high_res_feats_1: np.ndarray,
+        point_coords: list[np.ndarray] | np.ndarray,
+        point_labels: list[np.ndarray] | np.ndarray,
+    ) -> tuple[list[np.ndarray], ndarray]:
+
+        return self.predict(
+            image_embed,
+            high_res_feats_0,
+            high_res_feats_1,
+            point_coords,
+            point_labels,
+        )
+
+    def predict(
+        self,
+        image_embed: np.ndarray,
+        high_res_feats_0: np.ndarray,
+        high_res_feats_1: np.ndarray,
+        point_coords: list[np.ndarray] | np.ndarray,
+        point_labels: list[np.ndarray] | np.ndarray,
+    ) -> tuple[list[np.ndarray], ndarray]:
+
+        inputs = self.prepare_inputs(
+            image_embed,
+            high_res_feats_0,
+            high_res_feats_1,
+            point_coords,
+            point_labels,
+        )
+
+        outputs = self.infer(inputs)
+
+        return self.process_output(outputs)
+
+    def prepare_inputs(
+        self,
+        image_embed: np.ndarray,
+        high_res_feats_0: np.ndarray,
+        high_res_feats_1: np.ndarray,
+        point_coords: list[np.ndarray] | np.ndarray,
+        point_labels: list[np.ndarray] | np.ndarray,
+    ):
+
+        input_point_coords, input_point_labels = self.prepare_points(
+            point_coords, point_labels
+        )
+
+        num_labels = input_point_labels.shape[0]
+        mask_input = np.zeros(
+            (
+                num_labels,
+                1,
+                self.encoder_input_size[0] // self.scale_factor,
+                self.encoder_input_size[1] // self.scale_factor,
+            ),
+            dtype=np.float32,
+        )
+        has_mask_input = np.array([0], dtype=np.float32)
+
+        return (
+            image_embed,
+            high_res_feats_0,
+            high_res_feats_1,
+            input_point_coords,
+            input_point_labels,
+            mask_input,
+            has_mask_input,
+        )
+
+    def prepare_points(
+        self,
+        point_coords: list[np.ndarray] | np.ndarray,
+        point_labels: list[np.ndarray] | np.ndarray,
+    ) -> tuple[np.ndarray, np.ndarray]:
+
+        if isinstance(point_coords, np.ndarray):
+            input_point_coords = point_coords[np.newaxis, ...]
+            input_point_labels = point_labels[np.newaxis, ...]
+        else:
+            max_num_points = max([coords.shape[0] for coords in point_coords])
+            # We need to make sure that all inputs have the same number of points
+            # Add invalid points to pad the input (0, 0) with -1 value for labels
+            input_point_coords = np.zeros(
+                (len(point_coords), max_num_points, 2), dtype=np.float32
+            )
+            input_point_labels = (
+                np.ones((len(point_coords), max_num_points), dtype=np.float32)
+                * -1
+            )
+
+            for i, (coords, labels) in enumerate(
+                zip(point_coords, point_labels)
+            ):
+                input_point_coords[i, : coords.shape[0], :] = coords
+                input_point_labels[i, : labels.shape[0]] = labels
+
+        input_point_coords[..., 0] = (
+            input_point_coords[..., 0]
+            / self.orig_im_size[1]
+            * self.encoder_input_size[1]
+        )  # Normalize x
+        input_point_coords[..., 1] = (
+            input_point_coords[..., 1]
+            / self.orig_im_size[0]
+            * self.encoder_input_size[0]
+        )  # Normalize y
+
+        return input_point_coords.astype(np.float32), input_point_labels.astype(
+            np.float32
+        )
+
+    def infer(self, inputs) -> list[np.ndarray]:
+        start = time.perf_counter()
+
+        outputs = self.session.run(
+            self.output_names,
+            {
+                self.input_names[i]: inputs[i]
+                for i in range(len(self.input_names))
+            },
+        )
+
+        print(f"infer time: {(time.perf_counter() - start) * 1000:.2f} ms")
+        return outputs
+
+    def process_output(
+        self, outputs: list[np.ndarray]
+    ) -> tuple[list[ndarray | Any], ndarray[Any, Any]]:
+
+        scores = outputs[1].squeeze()
+        masks = outputs[0][0]
+
+        # Select the best masks based on the scores
+        best_mask = masks[np.argmax(scores)]
+        best_mask = cv2.resize(
+            best_mask, (self.orig_im_size[1], self.orig_im_size[0])
+        )
+        return (
+            np.array([[best_mask]]),
+            scores,
+        )
+
+    def set_image_size(self, orig_im_size: tuple[int, int]) -> None:
+        self.orig_im_size = orig_im_size
+
+    def get_input_details(self) -> None:
+        model_inputs = self.session.get_inputs()
+        self.input_names = [
+            model_inputs[i].name for i in range(len(model_inputs))
+        ]
+
+    def get_output_details(self) -> None:
+        model_outputs = self.session.get_outputs()
+        self.output_names = [
+            model_outputs[i].name for i in range(len(model_outputs))
+        ]
diff --git a/anylabeling/services/auto_labeling/sam_onnx.py b/anylabeling/services/auto_labeling/sam_onnx.py
index b50cfa0..c99bcb4 100644
--- a/anylabeling/services/auto_labeling/sam_onnx.py
+++ b/anylabeling/services/auto_labeling/sam_onnx.py
@@ -13,25 +13,12 @@
     def __init__(self, encoder_model_path, decoder_model_path) -> None:
         self.target_size = 1024
         self.input_size = (684, 1024)
-        # Load models
-        providers = onnxruntime.get_available_providers()
-
-        # Pop TensorRT Runtime due to crashing issues
-        # TODO: Add back when TensorRT backend is stable
-        providers = [p for p in providers if p != "TensorrtExecutionProvider"]
-
-        if providers:
-            logging.info(
-                "Available providers for ONNXRuntime: %s", ", ".join(providers)
-            )
-        else:
-            logging.warning("No available providers for ONNXRuntime")
         self.encoder_session = onnxruntime.InferenceSession(
-            encoder_model_path, providers=providers
+            encoder_model_path
         )
         self.encoder_input_name = self.encoder_session.get_inputs()[0].name
         self.decoder_session = onnxruntime.InferenceSession(
-            decoder_model_path, providers=providers
+            decoder_model_path
         )
 
     def get_input_points(self, prompt):
diff --git a/anylabeling/services/auto_labeling/segment_anything.py b/anylabeling/services/auto_labeling/segment_anything.py
index 61d910b..1248238 100644
--- a/anylabeling/services/auto_labeling/segment_anything.py
+++ b/anylabeling/services/auto_labeling/segment_anything.py
@@ -3,6 +3,7 @@
 import traceback
 
 import cv2
+import onnx
 import numpy as np
 from PyQt5 import QtCore
 from PyQt5.QtCore import QThread
@@ -16,6 +17,7 @@
 from .model import Model
 from .types import AutoLabelingResult
 from .sam_onnx import SegmentAnythingONNX
+from .sam2_onnx import SegmentAnything2ONNX
 
 
 class SegmentAnything(Model):
@@ -78,9 +80,14 @@ def __init__(self, config_path, on_message) -> None:
         )
 
         # Load models
-        self.model = SegmentAnythingONNX(
-            encoder_model_abs_path, decoder_model_abs_path
-        )
+        if self.detect_model_variant(decoder_model_abs_path) == "sam2":
+            self.model = SegmentAnything2ONNX(
+                encoder_model_abs_path, decoder_model_abs_path
+            )
+        else:
+            self.model = SegmentAnythingONNX(
+                encoder_model_abs_path, decoder_model_abs_path
+            )
 
         # Mark for auto labeling
         # points, rectangles
@@ -96,6 +103,14 @@ def __init__(self, config_path, on_message) -> None:
         self.pre_inference_worker = None
         self.stop_inference = False
 
+    def detect_model_variant(self, decoder_model_abs_path):
+        """Load and detect model variant based on the model architecture"""
+        model = onnx.load(decoder_model_abs_path)
+        input_names = [input.name for input in model.graph.input]
+        if "high_res_feats_0" in input_names:
+            return "sam2"
+        return "sam"
+
     def set_auto_labeling_marks(self, marks):
         """Set auto labeling marks"""
         self.marks = marks
diff --git a/anylabeling/views/labeling/label_widget.py b/anylabeling/views/labeling/label_widget.py
index 2c327c2..2971e6a 100644
--- a/anylabeling/views/labeling/label_widget.py
+++ b/anylabeling/views/labeling/label_widget.py
@@ -1097,7 +1097,7 @@ def get_labeling_instruction(self):
             f"{text_mode} {self.canvas.get_mode()} - {text_shortcuts}"
             f" {text_previous} A, {text_next} D,"
             f" {text_rectangle} R,"
-            f" {text_polygon}: P"
+            f" {text_polygon} P"
         )
 
     @pyqtSlot()
diff --git a/anylabeling/views/labeling/widgets/auto_labeling/auto_labeling.ui b/anylabeling/views/labeling/widgets/auto_labeling/auto_labeling.ui
index 7c962c6..56d8081 100644
--- a/anylabeling/views/labeling/widgets/auto_labeling/auto_labeling.ui
+++ b/anylabeling/views/labeling/widgets/auto_labeling/auto_labeling.ui
@@ -43,6 +43,9 @@
     2
+
+    4
+
     0
@@ -187,6 +190,11 @@
     0
+
+
+    10
+
+
     margin-top: 0; margin-bottom: 10px;
diff --git a/requirements-gpu.txt b/requirements-gpu.txt
index e981bc2..66e5303 100644
--- a/requirements-gpu.txt
+++ b/requirements-gpu.txt
@@ -4,7 +4,7 @@ imgviz==1.5.0
 natsort==8.1.0
 termcolor==1.1.0
 PyYAML==6.0
-onnx==1.16.1
-onnxruntime-gpu==1.18.1
+onnx==1.16.2
+onnxruntime-gpu==1.16.3
 qimage2ndarray==1.10.0
 darkdetect==0.8.0
\ No newline at end of file
diff --git a/requirements-macos.txt b/requirements-macos.txt
index 8c6f2f3..dceb623 100644
--- a/requirements-macos.txt
+++ b/requirements-macos.txt
@@ -4,7 +4,7 @@ imgviz==1.5.0
 natsort==8.1.0
 termcolor==1.1.0
 PyYAML==6.0
-onnx==1.16.1
-onnxruntime==1.18.1
+onnx==1.16.2
+onnxruntime==1.16.3
 qimage2ndarray==1.10.0
 darkdetect==0.8.0
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 92ad1c7..9bcf324 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ imgviz==1.5.0
 natsort==8.1.0
 termcolor==1.1.0
 PyYAML==6.0
-onnx==1.16.1
-onnxruntime==1.18.1
+onnx==1.16.2
+onnxruntime==1.16.3
 qimage2ndarray==1.10.0
 darkdetect==0.8.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index bd81ee4..2500cb4 100644
--- a/setup.py
+++ b/setup.py
@@ -50,7 +50,7 @@ def get_install_requires():
         "termcolor",
         "opencv-python-headless",
         'PyQt5>=5.15.7; platform_system != "Darwin"',
-        "onnx==1.16.1",
+        "onnx==1.16.2",
"qimage2ndarray==1.10.0", "darkdetect==0.8.0", ] @@ -60,10 +60,10 @@ def get_install_requires(): # Note: onnxruntime-gpu is not available on macOS preferred_device = get_preferred_device() if preferred_device == "GPU" and platform.system() != "Darwin": - install_requires.append("onnxruntime-gpu==1.18.1") + install_requires.append("onnxruntime-gpu==1.16.3") print("Building AnyLabeling with GPU support") else: - install_requires.append("onnxruntime==1.18.1") + install_requires.append("onnxruntime==1.16.3") print("Building AnyLabeling without GPU support") return install_requires @@ -95,8 +95,6 @@ def get_long_description(): "Natural Language :: English", "Operating System :: OS Independent", "Programming Language :: Python", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12",