diff --git a/README.md b/README.md index b0452d8b..1a900644 100644 --- a/README.md +++ b/README.md @@ -72,8 +72,9 @@ ## 🥳 What's New [⏏️](#📄-table-of-contents) - Feb. 2024: + - ✨✨✨ Support [YOLO-World](https://github.com/AILab-CVC/YOLO-World) model. - 🤗 Release the latest version [2.3.2](https://github.com/CVHub520/X-AnyLabeling/releases/tag/v2.3.2) 🤗 - - ✨✨✨ Support [YOLOv9](https://github.com/WongKinYiu/yolov9) model. + - Support [YOLOv9](https://github.com/WongKinYiu/yolov9) model. - Support the conversion from a horizontal bounding box to a rotated bounding box. - Supports label deletion and renaming. For more details, please refer to the [document](./docs/zh_cn/user_guide.md). - Support for quick tag correction is available; please refer to this [document](./docs/en/user_guide.md) for guidance. diff --git a/README_zh-CN.md b/README_zh-CN.md index 2e2bbc38..ace96654 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -70,8 +70,9 @@ ## 🥳 新功能 [⏏️](#📄-目录) - 2024年2月: + - ✨✨✨ 支持[YOLO-World](https://github.com/AILab-CVC/YOLO-World)模型。 - 🤗 发布[2.3.2](https://github.com/CVHub520/X-AnyLabeling/releases/tag/v2.3.2)最新版本 🤗 - - ✨✨✨ 支持[YOLOv9](https://github.com/WongKinYiu/yolov9)模型。 + - 支持[YOLOv9](https://github.com/WongKinYiu/yolov9)模型。 - 支持将水平框一键转换为旋转框。 - 支持批量标签删除及重命名,详情可参考[用户手册](./docs/zh_cn/user_guide.md)。 - 支持快速标签纠正功能,详情可参考[用户手册](./docs/zh_cn/user_guide.md)。 diff --git a/anylabeling/configs/auto_labeling/models.yaml b/anylabeling/configs/auto_labeling/models.yaml index 42dd946c..ed6947d0 100644 --- a/anylabeling/configs/auto_labeling/models.yaml +++ b/anylabeling/configs/auto_labeling/models.yaml @@ -84,6 +84,8 @@ config_file: ":/yolo_nas_m.yaml" - model_name: "yolo-nas-s-r20230615" config_file: ":/yolo_nas_s.yaml" +- model_name: "yolow_l-r20240227" + config_file: ":/yolow_l.yaml" - model_name: "yolov5l-r20230520" config_file: ":/yolov5l.yaml" - model_name: "yolov5_car_plate-r20230112" diff --git a/anylabeling/configs/auto_labeling/yolow_l.yaml b/anylabeling/configs/auto_labeling/yolow_l.yaml new file mode 100644 index 00000000..8e0223f8 --- /dev/null +++ b/anylabeling/configs/auto_labeling/yolow_l.yaml @@ -0,0 +1,86 @@ +type: yolow +name: yolow_l-r20240227 +display_name: YOLO-World-L Tencent +model_path: https://github.com/CVHub520/X-AnyLabeling/releases/download/v2.3.3/yolow-l.onnx +confidence_threshold: 0.05 +classes: + - person + - bicycle + - car + - motorcycle + - airplane + - bus + - train + - truck + - boat + - traffic light + - fire hydrant + - stop sign + - parking meter + - bench + - bird + - cat + - dog + - horse + - sheep + - cow + - elephant + - bear + - zebra + - giraffe + - backpack + - umbrella + - handbag + - tie + - suitcase + - frisbee + - skis + - snowboard + - sports ball + - kite + - baseball bat + - baseball glove + - skateboard + - surfboard + - tennis racket + - bottle + - wine glass + - cup + - fork + - knife + - spoon + - bowl + - banana + - apple + - sandwich + - orange + - broccoli + - carrot + - hot dog + - pizza + - donut + - cake + - chair + - couch + - potted plant + - bed + - dining table + - toilet + - tv + - laptop + - mouse + - remote + - keyboard + - cell phone + - microwave + - oven + - toaster + - sink + - refrigerator + - book + - clock + - vase + - scissors + - teddy bear + - hair drier + - toothbrush diff --git a/anylabeling/services/auto_labeling/model_manager.py b/anylabeling/services/auto_labeling/model_manager.py index 7229a93b..22da178f 100644 --- a/anylabeling/services/auto_labeling/model_manager.py +++ 
b/anylabeling/services/auto_labeling/model_manager.py @@ -206,6 +206,7 @@ def load_custom_model(self, config_file): "rtmdet_pose", "depth_anything", "yolov9", + "yolow", ] ): self.new_model_status.emit( @@ -439,6 +440,28 @@ def _load_model(self, model_id): ) ) return + elif model_config["type"] == "yolow": + from .yolow import YOLOW + + try: + model_config["model"] = YOLOW( + model_config, on_message=self.new_model_status.emit + ) + self.auto_segmentation_model_unselected.emit() + except Exception as e: # noqa + self.new_model_status.emit( + self.tr( + "Error in loading model: {error_message}".format( + error_message=str(e) + ) + ) + ) + print( + "Error in loading model: {error_message}".format( + error_message=str(e) + ) + ) + return elif model_config["type"] == "yolov5_seg": from .yolov5_seg import YOLOv5_Seg diff --git a/anylabeling/services/auto_labeling/utils/points_conversion.py b/anylabeling/services/auto_labeling/utils/points_conversion.py index 9297c20e..a848d1ef 100755 --- a/anylabeling/services/auto_labeling/utils/points_conversion.py +++ b/anylabeling/services/auto_labeling/utils/points_conversion.py @@ -232,6 +232,31 @@ def rbox2poly(obboxes): *order, 8 ) +def denormalize_bbox(bbox, input_shape, image_shape): + """ + Denormalizes bounding box coordinates from input_shape to image_shape. + + Parameters: + - bbox: Normalized bounding box coordinates [xmin, ymin, xmax, ymax] + - input_shape: The shape of the input image used during normalization (e.g., [640, 640]) + - image_shape: The shape of the original image (e.g., [height, width]) + + Returns: + - Denormalized bounding box coordinates [xmin, ymin, xmax, ymax] + """ + xmin, ymin, xmax, ymax = bbox + + # Denormalize x-coordinates + denorm_xmin = int(xmin * image_shape[1] / input_shape[1]) + denorm_xmax = int(xmax * image_shape[1] / input_shape[1]) + + # Denormalize y-coordinates + denorm_ymin = int(ymin * image_shape[0] / input_shape[0]) + denorm_ymax = int(ymax * image_shape[0] / input_shape[0]) + + denormalized_bbox = [denorm_xmin, denorm_ymin, denorm_xmax, denorm_ymax] + + return denormalized_bbox def rescale_box(input_shape, boxes, image_shape, kpts=False): """Rescale the output to the original image shape""" diff --git a/anylabeling/services/auto_labeling/yolow.py b/anylabeling/services/auto_labeling/yolow.py new file mode 100644 index 00000000..08919360 --- /dev/null +++ b/anylabeling/services/auto_labeling/yolow.py @@ -0,0 +1,57 @@ +import logging + +from PyQt5 import QtCore + +from anylabeling.app_info import __preferred_device__ +from anylabeling.views.labeling.shape import Shape +from anylabeling.views.labeling.utils.opencv import qt_img_to_rgb_cv_img +from .types import AutoLabelingResult +from .__base__.yolo import YOLO +from .utils import denormalize_bbox + + +class YOLOW(YOLO): + """https://github.com/AILab-CVC/YOLO-World""" + + def postprocess(self, outputs, image_shape): + num_objs, bboxes, scores, class_ids = [out[0] for out in outputs] + bboxes = [denormalize_bbox(bbox, self.input_shape, image_shape) for bbox in bboxes] + return num_objs, bboxes, scores, class_ids + + def predict_shapes(self, image, image_path=None): + """ + Predict shapes from image + """ + + if image is None: + return [] + + try: + image = qt_img_to_rgb_cv_img(image, image_path) + except Exception as e: # noqa + logging.warning("Could not inference model") + logging.warning(e) + return [] + + blob = self.preprocess(image, upsample_mode="resize") + outputs = self.net.get_ort_inference(blob, extract=False) + _, bboxes, scores, class_ids = 
self.postprocess(outputs, image.shape[:2]) + + shapes = [] + for bbox, score, cls_id in zip(bboxes, scores, class_ids): + if score < self.conf_thres or (int(cls_id) == -1): + continue + xmin, ymin, xmax, ymax = bbox + rectangle_shape = Shape( + label=str(self.classes[int(cls_id)]), + shape_type="rectangle", + ) + rectangle_shape.add_point(QtCore.QPointF(xmin, ymin)) + rectangle_shape.add_point(QtCore.QPointF(xmax, ymin)) + rectangle_shape.add_point(QtCore.QPointF(xmax, ymax)) + rectangle_shape.add_point(QtCore.QPointF(xmin, ymax)) + shapes.append(rectangle_shape) + + result = AutoLabelingResult(shapes, replace=True) + + return result diff --git a/docs/en/custom_model.md b/docs/en/custom_model.md index 3550b6f5..27241284 100644 --- a/docs/en/custom_model.md +++ b/docs/en/custom_model.md @@ -246,4 +246,30 @@ Refer to this [tutorial](https://github.com/CVHub520/sam-hq). - [InternImage](https://github.com/OpenGVLab/InternImage) -InternImage introduces a large-scale convolutional neural network (CNN) model, leveraging deformable convolution as the core operator to achieve a large effective receptive field, adaptive spatial aggregation, and reduced inductive bias, leading to stronger and more robust pattern learning from massive data. It outperforms current CNNs and vision transformers on benchmarks \ No newline at end of file +InternImage introduces a large-scale convolutional neural network (CNN) model, leveraging deformable convolution as the core operator to achieve a large effective receptive field, adaptive spatial aggregation, and reduced inductive bias, leading to stronger and more robust pattern learning from massive data. It outperforms current CNNs and vision transformers on benchmarks + +> Paper: InternImage: Exploring Large-Scale Vision Foundation Models with Deformable Convolutions
+> Affiliation: Shanghai AI Laboratory, Tsinghua University, Nanjing University, etc.
+> Published: CVPR 2023
+ +Refer to this [tutorial](../../tools/export_internimage_model_onnx.py). + +- [EdgeSAM](https://github.com/chongzhou96/EdgeSAM) + +`EdgeSAM` is an accelerated variant of the Segment Anything Model (SAM), optimized for efficient execution on edge devices with minimal compromise in performance. It achieves a 40-fold speed increase compared to the original SAM, and outperforms MobileSAM, being 14 times as fast when deployed on edge devices while enhancing the mIoUs on COCO and LVIS by 2.3 and 3.2 respectively. EdgeSAM is also the first SAM variant that can run at over 30 FPS on an iPhone 14. + +> Paper: Prompt-In-the-Loop Distillation for On-Device Deployment of SAM
+> Affiliation: S-Lab, Nanyang Technological University, Shanghai Artificial Intelligence Laboratory.
+> Published: Arxiv 2023
+ +Refer to this [tutorial](https://github.com/chongzhou96/EdgeSAM/blob/master/scripts/export_onnx_model.py). + +- [YOLO-World](https://github.com/AILab-CVC/YOLO-World) + +`YOLO-World` enhances the YOLO series by incorporating vision-language modeling, achieving efficient open-vocabulary object detection with strong zero-shot performance on downstream tasks (a minimal custom-model config sketch is shown below). + +> Paper: Real-Time Open-Vocabulary Object Detection
+> Affiliation: Tencent AI Lab, ARC Lab, Tencent PCG, Huazhong University of Science and Technology.
+> Published: Arxiv 2024
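+
+Since X-AnyLabeling loads custom models from a YAML config file, a self-exported YOLO-World ONNX model can be registered with a config modeled on the bundled `yolow_l.yaml`. The sketch below is a minimal, hypothetical example: the `name`, `model_path`, and class list are placeholders to adapt to your own export.
+
+```yaml
+type: yolow
+name: yolow_custom-r20240227        # placeholder identifier
+display_name: YOLO-World-Custom
+model_path: /path/to/your/yolow.onnx
+confidence_threshold: 0.05
+classes:
+  - person
+  - car
+```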
+ +Refer to this [tutorial](../../tools/export_yolow_onnx.py). diff --git a/docs/zh_cn/custom_model.md b/docs/zh_cn/custom_model.md index c5ff69c5..44edc806 100644 --- a/docs/zh_cn/custom_model.md +++ b/docs/zh_cn/custom_model.md @@ -264,4 +264,14 @@ InternImage introduces a large-scale convolutional neural network (CNN) model, l > 单位:S-Lab, Nanyang Technological University, Shanghai Artificial Intelligence Laboratory.
> 发表:Arxiv 2023
-参考此[教程](https://github.com/chongzhou96/EdgeSAM/blob/master/scripts/export_onnx_model.py). \ No newline at end of file +参考此[教程](https://github.com/chongzhou96/EdgeSAM/blob/master/scripts/export_onnx_model.py). + +- [YOLO-World](https://github.com/AILab-CVC/YOLO-World) + +`YOLO-World` enhances the YOLO series by incorporating vision-language modeling, achieving efficient open-vocabulary object detection with strong zero-shot performance on downstream tasks. + +> 论文:Real-Time Open-Vocabulary Object Detection
+> 单位:Tencent AI Lab, ARC Lab, Tencent PCG, Huazhong University of Science and Technology.
+> 发表:Arxiv 2024
+ +参考此[教程](../../tools/export_yolow_onnx.py). diff --git a/tools/export_yolow_onnx.py b/tools/export_yolow_onnx.py new file mode 100644 index 00000000..d70c7cbc --- /dev/null +++ b/tools/export_yolow_onnx.py @@ -0,0 +1,114 @@ +import os +import cv2 +import time as time +import numpy as np +import onnxruntime as ort + +""" +The onnxruntime demo of the YOLO-World +Written by Wei Wang (CVHub) + Usage: + 1. Download source code from [huggingface/stevengrove/YOLO-World](https://huggingface.co/spaces/stevengrove/YOLO-World) + 2. cd YOLO-World and pip install -r requirements.txt + 3. export PYTHONPATH=/path/to/your/YOLO-World + 4. execute `python app.py --config ${your_custom_config.py} --checkpoint ${your_custom_finetune_model_config.py}` + 5. Place the current script in this directory + 6. Put the corresponding onnx weight into the specific directory + 7. Run the following command + ```bash + python ${export_yolow_onnx.py} + ``` +""" + +import onnxruntime as ort +import numpy as np +import cv2 + +def denormalize_bbox(bbox, input_shape, image_shape): + """ + Denormalizes bounding box coordinates from input_shape to image_shape. + + Parameters: + - bbox: Normalized bounding box coordinates [xmin, ymin, xmax, ymax] + - input_shape: The shape of the input image used during normalization (e.g., [640, 640]) + - image_shape: The shape of the original image (e.g., [height, width]) + + Returns: + - Denormalized bounding box coordinates [xmin, ymin, xmax, ymax] + """ + xmin, ymin, xmax, ymax = bbox + + # Denormalize x-coordinates + denorm_xmin = int(xmin * image_shape[1] / input_shape[1]) + denorm_xmax = int(xmax * image_shape[1] / input_shape[1]) + + # Denormalize y-coordinates + denorm_ymin = int(ymin * image_shape[0] / input_shape[0]) + denorm_ymax = int(ymax * image_shape[0] / input_shape[0]) + + denormalized_bbox = [denorm_xmin, denorm_ymin, denorm_xmax, denorm_ymax] + + return denormalized_bbox + + +def preprocess_image(image_path, input_shape): + im0 = cv2.imread(image_path) + image_shape = im0.shape[:2] + image = cv2.resize(im0, input_shape) # Resize to the input dimension expected by the YOLO model + image = image.astype(np.float32) / 255.0 # Normalize the image + image = np.transpose(image, (2, 0, 1)) # Change data layout from HWC to CHW + image = np.expand_dims(image, axis=0) # Add batch dimension + + return image, image_shape, im0 + + +def inference(session, input_name, image): + outputs = session.run(None, {input_name: image}) + num_objs, bboxes, scores, class_ids = [out[0] for out in outputs] + return num_objs, bboxes, scores, class_ids + + +def postprocess_results(output_image, scores, class_ids, bbox, input_shape, image_shape, score_threshold): + for i, score in enumerate(scores): + if score > score_threshold and (class_ids[i] != -1): + bbox[i] = denormalize_bbox(bbox[i], input_shape, image_shape) + x_min, y_min, x_max, y_max = bbox[i] + start_point = (int(x_min), int(y_min)) + end_point = (int(x_max), int(y_max)) + color = (0, 255, 0) + cv2.rectangle(output_image, start_point, end_point, color, 2) + label = f"{class_ids[i]}: {score:.2f}" + cv2.putText(output_image, label, (int(x_min), int(y_min)-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) + + return output_image + + +def forward(image_path, onnx_path, score_threshold): + session = ort.InferenceSession(onnx_path, providers=['CPUExecutionProvider']) + input_name = session.get_inputs()[0].name + input_shape = session.get_inputs()[0].shape[-2:] + + blob, image_shape, im0 = preprocess_image(image_path, input_shape) + _, bboxes, scores, 
class_ids = inference(session, input_name, blob) + output_image = postprocess_results(im0, scores, class_ids, bboxes, input_shape, image_shape, score_threshold) + + return output_image + + +def main(): + score_threshold = 0.05 + image_path = '/path/to/image' + model_path = '/path/to/model' + + result_image = forward(image_path, model_path, score_threshold) + + try: + cv2.imshow("Detected Objects", result_image) + cv2.waitKey(0) + cv2.destroyAllWindows() + except Exception: + # Fall back to saving the result when no display is available + cv2.imwrite("/path/to/save", result_image) + + +if __name__ == '__main__': + main()
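The coordinate mapping performed by `denormalize_bbox` (added above in both `points_conversion.py` and this script) simply rescales box corners from the network input resolution back to the original image resolution. A small worked example with illustrative numbers, assuming the `denormalize_bbox` defined above is in scope:

```python
# Illustrative only: map a box from a 640x640 model input back to a 480x640 (h, w) image.
input_shape = (640, 640)   # (height, width) of the model input
image_shape = (480, 640)   # (height, width) of the original image

bbox_in_input_space = [64, 128, 320, 512]  # [xmin, ymin, xmax, ymax] at 640x640

# x-coordinates scale by 640/640 = 1.0, y-coordinates by 480/640 = 0.75
print(denormalize_bbox(bbox_in_input_space, input_shape, image_shape))
# -> [64, 96, 320, 384]
```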