diff --git a/README.md b/README.md
index 8407bf1..4c53df6 100644
--- a/README.md
+++ b/README.md
@@ -82,7 +82,7 @@ encoder = 'vits' # can also be 'vitb' or 'vitl'
 depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{:}14'.format(encoder))
 ```
 
-Depth Anything is also supported in ``transformers``. You can use it for depth prediction within [3 lines of code](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) (credit to [@niels](https://huggingface.co/nielsr)).
+Depth Anything is also supported in [``transformers``](https://github.com/huggingface/transformers). You can use it for depth prediction within [3 lines of code](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) (credit to [@niels](https://huggingface.co/nielsr)).
 
 ### No network connection, cannot load these models?
 
@@ -115,9 +115,12 @@ pip install -r requirements.txt
 ### Running
 
 ```bash
-python run.py --encoder <vits | vitb | vitl> --img-path <img-directory | single-img | txt-file> --outdir <outdir>
+python run.py --encoder <vits | vitb | vitl> --img-path <img-directory | single-img | txt-file> --outdir <outdir> [--pred-only] [--grayscale]
 ```
-For the ``img-path``, you can either 1) point it to an image directory storing all interested images, 2) point it to a single image, or 3) point it to a text file storing all image paths.
+Arguments:
+- ``--img-path``: you can either 1) point it to an image directory containing all images of interest, 2) point it to a single image, or 3) point it to a text file listing all image paths.
+- ``--pred-only``: save only the predicted depth map. Without it, we visualize the raw image and its depth map side by side by default.
+- ``--grayscale``: save the depth map in grayscale. Without it, we apply a color palette to the depth map by default.
 
 For example:
 ```bash
@@ -182,8 +185,12 @@
 depth = depth_anything(image)
 ```
 
 ### Do not want to define image pre-processing or download model definition files?
 
-Easily use Depth Anything through ``transformers`` within 3 lines of code! Please refer to [these instructions](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) (credit to [@niels](https://huggingface.co/nielsr)).
+Easily use Depth Anything through [``transformers``](https://github.com/huggingface/transformers) within 3 lines of code! Please refer to [these instructions](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) (credit to [@niels](https://huggingface.co/nielsr)).
+**Note:** If you encounter ``KeyError: 'depth_anything'``, please install the latest ``transformers`` from source:
+```bash
+pip install git+https://github.com/huggingface/transformers.git
+```
 
 Click here for a brief demo:
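For reference, a minimal sketch of the "3 lines of code" ``transformers`` usage that the README hunk above points to. The checkpoint id and image path below are assumptions for illustration; see the linked model_doc page for the exact names:

```python
# Hypothetical example of depth prediction via the transformers depth-estimation pipeline.
# "LiheYoung/depth-anything-small-hf" and the image path are placeholders, not verified here.
from transformers import pipeline

pipe = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-small-hf")
result = pipe("assets/examples/demo1.png")  # accepts a local path, URL, or PIL image
result["depth"].save("demo1_depth.png")     # "depth" is a PIL image of the predicted map
```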
diff --git a/run.py b/run.py
index 10eb781..06c7716 100644
--- a/run.py
+++ b/run.py
@@ -17,6 +17,9 @@
     parser.add_argument('--outdir', type=str, default='./vis_depth')
     parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl'])
+    parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction')
+    parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette')
+    
     args = parser.parse_args()
     
     margin_width = 50
@@ -76,25 +79,35 @@
         depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
         depth = depth.cpu().numpy().astype(np.uint8)
         
-        depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
-        split_region = np.ones((raw_image.shape[0], margin_width, 3), dtype=np.uint8) * 255
-        combined_results = cv2.hconcat([raw_image, split_region, depth_color])
+        if args.grayscale:
+            depth = np.repeat(depth[..., np.newaxis], 3, axis=-1)
+        else:
+            depth = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
+        
+        filename = os.path.basename(filename)
         
-        caption_space = np.ones((caption_height, combined_results.shape[1], 3), dtype=np.uint8) * 255
-        captions = ['Raw image', 'Depth Anything']
-        segment_width = w + margin_width
-        for i, caption in enumerate(captions):
-            # Calculate text size
-            text_size = cv2.getTextSize(caption, font, font_scale, font_thickness)[0]
+        if args.pred_only:
+            cv2.imwrite(os.path.join(args.outdir, filename[:filename.rfind('.')] + '_depth.png'), depth)
+        else:
+            split_region = np.ones((raw_image.shape[0], margin_width, 3), dtype=np.uint8) * 255
+            combined_results = cv2.hconcat([raw_image, split_region, depth])
+            
+            caption_space = np.ones((caption_height, combined_results.shape[1], 3), dtype=np.uint8) * 255
+            captions = ['Raw image', 'Depth Anything']
+            segment_width = w + margin_width
+            
+            for i, caption in enumerate(captions):
+                # Calculate text size
+                text_size = cv2.getTextSize(caption, font, font_scale, font_thickness)[0]

-            # Calculate x-coordinate to center the text
-            text_x = int((segment_width * i) + (w - text_size[0]) / 2)
+                # Calculate x-coordinate to center the text
+                text_x = int((segment_width * i) + (w - text_size[0]) / 2)

-            # Add text caption
-            cv2.putText(caption_space, caption, (text_x, 40), font, font_scale, (0, 0, 0), font_thickness)
-        
-        final_result = cv2.vconcat([caption_space, combined_results])
-        
-        filename = os.path.basename(filename)
-        cv2.imwrite(os.path.join(args.outdir, filename[:filename.rfind('.')] + '_img_depth.png'), final_result)
+                # Add text caption
+                cv2.putText(caption_space, caption, (text_x, 40), font, font_scale, (0, 0, 0), font_thickness)
+            
+            final_result = cv2.vconcat([caption_space, combined_results])
+            
+            cv2.imwrite(os.path.join(args.outdir, filename[:filename.rfind('.')] + '_img_depth.png'), final_result)
+            
\ No newline at end of file
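The ``--grayscale`` branch added to run.py above skips the color palette and instead tiles the single-channel depth map to 3 channels, so it can still be concatenated with the BGR raw image and written by ``cv2.imwrite``. A minimal sketch of that step in isolation (the array values are made up):

```python
import numpy as np

# Stand-in for a normalized uint8 depth prediction of shape (H, W).
depth = np.random.randint(0, 256, size=(480, 640), dtype=np.uint8)

# Tile the single channel to 3 channels, as the new grayscale branch does,
# so the result matches the (H, W, 3) raw image for cv2.hconcat / cv2.imwrite.
depth_3c = np.repeat(depth[..., np.newaxis], 3, axis=-1)
assert depth_3c.shape == (480, 640, 3)
```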
diff --git a/semseg/README.md b/semseg/README.md
index 52b27b1..c474b71 100644
--- a/semseg/README.md
+++ b/semseg/README.md
@@ -37,6 +37,7 @@ Note that our results are obtained *without* Mapillary pre-training.
 
 - [Cityscapes-ViT-L-mIoU-86.4](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints_semseg/cityscapes_vitl_mIoU_86.4.pth)
 - [ADE20K-ViT-L-mIoU-59.4](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints_semseg/ade20k_vitl_mIoU_59.4.pth)
+**Note:** If you want to reproduce the training process, please 1) download the [Depth Anything pre-trained model](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints/depth_anything_vitl14.pth) (to initialize the encoder) and 2) put it under the ``checkpoints`` folder.
 
 ## Installation
diff --git a/semseg/config/depth_anything/depth_anything_large_mask2former_16xb1_160k_ade20k_896x896.py b/semseg/config/depth_anything/depth_anything_large_mask2former_16xb1_160k_ade20k_896x896.py
index 429bef7..5bc733c 100644
--- a/semseg/config/depth_anything/depth_anything_large_mask2former_16xb1_160k_ade20k_896x896.py
+++ b/semseg/config/depth_anything/depth_anything_large_mask2former_16xb1_160k_ade20k_896x896.py
@@ -20,7 +20,7 @@
         type='DINOv2',
         version='large',
         freeze=False,
-        load_from='../checkpoints/depth_anything_vitl14.pth'),
+        load_from='./checkpoints/depth_anything_vitl14.pth'),
     neck=dict(type='Feature2Pyramid', embed_dim=1024, rescales=[4, 2, 1, 0.5]),
     decode_head=dict(
         type='Mask2FormerHead',
diff --git a/semseg/config/depth_anything/depth_anything_large_mask2former_16xb1_80k_cityscapes_896x896.py b/semseg/config/depth_anything/depth_anything_large_mask2former_16xb1_80k_cityscapes_896x896.py
index 9e3695a..614dadd 100644
--- a/semseg/config/depth_anything/depth_anything_large_mask2former_16xb1_80k_cityscapes_896x896.py
+++ b/semseg/config/depth_anything/depth_anything_large_mask2former_16xb1_80k_cityscapes_896x896.py
@@ -20,7 +20,7 @@
         type='DINOv2',
         version='large',
         freeze=False,
-        load_from='../checkpoints/depth_anything_vitl14.pth'),
+        load_from='./checkpoints/depth_anything_vitl14.pth'),
     neck=dict(type='Feature2Pyramid', embed_dim=1024, rescales=[4, 2, 1, 0.5]),
     decode_head=dict(
         type='Mask2FormerHead',
diff --git a/semseg/config/depth_anything/depth_anything_large_mask2former_16xb1_80k_cityscapes_896x896_ms.py b/semseg/config/depth_anything/depth_anything_large_mask2former_16xb1_80k_cityscapes_896x896_ms.py
index 55352a0..8ae5de1 100644
--- a/semseg/config/depth_anything/depth_anything_large_mask2former_16xb1_80k_cityscapes_896x896_ms.py
+++ b/semseg/config/depth_anything/depth_anything_large_mask2former_16xb1_80k_cityscapes_896x896_ms.py
@@ -21,7 +21,7 @@
         type='DINOv2',
         version='large',
         freeze=False,
-        load_from='../checkpoints/depth_anything_vitl14.pth'),
+        load_from='./checkpoints/depth_anything_vitl14.pth'),
     neck=dict(type='Feature2Pyramid', embed_dim=1024, rescales=[4, 2, 1, 0.5]),
     decode_head=dict(
         type='Mask2FormerHead',
diff --git a/semseg/dinov2.py b/semseg/dinov2.py
index 0354865..047a4e0 100644
--- a/semseg/dinov2.py
+++ b/semseg/dinov2.py
@@ -15,7 +15,7 @@
     def __init__(self, version='large', freeze=False, load_from=None):
         super().__init__()
         
         if version == 'large':
-            self.dinov2 = torch.hub.load('torchhub/facebookresearch_dinov2_main', 'dinov2_vit14', source='local', pretrained=False)
+            self.dinov2 = torch.hub.load('torchhub/facebookresearch_dinov2_main', 'dinov2_vitl14', source='local', pretrained=False)
         else:
             raise NotImplementedError
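Regarding the semseg **Note** above about placing the Depth Anything ViT-L checkpoint under the ``checkpoints`` folder (the path the updated configs now load from): below is a hedged sketch of one way to fetch it with ``huggingface_hub``. The repo itself does not require this package, and manually downloading the linked file works just as well.

```python
import os
import shutil

from huggingface_hub import hf_hub_download

# Fetch the checkpoint from the Depth-Anything Space referenced in the note.
cached_path = hf_hub_download(
    repo_id="LiheYoung/Depth-Anything",
    repo_type="space",
    filename="checkpoints/depth_anything_vitl14.pth",
)

# Place it where the updated configs expect it: ./checkpoints/depth_anything_vitl14.pth
os.makedirs("checkpoints", exist_ok=True)
shutil.copy(cached_path, os.path.join("checkpoints", "depth_anything_vitl14.pth"))
```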