Commit: Save prediction only
LiheYoung authored Feb 2, 2024
1 parent 308b513 commit e911178
Showing 7 changed files with 47 additions and 26 deletions.
15 changes: 11 additions & 4 deletions README.md
@@ -82,7 +82,7 @@ encoder = 'vits' # can also be 'vitb' or 'vitl'
 depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{:}14'.format(encoder))
 ```
 
-Depth Anything is also supported in ``transformers``. You can use it for depth prediction within [3 lines of code](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) (credit to [@niels](https://huggingface.co/nielsr)).
+Depth Anything is also supported in [``transformers``](https://github.com/huggingface/transformers). You can use it for depth prediction within [3 lines of code](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) (credit to [@niels](https://huggingface.co/nielsr)).
 
 ### No network connection, cannot load these models?
 
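For reference, the three-line ``transformers`` usage referenced above looks roughly like this (a minimal sketch; the checkpoint id ``LiheYoung/depth-anything-small-hf`` is an assumption based on the Hugging Face hub, and the image path is hypothetical):

```python
from PIL import Image
from transformers import pipeline

# Depth-estimation pipeline backed by a Depth Anything checkpoint on the hub.
pipe = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-small-hf")

image = Image.open("demo.jpg")  # hypothetical input image
depth = pipe(image)["depth"]    # PIL image holding the predicted depth map
```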
@@ -115,9 +115,12 @@ pip install -r requirements.txt
 ### Running
 
 ```bash
-python run.py --encoder <vits | vitb | vitl> --img-path <img-directory | single-img | txt-file> --outdir <outdir>
+python run.py --encoder <vits | vitb | vitl> --img-path <img-directory | single-img | txt-file> --outdir <outdir> [--pred-only] [--grayscale]
 ```
-For the ``img-path``, you can either 1) point it to an image directory containing all images of interest, 2) point it to a single image, or 3) point it to a text file listing all image paths.
+Arguments:
+- ``--img-path``: you can either 1) point it to an image directory containing all images of interest, 2) point it to a single image, or 3) point it to a text file listing all image paths.
+- ``--pred-only`` saves only the predicted depth map. Without it, both the image and its depth map are visualized side by side by default.
+- ``--grayscale`` saves the depth map in grayscale. Without it, a color palette is applied to the depth map by default.
 
 For example:
 ```bash
@@ -182,8 +185,12 @@ depth = depth_anything(image)
 
 ### Do not want to define image pre-processing or download model definition files?
 
-Easily use Depth Anything through ``transformers`` within 3 lines of code! Please refer to [these instructions](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) (credit to [@niels](https://huggingface.co/nielsr)).
+Easily use Depth Anything through [``transformers``](https://github.com/huggingface/transformers) within 3 lines of code! Please refer to [these instructions](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) (credit to [@niels](https://huggingface.co/nielsr)).
+
+**Note:** If you encounter ``KeyError: 'depth_anything'``, please install the latest ``transformers`` from source:
+```bash
+pip install git+https://github.com/huggingface/transformers.git
+```
 <details>
 <summary>Click here for a brief demo:</summary>
 
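As a usage illustration of the two new flags documented in the ``Running`` section above (a hypothetical invocation; the image path is made up):

```bash
# Save only the grayscale depth map for a single image (no side-by-side visualization).
python run.py --encoder vits --img-path demo.jpg --outdir ./vis_depth --pred-only --grayscale
```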
49 changes: 31 additions & 18 deletions run.py
@@ -17,6 +17,9 @@
 parser.add_argument('--outdir', type=str, default='./vis_depth')
 parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl'])
 
+parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction')
+parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette')
+
 args = parser.parse_args()
 
 margin_width = 50
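Note the ``dest`` mapping in the new arguments: ``--pred-only`` becomes ``args.pred_only``. A minimal, self-contained illustration of the ``store_true`` pattern used here:

```python
import argparse

parser = argparse.ArgumentParser()
# store_true flags default to False and flip to True when the flag is passed.
parser.add_argument('--pred-only', dest='pred_only', action='store_true')
parser.add_argument('--grayscale', dest='grayscale', action='store_true')

args = parser.parse_args(['--pred-only'])
print(args.pred_only, args.grayscale)  # True False
```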
@@ -76,25 +79,35 @@
     depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
     
     depth = depth.cpu().numpy().astype(np.uint8)
-    depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
     
-    split_region = np.ones((raw_image.shape[0], margin_width, 3), dtype=np.uint8) * 255
-    combined_results = cv2.hconcat([raw_image, split_region, depth_color])
+    if args.grayscale:
+        depth = np.repeat(depth[..., np.newaxis], 3, axis=-1)
+    else:
+        depth = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
     
-    caption_space = np.ones((caption_height, combined_results.shape[1], 3), dtype=np.uint8) * 255
-    captions = ['Raw image', 'Depth Anything']
-    segment_width = w + margin_width
-    for i, caption in enumerate(captions):
-        # Calculate text size
-        text_size = cv2.getTextSize(caption, font, font_scale, font_thickness)[0]
+    filename = os.path.basename(filename)
+    
+    if args.pred_only:
+        cv2.imwrite(os.path.join(args.outdir, filename[:filename.rfind('.')] + '_depth.png'), depth)
+    else:
+        split_region = np.ones((raw_image.shape[0], margin_width, 3), dtype=np.uint8) * 255
+        combined_results = cv2.hconcat([raw_image, split_region, depth])
+        
+        caption_space = np.ones((caption_height, combined_results.shape[1], 3), dtype=np.uint8) * 255
+        captions = ['Raw image', 'Depth Anything']
+        segment_width = w + margin_width
+        
+        for i, caption in enumerate(captions):
+            # Calculate text size
+            text_size = cv2.getTextSize(caption, font, font_scale, font_thickness)[0]
 
-        # Calculate x-coordinate to center the text
-        text_x = int((segment_width * i) + (w - text_size[0]) / 2)
+            # Calculate x-coordinate to center the text
+            text_x = int((segment_width * i) + (w - text_size[0]) / 2)
 
-        # Add text caption
-        cv2.putText(caption_space, caption, (text_x, 40), font, font_scale, (0, 0, 0), font_thickness)
-    
-    final_result = cv2.vconcat([caption_space, combined_results])
-    
-    filename = os.path.basename(filename)
-    cv2.imwrite(os.path.join(args.outdir, filename[:filename.rfind('.')] + '_img_depth.png'), final_result)
+            # Add text caption
+            cv2.putText(caption_space, caption, (text_x, 40), font, font_scale, (0, 0, 0), font_thickness)
+        
+        final_result = cv2.vconcat([caption_space, combined_results])
+        
+        cv2.imwrite(os.path.join(args.outdir, filename[:filename.rfind('.')] + '_img_depth.png'), final_result)
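One detail worth noting in the new grayscale branch: ``cv2.hconcat`` requires matching channel counts, so the single-channel depth map is repeated into three channels before it can sit next to the BGR input. A toy sketch of that step:

```python
import numpy as np

# A 2x2 single-channel depth map (uint8), as produced after normalization.
depth = np.array([[0, 128], [64, 255]], dtype=np.uint8)

# Repeat the channel three times so the array aligns with a 3-channel BGR image.
depth_3c = np.repeat(depth[..., np.newaxis], 3, axis=-1)
print(depth_3c.shape)  # (2, 2, 3)
```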

1 change: 1 addition & 0 deletions semseg/README.md
@@ -37,6 +37,7 @@ Note that our results are obtained *without* Mapillary pre-training.
 - [Cityscapes-ViT-L-mIoU-86.4](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints_semseg/cityscapes_vitl_mIoU_86.4.pth)
 - [ADE20K-ViT-L-mIoU-59.4](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints_semseg/ade20k_vitl_mIoU_59.4.pth)
 
+**Note:** If you want to reproduce the training process, please 1) download the [Depth Anything pre-trained model](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints/depth_anything_vitl14.pth) (to initialize the encoder) and 2) put it under the ``checkpoints`` folder.
 
 
 ## Installation
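A hypothetical sketch of the two steps in that note (the URL comes from the link above; swapping ``blob`` for ``resolve`` is the usual Hugging Face direct-download convention, but treat the exact command as an assumption):

```bash
mkdir -p checkpoints
wget -O checkpoints/depth_anything_vitl14.pth \
  https://huggingface.co/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vitl14.pth
```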
2 changes: 1 addition & 1 deletion
@@ -20,7 +20,7 @@
         type='DINOv2',
         version='large',
         freeze=False,
-        load_from='../checkpoints/depth_anything_vitl14.pth'),
+        load_from='./checkpoints/depth_anything_vitl14.pth'),
     neck=dict(type='Feature2Pyramid', embed_dim=1024, rescales=[4, 2, 1, 0.5]),
     decode_head=dict(
         type='Mask2FormerHead',
2 changes: 1 addition & 1 deletion
@@ -20,7 +20,7 @@
         type='DINOv2',
         version='large',
         freeze=False,
-        load_from='../checkpoints/depth_anything_vitl14.pth'),
+        load_from='./checkpoints/depth_anything_vitl14.pth'),
     neck=dict(type='Feature2Pyramid', embed_dim=1024, rescales=[4, 2, 1, 0.5]),
     decode_head=dict(
         type='Mask2FormerHead',
2 changes: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
         type='DINOv2',
         version='large',
         freeze=False,
-        load_from='../checkpoints/depth_anything_vitl14.pth'),
+        load_from='./checkpoints/depth_anything_vitl14.pth'),
     neck=dict(type='Feature2Pyramid', embed_dim=1024, rescales=[4, 2, 1, 0.5]),
     decode_head=dict(
         type='Mask2FormerHead',
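The same one-line path fix appears in three semseg config files (their filenames are not captured above). Presumably the checkpoint is now resolved relative to the ``semseg/`` working directory, matching the note in ``semseg/README.md``. A toy illustration of what the two relative paths resolve to (the repo path is hypothetical):

```python
import os

# '../checkpoints/...' escapes the semseg/ directory, while './checkpoints/...'
# points at semseg/checkpoints, where the README note says to put the model.
for rel in ('../checkpoints/depth_anything_vitl14.pth',
            './checkpoints/depth_anything_vitl14.pth'):
    print(os.path.abspath(os.path.join('/path/to/Depth-Anything/semseg', rel)))
```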
2 changes: 1 addition & 1 deletion semseg/dinov2.py
@@ -15,7 +15,7 @@ def __init__(self, version='large', freeze=False, load_from=None):
         super().__init__()
 
         if version == 'large':
-            self.dinov2 = torch.hub.load('torchhub/facebookresearch_dinov2_main', 'dinov2_vit14', source='local', pretrained=False)
+            self.dinov2 = torch.hub.load('torchhub/facebookresearch_dinov2_main', 'dinov2_vitl14', source='local', pretrained=False)
         else:
             raise NotImplementedError
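For context on the fixed line: with ``source='local'``, ``torch.hub.load`` reads the model definition from a directory vendored inside the repo (no download), and the entrypoint name must match one defined in that directory's ``hubconf.py`` — which is presumably why the misspelled ``dinov2_vit14`` failed to resolve. A minimal sketch of the corrected call:

```python
import torch

# Load the ViT-L/14 DINOv2 definition from the locally vendored hub directory;
# pretrained=False means weights come later from the Depth Anything checkpoint.
dinov2 = torch.hub.load(
    'torchhub/facebookresearch_dinov2_main',  # local dir containing hubconf.py
    'dinov2_vitl14',                          # entrypoint name in hubconf.py
    source='local',
    pretrained=False,
)
```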
