forked from mit-han-lab/bevfusion
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fusion-bev256d2-lss.yaml
130 lines (126 loc) · 2.93 KB
/
fusion-bev256d2-lss.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# Model definition: camera + lidar encoders, a fuser, a decoder and a map head.
# `${...}` values are interpolation placeholders; the variables they name
# (image_size, point_cloud_range, voxel_size) are not defined in the visible
# config - presumably supplied by a base config. They are quoted so every YAML
# loader hands them over as plain strings.
model:
  encoders:
    camera:
      backbone:
        type: SwinTransformer
        embed_dims: 96
        depths: [2, 2, 6, 2]
        num_heads: [3, 6, 12, 24]
        window_size: 7
        mlp_ratio: 4
        qkv_bias: true
        qk_scale: null
        drop_rate: 0.0
        attn_drop_rate: 0.0
        drop_path_rate: 0.3
        patch_norm: true
        out_indices: [1, 2, 3]
        with_cp: false
        convert_weights: true
        init_cfg:
          type: Pretrained
          checkpoint: https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
      neck:
        type: GeneralizedLSSFPN
        # 192/384/768 = embed_dims (96) x 2/4/8; presumably the widths of the
        # backbone stages selected by out_indices - confirm against the backbone.
        in_channels: [192, 384, 768]
        out_channels: 256
        start_level: 0
        num_outs: 3
        norm_cfg:
          type: BN2d
          requires_grad: true
        act_cfg:
          type: ReLU
          inplace: true
        upsample_cfg:
          mode: bilinear
          align_corners: false
      vtransform:
        type: LSSTransform
        # Matches camera neck out_channels above.
        in_channels: 256
        out_channels: 80
        image_size: '${image_size}'
        feature_size: '${[image_size[0] // 8, image_size[1] // 8]}'
        # NOTE(review): the *bound entries look like [min, max, step] triples -
        # confirm against the LSSTransform implementation.
        xbound: [-51.2, 51.2, 0.4]
        ybound: [-51.2, 51.2, 0.4]
        zbound: [-10.0, 10.0, 20.0]
        dbound: [1.0, 60.0, 0.5]
        downsample: 2
    lidar:
      voxelize:
        max_num_points: 10
        point_cloud_range: '${point_cloud_range}'
        voxel_size: '${voxel_size}'
        max_voxels: [90000, 120000]
      backbone:
        type: SparseEncoder
        in_channels: 5
        sparse_shape: [1024, 1024, 41]
        output_channels: 128
        order:
          - conv
          - norm
          - act
        encoder_channels:
          - [16, 16, 32]
          - [32, 32, 64]
          - [64, 64, 128]
          - [128, 128]
        # One padding entry per layer in encoder_channels; the nested
        # [1, 1, 0] is presumably a per-axis padding - TODO confirm.
        encoder_paddings:
          - [0, 0, 1]
          - [0, 0, 1]
          - [0, 0, [1, 1, 0]]
          - [0, 0]
        block_type: basicblock
  fuser:
    type: ConvFuser
    # 80 matches camera vtransform.out_channels. 256 is presumably the lidar
    # BEV feature width - TODO confirm (lidar backbone sets output_channels: 128).
    in_channels: [80, 256]
    out_channels: 256
  decoder:
    backbone:
      type: SECOND
      # Matches fuser.out_channels.
      in_channels: 256
      out_channels: [128, 256]
      layer_nums: [5, 5]
      layer_strides: [1, 2]
      norm_cfg:
        type: BN
        eps: 1.0e-3
        momentum: 0.01
      conv_cfg:
        type: Conv2d
        bias: false
    neck:
      type: SECONDFPN
      # Matches decoder.backbone.out_channels.
      in_channels: [128, 256]
      out_channels: [256, 256]
      upsample_strides: [1, 2]
      norm_cfg:
        type: BN
        eps: 1.0e-3
        momentum: 0.01
      upsample_cfg:
        type: deconv
        bias: false
      use_conv_for_no_stride: true
  heads:
    map:
      # 512 = 256 + 256, the sum of decoder.neck.out_channels; presumably the
      # neck outputs are concatenated - TODO confirm.
      in_channels: 512
# Optimizer settings (AdamW).
optimizer:
  type: AdamW
  # Keep the mantissa dot and the signed exponent: YAML 1.1 loaders such as
  # PyYAML resolve `1e-4` as a string, not a float.
  lr: 1.0e-4
  weight_decay: 0.01
  paramwise_cfg:
    # Per-parameter overrides, keyed by parameter-name fragment. decay_mult: 0
    # presumably scales weight_decay to zero for matching parameters - confirm
    # against the optimizer constructor in use.
    custom_keys:
      absolute_pos_embed:
        decay_mult: 0
      relative_position_bias_table:
        decay_mult: 0
# Optimizer-step options: gradient clipping with max_norm 35 and norm_type 2
# (presumably the L2 norm of the gradients - confirm against the clipping hook).
optimizer_config:
  grad_clip:
    max_norm: 35
    norm_type: 2
# Learning-rate schedule: only the policy name is set here; any further
# schedule parameters are not defined in the visible config.
lr_config:
  policy: cyclic
# Momentum schedule: only the policy name is set here; any further schedule
# parameters are not defined in the visible config.
momentum_config:
  policy: cyclic