# cl_test1.py
import os
import sys
sys.path.append("/mnt/HDD5/zl/vlmaps")
import math
import numpy as np
import cv2
from tqdm import tqdm
import torch
import torchvision.transforms as transforms
from vlmaps.utils.mapping_utils import (
    load_pose, load_obj2cls_dict, save_map, cvt_obj_id_2_cls_id,
    depth2pc, transform_pc, get_sim_cam_mat, pos2grid_id, project_point,
)
from vlmaps.lseg.modules.models.lseg_net import LSegEncNet
from vlmaps.lseg.additional_utils.models import resize_image, pad_image, crop_image
import clip
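
# This script builds a VLMaps-style top-down map from an RGB-D + pose sequence:
# sampled depth pixels are back-projected into a gs x gs grid (cell size cs meters),
# accumulating a color top-down image, per-cell LSeg visual embeddings, and an obstacle map.
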

def load_depth(depth_filepath):
    with open(depth_filepath, 'rb') as f:
        depth = np.load(f)
    return depth


def load_semantic(semantic_filepath):
    with open(semantic_filepath, "rb") as f:
        semantic = np.load(f)
    return semantic


def create_lseg_map_batch(img_save_dir, camera_height, cs=0.05, gs=1000, depth_sample_rate=100):
    mask_version = 1  # 0, 1
    crop_size = 480  # 480
    base_size = 520  # 520
    # lang = "door,chair,ground,ceiling,other"
    lang = "arm chair,bed,cabinet,chair,coffee machine,coffee table,counter,desk,desktop,dining table,dog bed,drawer,dresser,floor lamp,fridge,garbage bag,garbage can,mirror,shelf,sink,sink basin,sofa,television,toilet,towel holder,tv stand,vacuum cleaner,others"  # kept short so the label prompt stays within CLIP's 77-token limit
    labels = lang.split(",")

    # loading models
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(device)
    clip_version = "ViT-B/32"
    clip_feat_dim = {'RN50': 1024, 'RN101': 512, 'RN50x4': 640, 'RN50x16': 768,
                     'RN50x64': 1024, 'ViT-B/32': 512, 'ViT-B/16': 512, 'ViT-L/14': 768}[clip_version]
    print("Loading CLIP model...")
    clip_model, preprocess = clip.load(name='/media/cl/SSK/OtherCode/vlmaps_ai2thor/model/clip/ViT-B-32.pt')  # clip.available_models()
    clip_model.to(device).eval()
    lang_token = clip.tokenize(labels)
    lang_token = lang_token.to(device)
    with torch.no_grad():
        text_feats = clip_model.encode_text(lang_token)
        text_feats = text_feats / text_feats.norm(dim=-1, keepdim=True)
    text_feats = text_feats.cpu().numpy()
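    # text_feats: L2-normalized CLIP embeddings of the label list; they are computed here
    # but not written to disk, since only the LSeg visual features below go into the map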
    model = LSegEncNet(lang, arch_option=0,
                       block_depth=0,
                       activation='lrelu',
                       crop_size=crop_size)
    model_state_dict = model.state_dict()
    pretrained_state_dict = torch.load("/media/cl/SSK/OtherCode/vlmaps_ai2thor/lseg/checkpoints/demo_e200.ckpt")
    # strip the leading "net." prefix from the checkpoint keys
    # (str.lstrip('net.') removes characters, not a prefix, and can eat the start of some key names)
    pretrained_state_dict = {k[len("net."):] if k.startswith("net.") else k: v
                             for k, v in pretrained_state_dict["state_dict"].items()}
    model_state_dict.update(pretrained_state_dict)
    model.load_state_dict(model_state_dict)
    model.eval()
    model = model.cuda()

    norm_mean = [0.5, 0.5, 0.5]
    norm_std = [0.5, 0.5, 0.5]
    padding = [0.0] * 3
    transform = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
        ]
    )
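    # Normalize with mean/std 0.5 maps the RGB values from [0, 1] to [-1, 1];
    # get_lseg_feat reverses this with img * 0.5 + 0.5 (that tensor is not used further here)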
print(f"loading scene {img_save_dir}")
rgb_dir = os.path.join(img_save_dir, "rgb")
depth_dir = os.path.join(img_save_dir, "depth")
pose_dir = os.path.join(img_save_dir, "pose")
semantic_dir = os.path.join(img_save_dir, "semantic")
obj2cls_path = os.path.join(img_save_dir, "obj2cls_dict.txt")
rgb_list = sorted(os.listdir(rgb_dir), key=lambda x: int(
x.split("_")[-1].split(".")[0]))
depth_list = sorted(os.listdir(depth_dir), key=lambda x: int(
x.split("_")[-1].split(".")[0]))
pose_list = sorted(os.listdir(pose_dir), key=lambda x: int(
x.split("_")[-1].split(".")[0]))
    semantic_list = sorted(os.listdir(semantic_dir), key=lambda x: int(x.split("_")[-1].split(".")[0]))
    rgb_list = [os.path.join(rgb_dir, x) for x in rgb_list]
    depth_list = [os.path.join(depth_dir, x) for x in depth_list]
    pose_list = [os.path.join(pose_dir, x) for x in pose_list]
    semantic_list = [os.path.join(semantic_dir, x) for x in semantic_list]

    map_save_dir = os.path.join(img_save_dir, "map")
    os.makedirs(map_save_dir, exist_ok=True)
    color_top_down_save_path = os.path.join(map_save_dir, f"color_top_down_{mask_version}.npy")
    gt_save_path = os.path.join(map_save_dir, f"grid_{mask_version}_gt.npy")
    grid_save_path = os.path.join(map_save_dir, f"grid_lseg_{mask_version}.npy")
    weight_save_path = os.path.join(map_save_dir, f"weight_lseg_{mask_version}.npy")
    obstacles_save_path = os.path.join(map_save_dir, "obstacles.npy")
    # obj2cls = load_obj2cls_dict(obj2cls_path)

    # initialize a grid with zero position at the center
    color_top_down_height = (camera_height + 1) * np.ones((gs, gs), dtype=np.float32)
    color_top_down = np.zeros((gs, gs, 3), dtype=np.uint8)
    gt = np.zeros((gs, gs), dtype=np.int32)
    grid = np.zeros((gs, gs, clip_feat_dim), dtype=np.float32)
    obstacles = np.ones((gs, gs), dtype=np.uint8)
    weight = np.zeros((gs, gs), dtype=float)
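    # color_top_down / color_top_down_height: top-down RGB view and the height of the point currently drawn in each cell
    # grid / weight: running average of LSeg features per cell and the number of points accumulated there
    # obstacles: occupancy map, initialized to 1 (free); cells hit by non-ceiling points are set to 0 (occupied)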

    tf_list = []
    data_iter = zip(rgb_list, depth_list, pose_list)
    pbar = tqdm(total=len(rgb_list))
    # load all images and depths and poses
    for data_sample in data_iter:
        rgb_path, depth_path, pose_path = data_sample
        bgr = cv2.imread(rgb_path)
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

        # read the pose (camera frame -> world frame)
        pos, rot = load_pose(pose_path)  # z backward, y upward, x to the right ==> pos: (3, 1) position vector; rot: (3, 3) rotation matrix
        # rotate into the camera frame (adjust this to match the actual camera convention)
        rot_ro_cam = np.eye(3)
        rot_ro_cam[1, 1] = 1
        rot_ro_cam[2, 2] = 1
        rot = rot @ rot_ro_cam
        pos[1] += camera_height  # add the camera height
        pose = np.eye(4)
        pose[:3, :3] = rot
        pose[:3, 3] = pos.reshape(-1)
        tf_list.append(pose)
        if len(tf_list) == 1:
            init_tf_inv = np.linalg.inv(tf_list[0])
        tf = init_tf_inv @ pose
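        # tf expresses the current camera pose relative to the first frame,
        # so the whole map is anchored at the starting camera position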

        # read depth
        depth = load_depth(depth_path)  # [720, 1080]
        # read semantic
        # semantic = load_semantic(semantic_path)  # [720, 1080]
        # semantic = cvt_obj_id_2_cls_id(semantic, obj2cls)  # map object ids to class ids (each class stored as a distinct integer)

        pix_feats = get_lseg_feat(model, rgb, labels, transform, crop_size, base_size, norm_mean, norm_std)
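        # pix_feats: dense LSeg features of shape [1, clip_feat_dim, H', W'] at a lower
        # resolution than the RGB frame, hence the separate feat_cam_mat intrinsics below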

        # transform all points to the global frame
        pc, mask = depth2pc(depth)
        shuffle_mask = np.arange(pc.shape[1])
        np.random.shuffle(shuffle_mask)
        shuffle_mask = shuffle_mask[::depth_sample_rate]
        mask = mask[shuffle_mask]
        pc = pc[:, shuffle_mask]
        pc = pc[:, mask]
        pc_global = transform_pc(pc, tf)
        rgb_cam_mat = get_sim_cam_mat(rgb.shape[0], rgb.shape[1])
        feat_cam_mat = get_sim_cam_mat(pix_feats.shape[2], pix_feats.shape[3])
        all_p_local1 = []
        # project the point cloud onto the ground
        for i, (p, p_local) in enumerate(zip(pc_global.T, pc.T)):
            x, y = pos2grid_id(gs, cs, p[0], p[2])
            all_p_local1.append(p_local[1])
            # ignore points that fall outside the map or more than 0.5 m above the camera (these are likely ceiling points)
            if x >= obstacles.shape[0] or y >= obstacles.shape[1] or x < 0 or y < 0 or p_local[1] < -0.5:
                continue
            rgb_px, rgb_py, rgb_pz = project_point(rgb_cam_mat, p_local)
            rgb_v = rgb[rgb_py, rgb_px, :]
            # semantic_v = semantic[rgb_py, rgb_px]
            # if semantic_v == 40:
            #     semantic_v = -1

            # when the projected location already has a color assigned, overwrite it if the current point is higher
            if p_local[1] < color_top_down_height[y, x]:
                color_top_down[y, x] = rgb_v
                color_top_down_height[y, x] = p_local[1]
                # gt[y, x] = semantic_v

            # average the visual embeddings if multiple points are projected to the same grid cell
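            # (incremental mean: grid[y, x] keeps the running average of the LSeg features of all
            #  points that landed in this cell, and weight[y, x] counts those points)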
            px, py, pz = project_point(feat_cam_mat, p_local)
            if not (px < 0 or py < 0 or px >= pix_feats.shape[3] or py >= pix_feats.shape[2]):
                feat = pix_feats[0, :, py, px]
                grid[y, x] = (grid[y, x] * weight[y, x] + feat) / (weight[y, x] + 1)
                weight[y, x] += 1

            # build an obstacle map ignoring points on the floor (0 means occupied, 1 means free)
            if p_local[1] > camera_height:
                continue  # more than camera_height below the camera, i.e. at floor level: leave the cell free
            obstacles[y, x] = 0  # everything else marks the cell as occupied
        pbar.update(1)
print("相关数据")
print(min(all_p_local1),max(all_p_local1),sum(all_p_local1)/len(all_p_local1))
save_map(color_top_down_save_path, color_top_down)
# save_map(gt_save_path, gt)
save_map(grid_save_path, grid)
save_map(weight_save_path, weight)
save_map(obstacles_save_path, obstacles)


# run the image through the LSeg encoder (model) to extract dense segmentation features;
# the steps are: image preprocessing, cropping, padding, inference, and feature fusion
def get_lseg_feat(model: LSegEncNet, image: np.array, labels, transform, crop_size=480,
                  base_size=520, norm_mean=[0.5, 0.5, 0.5], norm_std=[0.5, 0.5, 0.5]):
    vis_image = image.copy()
    image = transform(image).unsqueeze(0).cuda()
    img = image[0].permute(1, 2, 0)
    img = img * 0.5 + 0.5

    batch, _, h, w = image.size()  # [1, 3, 720, 1080]
    stride_rate = 2.0 / 3.0
    stride = int(crop_size * stride_rate)
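    # the sliding-window stride is 2/3 of the crop size, so neighbouring crops overlap by one third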

    long_size = base_size
    if h > w:
        height = long_size
        width = int(1.0 * w * long_size / h + 0.5)
        short_size = width
    else:
        width = long_size
        height = int(1.0 * h * long_size / w + 0.5)
        short_size = height
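    # scale the image so its long side equals base_size while keeping the aspect ratio;
    # a 720x1080 input becomes 347x520, matching the shape noted below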
    # [1, 3, 347, 520]
    cur_img = resize_image(image, height, width, **{'mode': 'bilinear', 'align_corners': True})

    if long_size <= crop_size:
        pad_img = pad_image(cur_img, norm_mean, norm_std, crop_size)
        print(pad_img.shape)
        with torch.no_grad():
            outputs, logits = model(pad_img, labels)
        outputs = crop_image(outputs, 0, height, 0, width)
    else:
        if short_size < crop_size:
            # pad if needed
            pad_img = pad_image(cur_img, norm_mean, norm_std, crop_size)
        else:
            pad_img = cur_img
        _, _, ph, pw = pad_img.shape  # .size() 480, 520
        assert ph >= height and pw >= width

        h_grids = int(math.ceil(1.0 * (ph - crop_size) / stride)) + 1
        w_grids = int(math.ceil(1.0 * (pw - crop_size) / stride)) + 1
        with torch.cuda.device_of(image):
            with torch.no_grad():
                outputs = image.new().resize_(batch, model.out_c, ph, pw).zero_().cuda()
                logits_outputs = image.new().resize_(batch, len(labels), ph, pw).zero_().cuda()
                count_norm = image.new().resize_(batch, 1, ph, pw).zero_().cuda()
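        # outputs / logits_outputs accumulate the predictions of each sliding window;
        # count_norm counts how many windows covered each pixel so the sums can be averaged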
        # grid evaluation
        for idh in range(h_grids):
            for idw in range(w_grids):
                h0 = idh * stride
                w0 = idw * stride
                h1 = min(h0 + crop_size, ph)
                w1 = min(w0 + crop_size, pw)
                crop_img = crop_image(pad_img, h0, h1, w0, w1)
                # pad if needed [1, 3, 480, 480]
                pad_crop_img = pad_image(crop_img, norm_mean, norm_std, crop_size)
                with torch.no_grad():
                    output, logits = model(pad_crop_img, labels)
                cropped = crop_image(output, 0, h1 - h0, 0, w1 - w0)
                cropped_logits = crop_image(logits, 0, h1 - h0, 0, w1 - w0)
                outputs[:, :, h0:h1, w0:w1] += cropped
                logits_outputs[:, :, h0:h1, w0:w1] += cropped_logits
                count_norm[:, :, h0:h1, w0:w1] += 1
        assert (count_norm == 0).sum() == 0
        outputs = outputs / count_norm
        logits_outputs = logits_outputs / count_norm
        outputs = outputs[:, :, :height, :width]
        logits_outputs = logits_outputs[:, :, :height, :width]

    outputs = outputs.cpu()
    outputs = outputs.numpy()  # B, D, H, W
    predicts = [torch.max(logit, 0)[1].cpu().numpy() for logit in logits_outputs]
    pred = predicts[0]
    return outputs  # [1, 512, 347, 520]


torch.cuda.empty_cache()

# setup parameters
cs = 0.05  # @param {type: "number"} meters per cell size
gs = 1000  # @param {type: "integer"} map resolution (gs x gs)
camera_height = 1.57  # @param {type: "number"} camera height (used for filtering out points on the floor)
depth_sample_rate = 100  # @param {type: "integer"} depth pixels subsample rate
root_dir = '/media/cl/Data/2026/Baseline/vlmaps/vlmaps_dataset'
data_dir = os.path.join(root_dir, "FloorPlan13_2")  # @param {type: "string"}
create_lseg_map_batch(data_dir, camera_height=camera_height, cs=cs, gs=gs, depth_sample_rate=depth_sample_rate)
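
# Note: img_save_dir is expected to contain rgb/, depth/, pose/ (and optionally semantic/)
# sub-directories with frames named like <prefix>_<index>.<ext>; the generated maps are
# written as .npy files to <img_save_dir>/map/.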