"""
Train and eval functions used in main.py
"""
import math
import os
import sys
from typing import Iterable

import numpy as np
import cv2
from scipy import ndimage
from scipy.ndimage import label
import torch
import torchvision.transforms as standard_transforms
from sklearn.cluster import DBSCAN

import util.misc as utils


class DeNormalize(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, tensor):
        for t, m, s in zip(tensor, self.mean, self.std):
            t.mul_(s).add_(m)
        return tensor
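

# Illustrative sketch (not part of the training/eval path): DeNormalize undoes
# the ImageNet normalization in-place, so the visualization code below can
# recover displayable pixel values. The _demo name and synthetic tensor are
# assumptions for demonstration only.
def _demo_denormalize():
    denorm = DeNormalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    x = torch.rand(3, 4, 4)       # stand-in for a normalized image tensor
    y = denorm(x.clone())         # mutates its argument and returns it
    print(y.min().item(), y.max().item())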


def visualization(samples, targets, pred, vis_dir, split_map=None):
    """
    Visualize predictions.
    """
    gts = [t['points'].tolist() for t in targets]
    pil_to_tensor = standard_transforms.ToTensor()
    restore_transform = standard_transforms.Compose([
        DeNormalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        standard_transforms.ToPILImage()
    ])
    images = samples.tensors
    masks = samples.mask
    for idx in range(images.shape[0]):
        sample = restore_transform(images[idx])
        sample = pil_to_tensor(sample.convert('RGB')).numpy() * 255
        sample_vis = sample.transpose([1, 2, 0])[:, :, ::-1].astype(np.uint8).copy()
        # draw ground-truth points (red)
        size = 2
        for t in gts[idx]:
            sample_vis = cv2.circle(sample_vis, (int(t[1]), int(t[0])), size, (0, 0, 255), -1)
        # draw predictions (green)
        for p in pred[idx]:
            sample_vis = cv2.circle(sample_vis, (int(p[1]), int(p[0])), size, (0, 255, 0), -1)
        # draw split map (work on a local copy so split_map is not mutated
        # across batch iterations, and clip the blend back into uint8 range)
        if split_map is not None:
            imgH, imgW = sample_vis.shape[:2]
            smap = (split_map * 255).astype(np.uint8)
            smap = cv2.applyColorMap(smap, cv2.COLORMAP_JET)
            smap = cv2.resize(smap, (imgW, imgH), interpolation=cv2.INTER_NEAREST)
            sample_vis = np.clip(smap * 0.9 + sample_vis, 0, 255).astype(np.uint8)
        # save image
        if vis_dir is not None:
            # eliminate the invalid (padded) area
            valid_area = torch.where(~masks[idx])
            valid_h, valid_w = valid_area[0].max(), valid_area[1].max()
            sample_vis = sample_vis[:valid_h + 1, :valid_w + 1]
            name = targets[idx]['image_path'].split('/')[-1].split('.')[0]
            cv2.imwrite(os.path.join(vis_dir, '{}_gt{}_pred{}.jpg'.format(name, len(gts[idx]), len(pred[idx]))), sample_vis)


# training
def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
                    data_loader: Iterable, optimizer: torch.optim.Optimizer,
                    device: torch.device, epoch: int, max_norm: float = 0,
                    silent: bool = False):  # `silent` suppresses per-iteration progress logs
    model.train()
    criterion.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    # when silent, use a huge print frequency so the logger never prints progress
    print_freq = sys.maxsize if silent else 10
    for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
        samples = samples.to(device)
        # move every tensor field of each target onto the device
        targets = [{k: v.to(device) if hasattr(v, 'to') else v
                    for k, v in t.items()} for t in targets]
        outputs = model(samples, epoch=epoch, train=True,
                        criterion=criterion, targets=targets)
        loss_dict, weight_dict, losses = outputs['loss_dict'], outputs['weight_dict'], outputs['losses']
        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        loss_dict_reduced_unscaled = {f'{k}_unscaled': v
                                      for k, v in loss_dict_reduced.items()}
        loss_dict_reduced_scaled = {k: v * weight_dict[k]
                                    for k, v in loss_dict_reduced.items() if k in weight_dict}
        losses_reduced_scaled = sum(loss_dict_reduced_scaled.values())
        loss_value = losses_reduced_scaled.item()
        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)
        optimizer.zero_grad()
        losses.backward()
        if max_norm > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        optimizer.step()
        metric_logger.update(loss=loss_value, **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
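

# Illustrative call sketch (assumed names, not defined in this file): per the
# module docstring, a driver loop in main.py would look roughly like
#
#     for epoch in range(args.start_epoch, args.epochs):
#         stats = train_one_epoch(model, criterion, data_loader_train,
#                                 optimizer, device, epoch,
#                                 max_norm=args.clip_max_norm)
#         lr_scheduler.step()
#
# where `args`, `model`, `criterion`, `data_loader_train`, `optimizer`, and
# `lr_scheduler` come from the setup code in main.py.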


def get_pred_points(prob_map, offset_map, prob_thresh=0.5, eps=3, min_samples=2, min_cluster_size=2):
    """
    Recover predicted head-center points from prob_map + offset_map.
    Args:
        prob_map: Tensor [H, W], per-pixel foreground probability
        offset_map: Tensor [2, H, W], offset vectors (dx, dy)
        prob_thresh: probability threshold for foreground pixels
        eps: DBSCAN neighborhood radius
        min_samples: DBSCAN minimum samples per core point
        min_cluster_size: drop clusters smaller than this
    Returns:
        pred_points: list of (x, y) predicted center points
    """
    H, W = prob_map.shape
    # 1. foreground pixels
    mask = prob_map > prob_thresh
    ys, xs = torch.where(mask)
    if len(xs) == 0:
        return []
    # 2. apply offsets so each pixel votes for a center location
    dx = offset_map[0, ys, xs]
    dy = offset_map[1, ys, xs]
    cx = xs + dx
    cy = ys + dy
    coords = torch.stack([cx, cy], dim=1).cpu().numpy()
    # drop non-finite votes, keeping pixel indices aligned with coords
    finite = np.isfinite(coords).all(axis=1)
    coords = coords[finite]
    ys, xs = ys[finite], xs[finite]
    if coords.shape[0] == 0:
        return []
    # 3. DBSCAN clustering merges duplicate votes
    clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(coords)
    labels = clustering.labels_
    pred_points = []
    for lab in set(labels):
        if lab == -1:  # ignore noise points
            continue
        cluster_mask = (labels == lab)
        cluster_pts = coords[cluster_mask]
        # filter out clusters that are too small
        if len(cluster_pts) < min_cluster_size:
            continue
        cluster_probs = prob_map[ys[cluster_mask], xs[cluster_mask]]
        # confidence-weighted cluster center (avoids the bias of a plain mean)
        weights = (cluster_probs / (cluster_probs.sum() + 1e-6)).cpu().numpy()
        cx = np.sum(cluster_pts[:, 0] * weights)
        cy = np.sum(cluster_pts[:, 1] * weights)
        pred_points.append((cx, cy))
    return pred_points
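

# Minimal sketch of the expected shapes for get_pred_points. The synthetic maps
# below (one 3x3 high-confidence blob, zero offsets) are stand-ins for real
# model outputs; the _demo name is illustrative only.
def _demo_get_pred_points():
    prob_map = torch.zeros(32, 32)
    prob_map[10:13, 20:23] = 0.9            # one 3x3 blob of foreground pixels
    offset_map = torch.zeros(2, 32, 32)     # zero offsets: pixels vote for themselves
    points = get_pred_points(prob_map, offset_map, prob_thresh=0.5)
    print(points)  # one weighted center near (21.0, 11.0), in (x, y) order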


def get_pred_points_mixed(prob_map, offset_map=None, prob_thresh=0.5, sigma=1.0,
                          eps=3, min_samples=2, min_cluster_size=2, mode="union"):
    """
    Mixed strategy: combine local-maximum detection on prob_map with
    offset_map voting.
    Args:
        prob_map: Tensor [H, W], per-pixel probability
        offset_map: Tensor [2, H, W] or None, offset vectors (dx, dy)
        prob_thresh: float, probability threshold
        sigma: Gaussian smoothing parameter
        eps, min_samples: DBSCAN parameters
        min_cluster_size: drop clusters smaller than this
        mode: 'union', 'intersect', or 'prob_only'
            - union: union of both methods
            - intersect: intersection of both methods
            - prob_only: prob_map peaks only (ignores offsets)
    Returns:
        pred_points: list of (x, y)
    """
    prob_map_cpu = prob_map.detach().cpu().numpy()
    # === 1. Gaussian smoothing + threshold ===
    prob_smooth = ndimage.gaussian_filter(prob_map_cpu, sigma=sigma)
    mask = prob_smooth > prob_thresh
    # === 2. local-maximum detection ===
    local_max = (prob_smooth == ndimage.maximum_filter(prob_smooth, size=3))
    peaks = local_max & mask
    ys_peak, xs_peak = np.where(peaks)
    peak_points = [(xs_peak[i], ys_peak[i]) for i in range(len(xs_peak))]
    # if only prob_map is used, the peaks are the answer
    if mode == "prob_only" or offset_map is None:
        return peak_points
    # === 3. offset_map voting ===
    mask_tensor = prob_map > prob_thresh
    ys, xs = torch.where(mask_tensor)
    offset_points = []
    if len(xs) > 0:
        dx = offset_map[0, ys, xs]
        dy = offset_map[1, ys, xs]
        cx = xs + dx
        cy = ys + dy
        coords = torch.stack([cx, cy], dim=1).cpu().numpy()
        # drop non-finite votes, keeping pixel indices aligned with coords
        finite = np.isfinite(coords).all(axis=1)
        coords = coords[finite]
        ys, xs = ys[finite], xs[finite]
        if coords.shape[0] > 0:
            # DBSCAN clustering merges duplicate votes
            clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(coords)
            labels = clustering.labels_
            for lab in set(labels):
                if lab == -1:  # noise
                    continue
                cluster_mask = (labels == lab)
                cluster_pts = coords[cluster_mask]
                if len(cluster_pts) < min_cluster_size:
                    continue
                cluster_probs = prob_map[ys[cluster_mask], xs[cluster_mask]]
                weights = (cluster_probs / (cluster_probs.sum() + 1e-6)).cpu().numpy()
                cx = np.sum(cluster_pts[:, 0] * weights)
                cy = np.sum(cluster_pts[:, 1] * weights)
                offset_points.append((cx, cy))
    # === 4. fusion strategy ===
    if mode == "union":
        pred_points = peak_points + offset_points
    elif mode == "intersect":
        pred_points = []
        for px, py in peak_points:
            for ox, oy in offset_points:
                if np.hypot(px - ox, py - oy) < eps:
                    pred_points.append(((px + ox) / 2, (py + oy) / 2))
    else:
        pred_points = peak_points
    return pred_points
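

# Sketch comparing the fusion modes of get_pred_points_mixed on the same
# synthetic blob as above; the _demo name and inputs are illustrative only.
# Note that 'union' simply concatenates the two point lists, so it can contain
# near-duplicate points when both methods fire on the same head.
def _demo_get_pred_points_mixed():
    prob_map = torch.zeros(32, 32)
    prob_map[10:13, 20:23] = 0.9
    offset_map = torch.zeros(2, 32, 32)
    for mode in ("prob_only", "union", "intersect"):
        pts = get_pred_points_mixed(prob_map, offset_map, prob_thresh=0.5, mode=mode)
        print(mode, len(pts))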


# evaluation
@torch.no_grad()
def evaluate(model, data_loader, device, epoch=0, vis_dir=None):
    model.eval()
    prob_thresh = 0.5
    metric_logger = utils.MetricLogger(delimiter="  ")
    header = 'Test:'
    if vis_dir is not None:
        os.makedirs(vis_dir, exist_ok=True)
    print_freq = 10
    for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
        samples = samples.to(device)
        img_h, img_w = samples.tensors.shape[-2:]
        # inference
        outputs = model(samples, test=True, targets=targets)
        outputs_points = outputs['pred_points'][0]
        outputs_offset_map = outputs['offset_map'][0]
        prob_map = outputs['prob_map'][0, 0]
        threshold = 0.5
        # 1. Gaussian smoothing (optional)
        prob_map_cpu = prob_map.detach().cpu().numpy()
        prob_smooth = ndimage.gaussian_filter(prob_map_cpu, sigma=1.0)
        # 2. threshold filtering
        mask = prob_smooth > threshold
        # 3. local-maximum detection
        local_max = (prob_smooth == ndimage.maximum_filter(prob_smooth, size=3))
        peaks = local_max & mask
        # alternative count via connected components, kept for reference;
        # scipy's label() does not count the background, so no correction is needed
        binary_map = (prob_map_cpu > threshold).astype(np.uint8)
        structure = np.ones((3, 3), dtype=np.int32)  # 8-connected
        labeled_map, num_clusters = label(binary_map, structure=structure)
        # predicted counts from the three strategies
        predict_cnt1 = int(peaks.sum())
        pred_points = get_pred_points(prob_map, outputs_offset_map, prob_thresh=prob_thresh)
        predict_cnt = len(pred_points)
        pred_points2 = get_pred_points_mixed(prob_map, outputs_offset_map, prob_thresh=prob_thresh, mode="union")
        predict_cnt2 = len(pred_points2)
        gt_cnt = targets[0]['points'].shape[0]
        print(f'peaks: {predict_cnt1}, offset votes: {predict_cnt}, mixed: {predict_cnt2}, gt: {gt_cnt}')
        # compute error (the mixed count is the one that is scored)
        mae = abs(predict_cnt2 - gt_cnt)
        mse = (predict_cnt2 - gt_cnt) * (predict_cnt2 - gt_cnt)
        # record results
        results = {}
        toTensor = lambda x: torch.tensor(x, dtype=torch.float32, device=device)
        results['mae'], results['mse'] = toTensor(mae), toTensor(mse)
        results_reduced = utils.reduce_dict(results)
        metric_logger.update(mae=results_reduced['mae'], mse=results_reduced['mse'])
        # visualize predictions
        if vis_dir:
            points = [[point[0] * img_h, point[1] * img_w] for point in outputs_points]  # recover absolute coordinates
            split_map = (outputs['split_map_raw'][0].detach().cpu().squeeze(0) > 0.5).float().numpy()
            visualization(samples, targets, [points], vis_dir, split_map=split_map)
    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    results = {k: meter.global_avg for k, meter in metric_logger.meters.items()}
    results['mse'] = np.sqrt(results['mse'])  # report RMSE
    return results
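

# Run the illustrative demos above when this module is executed directly;
# importing the module from main.py leaves them untouched.
if __name__ == "__main__":
    _demo_denormalize()
    _demo_get_pred_points()
    _demo_get_pred_points_mixed()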