From 00ae676a09540615c00a657f25a08016c51b0dda Mon Sep 17 00:00:00 2001
From: lu60ruhy <pavlo.beylin@fau.de>
Date: Tue, 5 Oct 2021 14:43:35 +0200
Subject: [PATCH] Add cosine similarity matrix calculation for YOLO predictions
 for all classes.

---
 CSM.py  | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 main.py |  39 +++++++++--
 2 files changed, 233 insertions(+), 4 deletions(-)
 create mode 100644 CSM.py

diff --git a/CSM.py b/CSM.py
new file mode 100644
index 00000000..f852a462
--- /dev/null
+++ b/CSM.py
@@ -0,0 +1,198 @@
+import torch
+import torch.nn.functional as F
+
+
+def calc_yolo_csms(imgs_and_preds: list,
+                   sign: bool = True,
+                   rescale: bool = True) -> tuple:
+    '''
+    computes the cosine similarity matrices for the given YOLO predictions
+
+    Parameters
+    ---------
+    imgs_and_preds: iterable of tuples (img, pred, frame, x1, y1, x2, y2), one per YOLO prediction
+    sign: use the sign of the gradients to calculate the cosine similarity matrices
+    rescale: rescale the logits before applying softmax -> avoids gradient obfuscation caused by large logits
+
+    Returns
+    ---------
+    imgs: list of the image crops belonging to the predictions with a valid csm
+    csms: list of the corresponding cosine similarity matrices
+    '''
+
+    csms = []  # cosine similarity matrices, one per prediction
+    imgs = []  # image crops belonging to the predictions
+
+    for tup in imgs_and_preds:
+        img, pred, frame, x1, y1, x2, y2 = tup
+        if not img.requires_grad:
+            img.requires_grad_()
+        logit = pred[5:]
+
+        # rescale network output to avoid gradient obfuscation
+        if rescale:
+            logit = logit / torch.max(torch.abs(logit)) * 10
+
+        classes = len(logit)
+
+        deltas = []
+        for c in range(classes):
+            #  calculate loss and compute gradient w.r.t. the input of the current class
+            y = torch.ones(1, device="cuda", dtype=torch.long) * c
+            loss = F.cross_entropy(logit.unsqueeze(0), y)
+            frame_grad = torch.autograd.grad(loss, frame, retain_graph=True)[0]
+            img_grad = frame_grad[int(y1):int(y2), int(x1):int(x2), :]
+
+            #  take sign of gradient as in the original paper
+            if sign:
+                img_grad = torch.sign(img_grad)
+
+            deltas.append(img_grad.clone().detach())
+
+        deltas = torch.stack(deltas)
+        #  compute cosine similarity matrices
+
+        try:
+            deltas = torch.max(deltas, dim=-3).values  # take only the maximum value over all channels to compute the cosine similarity
+            deltas = deltas.view(classes, 1, -1)
+            norm = torch.norm(deltas, p=2, dim=2, keepdim=True)
+            deltas = deltas / norm
+            deltas = deltas.transpose(0, 1)
+            csm = torch.matmul(deltas, deltas.transpose(1, 2))
+        except Exception as e:
+            print("error")
+            raise e
+
+        #  division by zero can lead to NaNs
+        if torch.isnan(csm).any():
+            # raise Exception("NaNs in CSM!")
+            print("NaNs in csm")
+        else:
+            print(f"mean delta: {deltas.mean()}")
+            # only keep crop/csm pairs with a valid csm so that imgs and csms stay aligned
+            imgs.append(img)
+            csms.append(csm)
+    return imgs, csms
+
+
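+# A minimal usage sketch for calc_yolo_csms (illustrative only): assume `frame` is the input
+# image tensor the detections were computed on (and requires gradients) and `raw_results`
+# holds the raw YOLO predictions of shape (num_preds, 5 + num_classes):
+#
+#   imgs_and_preds = []
+#   for pred in raw_results:
+#       x1, y1, x2, y2, conf = pred[:5].float()
+#       crop = frame[int(y1):int(y2), int(x1):int(x2), :]
+#       imgs_and_preds.append((crop, pred, frame, x1, y1, x2, y2))
+#   imgs, csms = calc_yolo_csms(imgs_and_preds, sign=True, rescale=True)
+
+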
+def calc_csm(model: torch.nn.Module,
+             X: torch.Tensor,
+             sign: bool = True,
+             rescale: bool = True) -> torch.Tensor:
+    '''
+    computes the cosine similarity map for the given input images X
+
+    Parameters
+    ---------
+    model: torch model
+    X: torch tensor; shape: (Batch_Size, Channels, Width, Height)
+    sign: use the sign of the gradients to calculate the cosine similarity maps
+    rescale: rescale the logits before applying softmax -> avoids gradient obfuscation caused by large logits
+
+    Returns
+    ---------
+    csm: cosine similarity matrices; shape: (Batch_Size, Classes, Classes)
+    '''
+
+    deltas = []  # saliency maps w.r.t. all possible output classes
+    if not X.requires_grad:
+        X.requires_grad_()
+
+    logits = model(X)  # network output
+
+    # rescale network output to avoid gradient obfuscation
+    if rescale:
+        logits = logits / torch.max(torch.abs(logits), 1, keepdim=True).values * 10
+
+    B = logits.shape[0]  # batch size
+    classes = logits.shape[-1]  # output classes
+
+    for c in range(classes):
+        #  calculate loss and compute gradient w.r.t. the input of the current class
+        y = torch.ones(B, device="cuda", dtype=torch.long) * c
+        loss = F.cross_entropy(logits, y)
+        grad = torch.autograd.grad(loss, X, retain_graph=True)[0]
+
+        #  take sign of gradient as in the original paper
+        if sign:
+            grad = torch.sign(grad)
+        deltas.append(grad.detach().clone())
+
+    model.zero_grad()
+    deltas = torch.stack(deltas, dim=0)
+
+    deltas = torch.max(deltas,
+                       dim=-3).values  # take only the maximum value of all channels to compute the cosine similarity
+
+    #  compute cosine similarity matrices
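+    # flatten each per-class gradient map into a vector, L2-normalize it and take all pairwise
+    # dot products, so that csm[b, i, j] is the cosine similarity between the class-i and
+    # class-j gradients for image b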
+    deltas = deltas.view(classes, B, -1)
+    norm = torch.norm(deltas, p=2, dim=2, keepdim=True)
+    deltas = deltas / norm
+    deltas = deltas.transpose(0, 1)
+    csm = torch.matmul(deltas, deltas.transpose(1, 2))
+
+    #  division by zero can lead to NaNs
+    if torch.isnan(csm).any():
+        raise Exception("NaNs in CSM!")
+    return csm
+
+
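+# A minimal usage sketch for calc_csm, assuming an ordinary CUDA image classifier
+# (torchvision's resnet18 and the input size are illustrative, not part of this repo):
+#
+#   import torchvision
+#   model = torchvision.models.resnet18().cuda().eval()
+#   X = torch.rand(4, 3, 224, 224, device="cuda")
+#   csm = calc_csm(model, X, sign=True, rescale=True)  # csm.shape == (4, 1000, 1000)
+
+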
+def calc_csm_partial_network(model_first_part: torch.nn.Module,
+                             model_second_part: torch.nn.Module,
+                             X: torch.Tensor,
+                             sign: bool = True,
+                             rescale: bool = True,
+                             scalar_product: bool = False) -> torch.Tensor:
+    '''
+    computes the cosine similarity map for the given input images X, taking the gradients
+    w.r.t. the intermediate features produced by model_first_part
+
+    Parameters
+    ---------
+    model_first_part / model_second_part: the torch model split into two sequential parts
+    X: torch tensor; shape: (Batch_Size, Channels, Width, Height)
+    sign: use the sign of the gradients to calculate the cosine similarity maps
+    rescale: rescale the logits before applying softmax -> avoids gradient obfuscation caused by large logits
+    scalar_product: skip the normalization and return plain scalar products instead of cosine similarities
+
+    Returns
+    ---------
+    csm: cosine similarity matrices; shape: (Batch_Size, Classes, Classes)
+    '''
+
+    deltas = []  # saliency maps w.r.t. all possible output classes
+
+    pre_ultimate_output = model_first_part(X)
+    pre_ultimate_output.requires_grad_()
+    logits = model_second_part(pre_ultimate_output)  # network output
+
+    # rescale network output to avoid gradient obfuscation
+    if rescale:
+        logits = logits / torch.max(torch.abs(logits), 1, keepdim=True).values * 10
+
+    B = logits.shape[0]  # batch size
+    classes = logits.shape[-1]  # output classes
+
+    for c in range(classes):
+        #  calculate loss and compute gradient w.r.t. the input of the current class
+        y = torch.ones(B, device="cuda", dtype=torch.long) * c
+        loss = F.cross_entropy(logits, y)
+        grad = torch.autograd.grad(loss, pre_ultimate_output, retain_graph=True)[0]
+
+        #  take sign of gradient as in the original paper
+        if sign:
+            grad = torch.sign(grad)
+        deltas.append(grad.detach().clone())
+
+    deltas = torch.stack(deltas, dim=0)
+
+    #  compute cosine similarity matrices
+    deltas = deltas.view(classes, B, -1)
+    norm = torch.norm(deltas, p=2, dim=2, keepdim=True)
+    if not scalar_product:
+        deltas = deltas / norm
+
+    deltas = deltas.transpose(0, 1)
+    csm = torch.matmul(deltas, deltas.transpose(1, 2))
+
+    #  division by zero can lead to NaNs
+    if torch.isnan(csm).any():
+        raise Exception("NaNs in CSM!")
+    return csm
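+
+
+# A minimal usage sketch for calc_csm_partial_network, assuming a classifier that can be split
+# into a feature extractor and a final linear head (the resnet18 split is illustrative, not
+# part of this repo):
+#
+#   import torchvision
+#   model = torchvision.models.resnet18().cuda().eval()
+#   backbone = torch.nn.Sequential(*list(model.children())[:-1], torch.nn.Flatten(1))
+#   head = list(model.children())[-1]
+#   X = torch.rand(4, 3, 224, 224, device="cuda")
+#   csm = calc_csm_partial_network(backbone, head, X)  # csm.shape == (4, 1000, 1000)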
diff --git a/main.py b/main.py
index 7bed43ab..9513509c 100644
--- a/main.py
+++ b/main.py
@@ -9,6 +9,7 @@ import math
 import matplotlib
 from torch import optim
 
+import CSM
 import models
 from models.common import Detections
 from utils.external import TotalVariation
@@ -129,6 +130,7 @@ def bb_intersection_over_union(boxA, boxB):
     # return the intersection over union value
     return iou
 
+
 def save_image(image):
     print("save image called!")
     im = transforms.ToPILImage('RGB')(image)
@@ -136,6 +138,7 @@ def save_image(image):
     plt.show()
     im.save(f"saved_patches/{time.time()}.jpg")
 
+
 def get_best_prediction(true_box, res, cls_nr):
     min_distance = float("inf")
     max_iou = float(0)
@@ -149,10 +152,32 @@ def get_best_prediction(true_box, res, cls_nr):
             max_iou = pred_iou
             best_prediction = pred[cls_nr + 5]
 
-    print(f"max found iou: {max_iou}")
+    # print(f"max found iou: {max_iou}")
+
     return max_iou, best_prediction
 
 
+def calculate_csms(frame, predictions):
+    '''builds (crop, prediction, frame, box) tuples for all predictions and returns their image crops and cosine similarity matrices'''
+    imgs_and_preds = []
+
+    for pred in predictions:
+        x1, y1, x2, y2, conf = pred[:5].float()
+
+        pred_img_section = frame.flip(2)[int(y1):int(y2), int(x1):int(x2), :]
+        tup = (pred_img_section, pred, frame, x1, y1, x2, y2)
+        # print(tup)
+        imgs_and_preds.append(tup)
+
+        # if conf > 0.8:
+        #     cls = classes[int(pred[5:].argmax())]
+        #     print(f"{cls}: {conf} - {pred[:5].float()}")
+        #     show(frame.flip(2)[int(y1):int(y2), int(x1):int(x2), :] / 255.)
+        #     print("done")
+
+    imgs, csms = CSM.calc_yolo_csms(imgs_and_preds)
+
+    return imgs, csms
+
+
 if __name__ == "__main__":
     # init
     patch_transformer = PatchTransformer().cuda()
@@ -209,8 +234,8 @@ if __name__ == "__main__":
     pred = -1
     frame_read = False
     fix_frame = False
-    patch_transformer.maxangle = 5/180 * math.pi
-    patch_transformer.minangle = - 5/180 * math.pi
+    patch_transformer.maxangle = 5 / 180 * math.pi
+    patch_transformer.minangle = - 5 / 180 * math.pi
     loss = None
     while True:
         if not (fix_frame and frame_read):
@@ -257,6 +282,12 @@ if __name__ == "__main__":
                     # debug_preds()
                     pass
 
+                # calculate the cosine similarity matrices for all raw predictions
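+                # each csms[i] has shape (1, num_classes, num_classes); entry [0, a, b] is the cosine
+                # similarity between the signed input gradients for classes a and b, restricted to the
+                # crop of prediction i (see CSM.calc_yolo_csms)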
+                imgs, csms = calculate_csms(frame, raw_results)
+                for i in range(len(imgs)):
+                    show(imgs[i])
+                    show(csms[i])
+
                 iou, pred = get_best_prediction(bounding_box, raw_results, 15)  # get cat
                 # iou, pred = get_best_prediction(bounding_box, raw_results, 0)  # get personal
                 # iou, pred = get_best_prediction(bounding_box, raw_results, 12)  # get parking meter
@@ -296,7 +327,7 @@ if __name__ == "__main__":
             # sgn_grads = torch.sign(optimizer.param_groups[0]['params'][0].grad)
             # optimizer.param_groups[0]['params'][0].grad = sgn_grads
             # optimizer.step()
-            patch.data -= torch.sign(gradient_sum) * 0.001 # * 0 # TODO reactivate
+            patch.data -= torch.sign(gradient_sum) * 0.001  # * 0 # TODO reactivate
             patch.data = patch.detach().clone().clamp(MIN_THRESHOLD, 0.99999).data
             gradient_sum = 0
 
-- 
GitLab