# ssl/dino.py

### refer to https://github.com/facebookresearch/dino/blob/main/main_dino.py ###

import copy
import math
import random
import warnings
from argparse import Namespace
from functools import wraps

import torch
import torch.nn.functional as F
from torch import nn

from ssl.base import BaseSelfSupervisedModel
from models import get_backbone_class


def _get_module_device(module):
    """Return the device of the first parameter yielded by ``module.parameters()``.

    Raises ``StopIteration`` when the module has no parameters at all.
    """
    first_param = next(iter(module.parameters()))
    return first_param.device


def _set_requires_grad(model, val):
    """Set ``requires_grad`` to ``val`` on every parameter of ``model``."""
    for param in model.parameters():
        param.requires_grad = val


class EMA:
    """Exponential-moving-average rule with decay factor ``beta``.

    ``beta`` is stored as a plain attribute, so it can be reassigned between
    updates (e.g. by a schedule).
    """

    def __init__(self, beta):
        super().__init__()
        self.beta = beta

    def update_average(self, old, new):
        """Blend ``new`` into ``old``: ``old * beta + (1 - beta) * new``.

        When ``old`` is ``None`` there is no running value yet, so ``new`` is
        returned unchanged.
        """
        if old is not None:
            return old * self.beta + (1 - self.beta) * new
        return new


def _update_moving_average(ema_updater, ma_model, current_model):
    """Move ``ma_model`` towards ``current_model`` using ``ema_updater``.

    Parameters are processed first, then buffers. Tensors are paired
    positionally, and each averaged tensor's ``.data`` is rebound to
    ``ema_updater.update_average(averaged.data, live.data)``.

    NOTE(review): every buffer is blended with the same rule as the
    parameters, regardless of what it stores -- confirm this is intended for
    non-float buffers such as counters.
    """
    blend = ema_updater.update_average
    paired_groups = (
        zip(ma_model.parameters(), current_model.parameters()),
        zip(ma_model.buffers(), current_model.buffers()),
    )
    for pairs in paired_groups:
        for averaged, live in pairs:
            averaged.data = blend(averaged.data, live.data)


def _no_grad_trunc_normal_(tensor, mean, std, a, b):
    """Fill ``tensor`` in place with normal(``mean``, ``std``) samples limited to [a, b].

    Cut & paste from PyTorch official master until it's in a few official
    releases - RW.
    Method based on
    https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf

    A warning is issued when ``mean`` lies more than two ``std`` outside
    ``[a, b]``. All tensor work runs under ``torch.no_grad()``. Returns the
    same ``tensor`` object.
    """
    sqrt_two = math.sqrt(2.)

    def std_normal_cdf(z):
        # Cumulative distribution function of the standard normal.
        return (1. + math.erf(z / sqrt_two)) / 2.

    mean_far_outside = (mean < a - 2 * std) or (mean > b + 2 * std)
    if mean_far_outside:
        warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
                      "The distribution of values may be incorrect.",
                      stacklevel=2)

    with torch.no_grad():
        # CDF values at the two truncation bounds.
        cdf_low = std_normal_cdf((a - mean) / std)
        cdf_high = std_normal_cdf((b - mean) / std)

        # 1. uniform draw in [2*cdf_low - 1, 2*cdf_high - 1]
        # 2. inverse error function -> truncated standard normal
        # 3. scale and shift to the requested std / mean
        # 4. clamp so every value is inside [a, b]
        (tensor.uniform_(2 * cdf_low - 1, 2 * cdf_high - 1)
               .erfinv_()
               .mul_(std * sqrt_two)
               .add_(mean)
               .clamp_(min=a, max=b))
    return tensor


def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
    # type: (Tensor, float, float, float, float) -> Tensor
    """Fill ``tensor`` in place via ``_no_grad_trunc_normal_`` and return it.

    Defaults give mean 0, std 1, with the bounds ``a=-2`` and ``b=2``.
    """
    return _no_grad_trunc_normal_(tensor, mean=mean, std=std, a=a, b=b)


class DINOHead(nn.Module):
    """Projection head: an MLP, L2 normalisation, then a weight-normalised linear layer.

    Args:
        in_dim: size of the incoming feature vector.
        out_dim: size of the final output.
        use_bn: insert ``BatchNorm1d`` after each hidden linear layer.
        norm_last_layer: when True, the gain (``weight_g``) of the last layer
            is frozen at 1.
        nlayers: number of linear layers in the MLP (values below 1 are
            treated as 1).
        hidden_dim: width of the hidden layers.
        bottleneck_dim: output width of the MLP, i.e. input width of the last layer.
    """

    def __init__(self, in_dim, out_dim, use_bn=False, norm_last_layer=True, nlayers=3, hidden_dim=2048, bottleneck_dim=256):
        super().__init__()
        depth = max(nlayers, 1)
        if depth == 1:
            self.mlp = nn.Linear(in_dim, bottleneck_dim)
        else:
            # depth - 1 hidden blocks (Linear [+ BN] + GELU), then the bottleneck Linear.
            stack = []
            fan_in = in_dim
            for _ in range(depth - 1):
                stack.append(nn.Linear(fan_in, hidden_dim))
                if use_bn:
                    stack.append(nn.BatchNorm1d(hidden_dim))
                stack.append(nn.GELU())
                fan_in = hidden_dim
            stack.append(nn.Linear(hidden_dim, bottleneck_dim))
            self.mlp = nn.Sequential(*stack)

        # Custom init is applied before ``last_layer`` exists, so it only
        # touches the MLP's linear layers.
        self.apply(self._init_weights)

        final = nn.Linear(bottleneck_dim, out_dim, bias=False)
        self.last_layer = nn.utils.weight_norm(final)
        self.last_layer.weight_g.data.fill_(1)
        if norm_last_layer:
            self.last_layer.weight_g.requires_grad = False

    def _init_weights(self, m):
        """Truncated-normal (std 0.02) weights and zero bias for ``nn.Linear``; other modules are skipped."""
        if not isinstance(m, nn.Linear):
            return
        trunc_normal_(m.weight, std=.02)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Run the MLP, L2-normalise along the last dim, then apply the last layer."""
        hidden = self.mlp(x)
        unit = nn.functional.normalize(hidden, dim=-1, p=2)
        return self.last_layer(unit)
    

class NetWrapper(nn.Module):
    """Backbone followed by a ``DINOHead`` projector.

    Args:
        net: backbone module; must expose ``final_feat_dim`` (the width of its output).
        out_dim: output size of the projector.
        norm_last_layer: forwarded to ``DINOHead``.
    """

    def __init__(self, net, out_dim, norm_last_layer=True):
        super().__init__()
        self.net = net
        # Remaining DINOHead arguments keep their defaults
        # (follow the original paper hyperparameter).
        self.projector = DINOHead(net.final_feat_dim, out_dim, norm_last_layer=norm_last_layer)

    def forward(self, x):
        """Project ``x`` through the backbone and the head.

        ``x`` is either a single batch or a list of batches (multi-crop
        strategy). For a list, each element goes through the backbone
        separately and the outputs are concatenated along dim 0 in list order
        before a single pass through the projector.
        """
        if isinstance(x, list):  # use multi-crop strategy
            # Concatenate once instead of growing a tensor inside the loop:
            # this avoids re-copying the accumulated result for every crop and
            # does not depend on torch.cat's special handling of an empty
            # 1-D seed tensor. An empty list now fails at torch.cat instead of
            # inside the projector.
            representation = torch.cat([self.net(crop) for crop in x])
        else:
            representation = self.net(x)  # presumably (batch_size, emb_dim)
        return self.projector(representation)  # presumably (batch_size, out_dim)


class DINO(BaseSelfSupervisedModel):
    """DINO-style self-distillation model.

    An online (student) encoder is trained to match the centered, sharpened
    output of a target (teacher) encoder. The target encoder receives no
    gradients and is updated as a moving average of the online encoder.

    Fields read from ``params``: ``initial_ema``, ``initial_temp``,
    ``temp_warmup_epochs``, ``ema_scheduler``, ``local_crops_number`` and
    ``model``.
    """

    def __init__(self, backbone: nn.Module, params: Namespace):
        super().__init__(backbone, params)

        # Temperatures and momenta.
        self.student_temp = 0.1
        self.center_momentum = 0.9
        self.initial_teacher_temp = params.initial_temp
        self.teacher_temp = params.initial_temp  # initial value of 0.04 works well and last value above 0.07 is unstable
        self.temp_warmup_epochs = params.temp_warmup_epochs  # default = 30
        self.moving_average_decay = params.initial_ema
        self.ema_scheduler = params.ema_scheduler  # default == True
        self.local_crops_number = params.local_crops_number
        out_dim = 65536

        # Online encoder: a freshly built backbone (drop_path_rate=0.1) plus projector.
        backbone_drop = get_backbone_class(params.model)(drop_path_rate=0.1)
        self.online_encoder = NetWrapper(backbone_drop, out_dim, norm_last_layer=False)

        # Target encoder: wraps the backbone passed by the caller; never trained by gradient.
        self.target_encoder = NetWrapper(backbone, out_dim)
        _set_requires_grad(self.target_encoder, False)
        self.target_ema_updater = EMA(self.moving_average_decay)

        # Running center subtracted from the target output.
        self.register_buffer('center', torch.zeros(1, out_dim))

        # Move the whole wrapper onto the device the backbone already lives on.
        self.to(_get_module_device(backbone))

    def _update_moving_average(self):
        """Pull the target encoder towards the online encoder with the EMA updater."""
        assert self.target_encoder is not None, 'target encoder has not been created yet'
        _update_moving_average(self.target_ema_updater, self.target_encoder, self.online_encoder)

    @torch.no_grad()
    def update_center(self, output1, output2):
        """
        Update center used for teacher output.

        The mean over dim 0 of the two concatenated outputs is blended into
        ``self.center`` with ``self.center_momentum``.
        """
        stacked = torch.cat([output1, output2], dim=0)
        batch_center = torch.mean(stacked, dim=0, keepdim=True)

        # ema update
        momentum = self.center_momentum
        self.center = self.center * momentum + batch_center * (1 - momentum)

    def _data_parallel(self):
        """Wrap both encoders in ``nn.DataParallel``."""
        self.online_encoder = nn.DataParallel(self.online_encoder)
        self.target_encoder = nn.DataParallel(self.target_encoder)

    def compute_ssl_loss(self, x1, x2, return_features=False):
        """Return the cross-entropy between target and online outputs, averaged over view pairs.

        When ``x2`` is a list (multi-crop strategy) the online encoder sees
        ``x1`` plus all tensors of ``x2``, while the target encoder sees only
        ``x1``. Otherwise both encoders see ``x1`` and ``x2`` concatenated
        along dim 0. The target output is split into two halves, which are
        also used to update ``self.center`` before returning.

        ``return_features`` is accepted but not used here.
        """
        if isinstance(x2, list):  # use multi-crop strategy
            student_input = [x1, torch.cat(x2, dim=0)]
            teacher_input = x1
        else:
            both_views = torch.cat([x1, x2], dim=0)
            student_input = both_views
            teacher_input = both_views

        assert not (self.training and x1.shape[
            0] == 1), 'you must have greater than 1 sample when training, due to the batchnorm in the projection layer'

        student_out = self.online_encoder(student_input)
        with torch.no_grad():
            teacher_out = self.target_encoder(teacher_input)

        # target centering and sharpening
        teacher_one, teacher_two = teacher_out.chunk(2)
        teacher_probs = [
            F.softmax((half - self.center) / self.teacher_temp, dim=-1)
            for half in (teacher_one, teacher_two)
        ]

        student_views = (student_out / self.student_temp).chunk(self.local_crops_number + 2)

        total_loss, n_loss_terms = 0, 0
        for teacher_idx, probs in enumerate(teacher_probs):
            for student_idx, view in enumerate(student_views):
                if student_idx == teacher_idx:
                    # skip cases where student and teacher operate on the same view
                    continue
                pair_loss = torch.sum(-probs * F.log_softmax(view, dim=-1), dim=-1)
                total_loss += pair_loss.mean()
                n_loss_terms += 1
        total_loss /= n_loss_terms

        # The center is refreshed here, from the raw (un-centered) target outputs.
        self.update_center(teacher_one, teacher_two)

        return total_loss

    def on_step_end(self):
        """Apply one moving-average update to the target encoder."""
        self._update_moving_average()

    def on_epoch_end(self, cur_epoch, tot_epoch):
        """Update the EMA decay and the teacher temperature, and print both.

        With ``ema_scheduler`` set, the decay follows a cosine from
        ``moving_average_decay`` (at ``cur_epoch == 0``) up to 1 (at
        ``cur_epoch == tot_epoch``). With a truthy ``temp_warmup_epochs``, the
        teacher temperature rises linearly from ``initial_teacher_temp`` to
        0.07 over that many epochs and stays at 0.07 afterwards.
        """
        if self.ema_scheduler:
            self.target_ema_updater.beta = (
                1 - (1 - self.moving_average_decay) * (math.cos(math.pi*cur_epoch/tot_epoch) + 1) / 2
            )
        print('EMA beta: {}'.format(self.target_ema_updater.beta))

        warmup = self.temp_warmup_epochs
        if warmup:
            if cur_epoch < warmup:  # linear scheduling
                self.teacher_temp = 0.07 - (0.07 - self.initial_teacher_temp) * (1 - cur_epoch/warmup)
            else:
                self.teacher_temp = 0.07
        print('Teacher temperature: {}'.format(self.teacher_temp))