Poor performance of ProjectedGradientDescentPyTorch on a simple model and dataset #2590

@Tuyki

Description

Describe the bug
The success rate of ProjectedGradientDescentPyTorch turns out to be fairly low (around 50%) on an almost linearly separable dataset with a simple classifier. Success here means that the attack increases the loss on the perturbed sample (see the loop at the end of the script below).

To Reproduce
Relevant dependencies:

Package                        Version
------------------------------ --------------
adversarial-robustness-toolbox 1.19.0
numpy                          2.2.1
scikit-learn                   1.6.0
scipy                          1.15.0
torch                          2.5.1
torchvision                    0.20.1

Code

import logging

import numpy as np
import matplotlib.pyplot as plt
import torch
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from art.estimators.classification import PyTorchClassifier
from art.attacks.evasion import ProjectedGradientDescentPyTorch

class MLP(torch.nn.Module):
    def __init__(
        self, input_dim, hidden_dim, output_dim, output_activation=None
    ):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_dim, hidden_dim)
        self.relu1 = torch.nn.Tanh()  # note: named "relu1" but actually a Tanh activation
        self.fc2 = torch.nn.Linear(hidden_dim, output_dim)
        self.output_activation = output_activation

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        if self.output_activation is not None:
            out = self.output_activation(out)
        return out

def train(
    n_epochs,
    model,
    optimizer,
    criterion,
    train_dataloader,
    device,
):
    model.to(device)
    model.train()

    for epoch in range(n_epochs):
        epoch_loss = 0.0
        for _, data in enumerate(train_dataloader):
            inputs, labels = data
            optimizer.zero_grad()
            outputs = model(inputs.to(device))
            loss = criterion(outputs, labels.to(device))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        epoch_loss = epoch_loss / len(train_dataloader)
        logging.info("Epoch %d Loss %f", epoch, epoch_loss)

    return model

np.random.seed(123)
torch.manual_seed(123)

x, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_classes=2,
    n_clusters_per_class=1,
    random_state=37,
)

device = 'cpu'

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=123
)
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

x_train = x_train.astype("float32")
x_test = x_test.astype("float32")

y_train = y_train[:, None]
y_test = y_test[:, None]

x_train, y_train, x_test, y_test = (
    torch.Tensor(z).to(device) for z in [x_train, y_train, x_test, y_test]
)

train_dataloader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(x_train, y_train),
    batch_size=32,
    shuffle=True,
)

plt.scatter(x_train[:, 0], x_train[:, 1], c=y_train.numpy().ravel(), alpha=0.5)
plt.show()

model = MLP(input_dim=2, hidden_dim=5, output_dim=1, output_activation=torch.sigmoid)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.BCELoss()

model = train(
    10,
    model,
    optimizer,
    criterion,
    train_dataloader,
    device
)
pred = model(x_test)

# clean accuracy of the trained model on the test set
print((y_test.numpy().reshape(-1) == (pred.detach().numpy().reshape(-1) > 0.5)).mean())

epsilon = 0.05
alpha = 0.001
steps = 1000

classifier = PyTorchClassifier(
    model=model,
    clip_values=(0, 1),
    loss=criterion,
    optimizer=optimizer,
    input_shape=(2,),
    nb_classes=2,
    device_type=device
)

attack = ProjectedGradientDescentPyTorch(
    estimator=classifier,
    norm=2,  # numeric 2; ART documents 1, 2, np.inf or "inf" rather than the string '2'
    eps=epsilon,
    eps_step=alpha,
    max_iter=steps,
    targeted=False,
    batch_size=8,
)
success = []

for _ in range(100):

    # draw one random test sample; single-bracket indexing keeps shape (1, 2)
    sample_idx = np.random.choice(x_test.shape[0], 1)

    sample_x = x_test[sample_idx]
    sample_y = y_test[sample_idx]

    benchmark_adv_x = attack.generate(x=sample_x.numpy())

    # count the attack as successful if it increased the loss on this sample
    success.append(
        (
            criterion(model(torch.tensor(benchmark_adv_x)), sample_y)
            > criterion(model(sample_x), sample_y)
        ).item()
    )

print(np.array(success).mean())
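
If success is instead defined as a flipped prediction (arguably the more common criterion for an evasion attack), the check inside the loop could be swapped for something like this sketch, reusing sample_x and benchmark_adv_x from above:

# alternative success criterion: count only predictions that actually flip
with torch.no_grad():
    clean_label = (model(sample_x) > 0.5).float()
    adv_label = (model(torch.tensor(benchmark_adv_x)) > 0.5).float()
success.append(bool((adv_label != clean_label).item()))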

Expected behavior
Even in the untargeted setting, PGD should in my opinion achieve a success rate well above 0.5. Since a vanilla PGD implementation achieved 1.0 with exactly the same configuration (a sketch of what I compared against follows below), I believe I am probably not using the API correctly.
I would very much appreciate any investigation, explanation, or support.
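
For reference, here is a minimal sketch of the kind of vanilla L2 PGD loop I compared against (my own reconstruction rather than the exact script; model, criterion, epsilon, alpha, and steps are assumed to be defined as above):

import torch

def vanilla_l2_pgd(model, criterion, x, y, eps, alpha, steps):
    """Untargeted L2 PGD: ascend the loss, then project back into the eps-ball."""
    x_orig = x.detach()
    x_adv = x_orig.clone()
    for _ in range(steps):
        x_adv.requires_grad_(True)
        loss = criterion(model(x_adv), y)
        grad = torch.autograd.grad(loss, x_adv)[0]
        with torch.no_grad():
            # step along the L2-normalized gradient
            grad_norm = grad.flatten(1).norm(dim=1).clamp_min(1e-12).view(-1, 1)
            x_adv = x_adv + alpha * grad / grad_norm
            # project back onto the L2 ball of radius eps around the original input
            delta = x_adv - x_orig
            delta_norm = delta.flatten(1).norm(dim=1).clamp_min(1e-12).view(-1, 1)
            x_adv = x_orig + delta * (eps / delta_norm).clamp(max=1.0)
            x_adv = x_adv.clamp(0, 1)  # respect the same clip_values as the classifier
    return x_adv.detach()

Something along these lines, used in place of attack.generate, reached the success rate of 1.0 in my comparison.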

System information (please complete the following information):

  • OS: Ubuntu
  • Python version: 3.11.5
  • ART version or commit number: 1.19.0
  • PyTorch: 2.5.1
