import torch
from PIL import Image
from IPython import display
import pandas as pd
import torchvision
from torchvision import transforms
import numpy as np
import matplotlib.pyplot as plt

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Load the model from the PyTorch Hub
model = torch.hub.load('pytorch/vision:v0.10.0', 'mobilenet_v2',
                       weights='MobileNet_V2_Weights.DEFAULT', verbose=False)

# Put model in evaluation mode
model.eval()

# Put the model on a GPU if available, otherwise CPU
model.to(device);

# Define the transforms for preprocessing
preprocess = transforms.Compose([
    transforms.Resize(256),          # Resize the shorter side to 256 pixels
    transforms.CenterCrop(224),      # Crop the image to 224x224 about the center
    transforms.ToTensor(),           # Convert the image to a PyTorch tensor
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406], # Normalize with the ImageNet mean values
        std=[0.229, 0.224, 0.225]   # Normalize with the ImageNet standard deviations
    )
])

# Invert the normalization so processed tensors can be viewed as images again
unnormalize = transforms.Normalize(
    mean=[-m / s for m, s in zip([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])],
    std=[1 / s for s in [0.229, 0.224, 0.225]]
)

def tensor_to_pil(img_tensor):
    # img_tensor: pre-processed tensor resulting from preprocess(img).unsqueeze(0)
    unnormed_tensor = unnormalize(img_tensor)
    return transforms.functional.to_pil_image(unnormed_tensor[0])

# Load labels
with open("../data/labels.txt", 'r') as f:
    labels = [label.strip() for label in f.readlines()]

# Load an example image
img = Image.open("../data/dog.jpg")
plt.imshow(img)
plt.axis('off')
plt.show()

# Preprocess the image
img_tensor = preprocess(img).unsqueeze(0)
print(f"Inputs information:\n---------------\nshape: {img_tensor.shape}\n")

# Move the sample to the right device
img_tensor = img_tensor.to(device)

with torch.no_grad():
    output = model(img_tensor)

print(f"Image tensor on device:\n---------------\n{img_tensor.device}\n")
print(f"Inputs information:\n---------------\nshape: {img_tensor.shape}\nclass: {type(img_tensor)}\n")
print(f"Shape of outputs:\n---------------\n{output.shape}\n")
print(f"Pred Index:\n---------------\n{output[0].argmax()}\n")
print(f"Pred Label:\n---------------\n{labels[output[0].argmax()]}\n")

unnormed_img_tensor = unnormalize(img_tensor)
img_pil = transforms.functional.to_pil_image(unnormed_img_tensor[0])
img_pil.show()
Untargeted Attack
# Create a fresh img_tensor
img_tensor = preprocess(img).unsqueeze(0).to(device)

n_masks = 1000
eta = 0.005

# Generate a collection of n_masks random masks with the same shape as img_tensor,
# drawn from a standard normal and scaled down by eta
mask_collection = torch.randn((n_masks, *img_tensor.shape)).to(device) * eta

current_mask = torch.zeros_like(img_tensor).to(device)

starting_index = model(img_tensor).argmax(1)
print(f"Starting index is:\n---------------\n{starting_index}\n")

starting_class_score = model(img_tensor + current_mask)[0, starting_index.item()].item()
print(f"Starting class score is:\n---------------\n{starting_class_score}\n")

# Zero our current mask
current_mask = torch.zeros_like(img_tensor).to(device)

# Get our starting label index
starting_label = model(img_tensor).argmax(1).item()
current_label = starting_label

# Get our starting confidence score
best_score = model(img_tensor + current_mask)[0, starting_label].item()

# Run until we reclassify successfully ...
while current_label == starting_label:
    # Select a random mask from the collection we created
    mask_candidate_idx = np.random.choice(len(mask_collection))
    mask_candidate = mask_collection[mask_candidate_idx]

    # Don't store gradient information while doing inference
    with torch.no_grad():
        # Get the scores for the image if we updated the current mask to
        # use the candidate we just randomly picked
        output = model(img_tensor + current_mask + mask_candidate)

    # Select the most probable label from the output of the model with the
    # candidate mask in play
    current_label = output.argmax(1).item()

    # Our score is the score of the original label: we want this to go _down_,
    # i.e. the starting label to become less likely
    new_score = output[0, starting_label].item()

    # If the candidate neither flipped the label nor decreased the score of the
    # class we're trying to get away from, skip back to the beginning and pick
    # another candidate. (A candidate that flips the label is always kept, so
    # current_mask ends up containing the mask that caused the misclassification.)
    if current_label == starting_label and new_score >= best_score:
        continue

    # If we got here, we decreased the score of the true class; this means the
    # score of some other class went up, and once the true label's score is low
    # enough, another class will be the maximum and we will have successfully
    # misclassified.

    # Write some monitoring for dopamine
    print(f"Best score is: {best_score:4.6f} -- prediction is: {current_label} ", end='\r', flush=True)

    # Update our current score
    best_score = new_score

    # And update our mask
    current_mask += mask_candidate

print(f"\n\nWinner winner: {labels[output[0].argmax()]} ({output[0].argmax()})")

with torch.no_grad():
    output = model(img_tensor)
    masked = model(img_tensor + current_mask)

    probs = torch.softmax(output, dim=1)[0][output[0].argmax()].item()
    mask_probs = torch.softmax(masked, dim=1)[0][masked[0].argmax()].item()

    prediction = labels[output[0].argmax()]
    mask_prediction = labels[masked[0].argmax()]

    unnormed_img_tensor = unnormalize(img_tensor)
    unnormed_mask_tensor = unnormalize(img_tensor + current_mask)

    img_pil = transforms.functional.to_pil_image(unnormed_img_tensor[0])
    masked_pil = transforms.functional.to_pil_image(unnormed_mask_tensor[0])

plt.figure(figsize=(10, 5))  # Adjust the figsize as needed

plt.subplot(1, 2, 1)
plt.imshow(img_pil)
plt.title(f"Original Image\nPrediction: {prediction}, Probability: {probs:.2f}")
plt.axis('off')

plt.subplot(1, 2, 2)
plt.imshow(masked_pil)
plt.title(f"Masked Image\nPrediction: {mask_prediction}, Probability: {mask_probs:.2f}")
plt.axis('off')

plt.show()

difference = unnormed_mask_tensor - unnormed_img_tensor
difference_list = difference.view(-1).tolist()

# Plot the histogram
plt.hist(difference_list, bins=100)
plt.xticks(rotation=45)
plt.show()

print("Minimum and maximum difference between the images:",
      difference.min().item(), difference.max().item())
Purpose: This code implements an adversarial attack to misclassify an image by applying a perturbation (mask) to it, using a random selection of masks and iterating until the model's classification changes.
Setup: The image is preprocessed and moved to the device (GPU/CPU). A collection of random masks is generated to be applied to the image during the attack.
Initial classification: The model's initial prediction (starting class) is obtained, and the score for this class is recorded.
Adversarial perturbation: In a loop, a random mask is selected from the collection and added to the current mask (initially zero). The model's output is evaluated to check if the score of the original class decreases. If it does, the current mask is updated; otherwise, the loop retries with a different mask.
Success criteria: The process continues until the model misclassifies the image, meaning the score for the original class is low enough for another class to become the model’s prediction.
Final results: Once the attack succeeds, the predicted class for the perturbed image is printed. The original and perturbed images are displayed, and the difference between them is plotted in a histogram.
In summary, the code performs an iterative process to find a perturbation that successfully changes the model's classification of the image.
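Because the loop only stops once the label flips, it can run for a long time on a hard input. As a minimal sketch, the same random-search attack can be packaged with an iteration cap; the helper name untargeted_random_attack and the max_steps budget are illustrative additions, not part of the lab code:

def untargeted_random_attack(model, img_tensor, mask_collection, max_steps=20000):
    # Random-search evasion: keep candidate masks that lower the score of the
    # model's original prediction; stop as soon as the predicted label changes.
    current_mask = torch.zeros_like(img_tensor)
    with torch.no_grad():
        starting_label = model(img_tensor).argmax(1).item()
        best_score = model(img_tensor)[0, starting_label].item()
    for _ in range(max_steps):
        candidate = mask_collection[np.random.choice(len(mask_collection))]
        with torch.no_grad():
            output = model(img_tensor + current_mask + candidate)
        if output.argmax(1).item() != starting_label:
            return current_mask + candidate   # label flipped: attack succeeded
        new_score = output[0, starting_label].item()
        if new_score < best_score:            # original class became less likely
            best_score = new_score
            current_mask += candidate
    return None                               # no success within the step budget

Called as untargeted_random_attack(model, img_tensor, mask_collection), it returns the successful mask, or None if the step budget runs out.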
Exercise
That was an untargeted attack; what about a targeted attack?
Modify the above code to perform a targeted adversarial evasion: make the German Shepherd look like a robin.
Hint: we're making our starting class score go down in the example above -- how could you make the target class score go up instead?
Solution
Provided Code
The code below is provided in the lab and must be run for the exercise solution to work.
n_masks = 1000
eta = 0.005

# Generate a tensor that is a collection of "masks":
# n_masks tensors with the same shape as img_tensor, with values of
# mean 0 and variance 1, scaled down by eta
mask_collection = torch.randn((n_masks, *img_tensor.shape)).to(device) * eta

# Initial mask with the shape of img_tensor and values of 0
current_mask = torch.zeros_like(img_tensor).to(device)

# Compute our starting index
starting_index = model(img_tensor).argmax(1)
print(f"Starting index is:\n---------------\n{starting_index}\n")

starting_class_score = model(img_tensor + current_mask)[0, starting_index.item()].item()
print(f"Starting class score is:\n---------------\n{starting_class_score}\n")
Targeted Attack
# Zero our current mask
current_mask = torch.zeros_like(img_tensor).to(device)

# Get our starting label index
starting_label = model(img_tensor).argmax(1).item()
current_label = starting_label

# Target class index
target_index = labels.index('robin')

# Get our starting confidence score for the target class
best_score = model(img_tensor + current_mask)[0, target_index].item()

# Run until we hit the target class ...
while current_label != target_index:
    # Select a random mask from the collection we created
    mask_candidate_idx = np.random.choice(len(mask_collection))
    mask_candidate = mask_collection[mask_candidate_idx]

    # Don't store gradient information while doing inference
    with torch.no_grad():
        output = model(img_tensor + current_mask + mask_candidate)

    # Based on our mask addition, get our new label and updated target score
    current_label = output.argmax(1).item()
    new_score = output[0, target_index].item()

    # If we haven't hit our target yet and also didn't improve the target score,
    # just move on to another candidate. (A candidate that produces the target
    # label is always kept.)
    if current_label != target_index and new_score <= best_score:
        continue

    # Write some monitoring for dopamine
    print(f"Best score is: {best_score:4.6f} -- pred score is: {output[0, current_label].item()} -- prediction is: {current_label} ", end='\r', flush=True)

    # Update our current score
    best_score = new_score

    # And update our mask
    current_mask += mask_candidate

print(f"\n\nWinner winner: {labels[output[0].argmax()]}")
Purpose: This code aims to misclassify an image by applying a targeted adversarial perturbation until the model predicts a specific target class ("robin").
Setup: The current_mask is initialized to zeros and applied to the original image tensor. The starting class label and the target label index ("robin") are set, and the model's confidence score for the target class is recorded.
Iterative mask application: In a loop, a random mask from the pre-generated collection is chosen and temporarily added to the current mask. The model's output for the modified image is then evaluated, specifically checking whether the score for the target label increased.
Score evaluation: If the updated score for the target label is higher than the previous best score, the current_mask is updated with the selected mask, and the process continues.
Success criteria: The loop continues until the model classifies the image as the target label ("robin").
Result: Once the model misclassifies the image as the target class, the attack succeeds, and the final prediction is printed.
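To sanity-check the targeted result visually, the comparison from the untargeted section can be reused; a minimal sketch using the tensor_to_pil helper and the variables defined above:

with torch.no_grad():
    masked = model(img_tensor + current_mask)

# Confidence and label for the perturbed image
mask_probs = torch.softmax(masked, dim=1)[0][masked[0].argmax()].item()
mask_prediction = labels[masked[0].argmax()]

# Display the perturbed image with its (targeted) prediction
masked_pil = tensor_to_pil(img_tensor + current_mask)
plt.imshow(masked_pil)
plt.title(f"Masked Image\nPrediction: {mask_prediction}, Probability: {mask_probs:.2f}")
plt.axis('off')
plt.show()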