"""Score positive vs. negative captions for each image with CLIP.

Reads a JSON file whose top-level ``samples`` list holds entries with the
keys ``imgId``, ``imgName``, ``pos_captions`` and ``neg_captions``.  For every
sample the first positive and first negative caption are compared against the
image, and a tab-separated report is written with, per image, the probability
assigned to the negative caption and the negative/positive probability ratio.
"""
import argparse
import json
import os

import clip
import numpy as np
import torch
from PIL import Image
from torchvision.datasets import CIFAR100


def _parse_args():
    """Parse the command-line options (input JSON path and output report path)."""
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--input_json_file", help="input_json_file", type=str,
                           default="outputs/img_captions.json", required=False)
    argparser.add_argument("--output_file", help="output_file", type=str,
                           default="outputs/coco_clip_sim.txt", required=False)
    return argparser.parse_args()


def _caption_probs(model, image_input, text_inputs):
    """Return the softmax over the captions for a single image as Python floats.

    The softmax is taken over ``logits_per_image`` along the last dimension,
    so the values follow the order in which the captions were tokenized.
    """
    # no_grad keeps the forward pass graph-free so the result can be moved
    # to numpy without detaching.
    with torch.no_grad():
        logits_per_image, _ = model(image_input, text_inputs)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()[0]
    return [float(p) for p in probs]


def _mean_or_nan(values):
    """Return the mean of ``values``, or NaN for an empty list (no numpy warning)."""
    return float(np.mean(values)) if values else float("nan")


def main():
    """Run CLIP over every sample and write the per-image and average scores."""
    args = _parse_args()

    # Load the model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load('ViT-B/32', device)

    count = 0
    neg_prob_list, ratio_list = [], []
    with open(args.output_file, "w") as f_out:
        f_out.write("{}\t{}\t{}\n".format("img_id", "neg_score", "ratio"))
        with open(args.input_json_file) as f:
            data = json.load(f)

        for sample in data['samples']:
            img_id = sample['imgId']
            file_name = sample['imgName']
            pos_captions = sample['pos_captions']
            neg_captions = sample['neg_captions']

            # Only the first caption of each kind is scored; skip samples that
            # have none instead of crashing with an IndexError.
            if not pos_captions or not neg_captions:
                print("Missing captions for image: ", file_name)
                continue
            pos_caption = pos_captions[0]
            neg_caption = neg_captions[0]

            # Prepare the inputs.  Unreadable images are skipped (best effort);
            # the context manager closes the file handle once preprocessed.
            try:
                with Image.open(file_name) as img:
                    image_input = preprocess(img).unsqueeze(0).to(device)
            except Exception as err:
                print("Can't open file: ", file_name, err)
                continue
            text_inputs = clip.tokenize([pos_caption, neg_caption]).to(device)

            # Index 0 is the positive caption, index 1 the negative one
            # (same order as passed to clip.tokenize above).
            pos_prob, neg_prob = _caption_probs(model, image_input, text_inputs)

            neg_score = round(neg_prob, 2)
            # Guard against a positive probability that underflowed to zero.
            ratio = round(neg_prob / pos_prob, 2) if pos_prob > 0 else float("inf")
            neg_prob_list.append(neg_score)
            ratio_list.append(ratio)

            f_out.write("{}\t{:.2f}\t{:.2f}\n".format(img_id, neg_score, ratio))
            count += 1

        f_out.write("\nAvg prob of neg scores: {:.2f}".format(_mean_or_nan(neg_prob_list)))
        f_out.write("\nAvg prob of ratio: {:.2f}".format(_mean_or_nan(ratio_list)))

    print("Number of images predicted: ", count)


if __name__ == "__main__":
    main()