import os

import clip
import torch
from PIL import Image

# Inputs
data_dir = "/vulcanscratch/lzhao/data/DSDI20/frames_v2/"
features = [
    "damage", "flooding or water damage", "landslide", "road washout",
    "rubble or debris", "smoke or fire", "dirt", "grass", "lava", "rocks",
    "sand", "shrubs", "snow or ice", "trees", "bridge", "building",
    "dam or levee", "pipes", "utility or power lines or electric towers",
    "railway", "wireless or radio communication towers", "water tower",
    "aircraft", "boat", "car", "truck", "flooding", "lake or pond", "ocean",
    "puddle", "river or stream", "road",
]
output_file = "/vulcanscratch/lzhao/data/DSDI20/predictions/frames_top_preds.txt"

# Load the CLIP model and its matching image preprocessing transform
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device)

# The label prompts are identical for every image, so tokenize and encode
# them once up front instead of re-encoding them on every loop iteration.
text_inputs = torch.cat(
    [clip.tokenize(f"a photo of a {c}") for c in features]
).to(device)
with torch.no_grad():
    text_features = model.encode_text(text_inputs)
text_features /= text_features.norm(dim=-1, keepdim=True)

with open(output_file, "w") as f_out:
    # Assumes data_dir contains only image files that PIL can open.
    for image in os.listdir(data_dir):
        image_name = os.path.splitext(image)[0].lower()

        # Prepare the image input
        image_input = (
            preprocess(Image.open(os.path.join(data_dir, image)))
            .unsqueeze(0)
            .to(device)
        )

        # Encode and normalize the image features
        with torch.no_grad():
            image_features = model.encode_image(image_input)
        image_features /= image_features.norm(dim=-1, keepdim=True)

        # Rank every label by cosine similarity, softmax-scaled to probabilities
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        values, indices = similarity[0].topk(len(features))

        # Write one line of ranked labels and one line of matching scores (%)
        feature_list = [features[index] for index in indices]
        feature_scores = [str(round(100 * value.item(), 2)) for value in values]
        f_out.write("{} {}\n".format(image_name, ", ".join(feature_list)))
        f_out.write("{} {}\n".format("score_" + image_name, ", ".join(feature_scores)))