import os
import clip
import torch
from PIL import Image

# Inputs
data_dir = "/vulcanscratch/lzhao/data/DSDI20/frames/"
images = ["Shot12_001.jpg", "Shot11_025.jpg", "Shot14_010.jpg"]
features = ["damage", "flooding or water damage", "landslide", "road washout",
            "rubble or debris", "smoke or fire", "dirt", "grass", "lava",
            "rocks", "sand", "shrubs", "snow or ice", "trees", "bridge",
            "building", "dam or levee", "pipes",
            "utility or power lines or electric towers", "railway",
            "wireless or radio communication towers", "water tower",
            "aircraft", "boat", "car", "truck", "flooding", "lake or pond",
            "ocean", "puddle", "river or stream", "road"]

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device)

# Tokenize and encode the label prompts once; they are identical for every image,
# so there is no need to recompute them inside the loop.
text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in features]).to(device)
with torch.no_grad():
    text_features = model.encode_text(text_inputs)
text_features /= text_features.norm(dim=-1, keepdim=True)

for image_name in images:
    image_path = os.path.join(data_dir, image_name)

    # Prepare and encode the image
    image_input = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
    with torch.no_grad():
        image_features = model.encode_image(image_input)

    # Pick the top 10 most similar labels for the image
    image_features /= image_features.norm(dim=-1, keepdim=True)
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    values, indices = similarity[0].topk(10)

    # Print the result
    print(f"\nTop predictions for image {image_name}:\n")
    for value, index in zip(values, indices):
        print(f"{features[index]:>16s}: {100 * value.item():.2f}%")
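
# Optional variant, a minimal sketch: encode all images in one batch instead of
# one forward pass per file. This is not part of the original script; it assumes
# every file in `images` exists under `data_dir` and fits in device memory at
# once. It reuses the `text_features` computed above and only standard
# CLIP/PyTorch calls (torch.stack, model.encode_image).
batch = torch.stack(
    [preprocess(Image.open(os.path.join(data_dir, name))) for name in images]
).to(device)
with torch.no_grad():
    batch_features = model.encode_image(batch)  # shape: (len(images), 512) for ViT-B/32
batch_features /= batch_features.norm(dim=-1, keepdim=True)
batch_similarity = (100.0 * batch_features @ text_features.T).softmax(dim=-1)
for name, sims in zip(images, batch_similarity):
    values, indices = sims.topk(10)
    print(f"\nTop predictions for image {name}:\n")
    for value, index in zip(values, indices):
        print(f"{features[index]:>16s}: {100 * value.item():.2f}%")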