"""Download every image referenced by ``eccv_train.json`` into ``images_dir``.

The JSON file maps image names to metadata dicts that carry a ``'url'``
entry.  Each URL is fetched and the bytes are written to ``images_dir``
under the image name with ``/`` replaced by ``_``.  Images that cannot be
fetched are reported on stdout and skipped.
"""

import json
import os
import pprint  # unused by the download loop; kept for ad-hoc inspection of entries
import urllib.request
from socket import timeout
from urllib.error import HTTPError, URLError

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic; keep the script usable without it.
    def tqdm(iterable, *args, **kwargs):
        """Fallback used when tqdm is not installed: return *iterable* as is."""
        return iterable


dataset_dir = "/cfarhomes/lzhao/data/IncidentsDataset/data/"
images_dir = "/vulcanscratch/lzhao/data/IncidentsDataset/images/train/"

# Per-request timeout handed to ``urlopen``, in seconds.
DOWNLOAD_TIMEOUT_S = 300


def fetch_image(url, timeout_s=DOWNLOAD_TIMEOUT_S):
    """Return the raw bytes served at *url*.

    The response is used as a context manager so the underlying connection
    is closed even when ``read()`` fails part-way through.

    Raises:
        HTTPError, URLError: the URL could not be opened.
        socket.timeout: the server stopped responding while reading.
    """
    with urllib.request.urlopen(url, timeout=timeout_s) as response:
        return response.read()


def download_images(dataset, images_dir, timeout_s=DOWNLOAD_TIMEOUT_S):
    """Download every image listed in *dataset* into *images_dir*.

    Args:
        dataset: mapping of image name -> dict with a ``'url'`` entry.
        images_dir: existing directory that receives the image files.
        timeout_s: per-request timeout in seconds.

    Returns:
        ``(count, download_count)``: number of entries visited and number
        of images actually written to disk.
    """
    count, download_count = 0, 0
    for image_name in tqdm(dataset):
        print(image_name, flush=True)
        count += 1

        # Looked up inside the ``try`` so that an entry without a 'url' key
        # is reported and skipped instead of crashing inside a handler.
        url = None
        try:
            url = dataset[image_name]['url']
            img_file = fetch_image(url, timeout_s)
        except (HTTPError, URLError) as error:
            print("Url can't open: ", url, error, flush=True)
            continue
        except timeout:
            print("Socket timeout: ", url, flush=True)
            continue
        except Exception as error:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C
            # (KeyboardInterrupt) and SystemExit still stop the whole job.
            print("Unexpected error: ", url, repr(error), flush=True)
            continue

        # Image names may contain '/', which cannot appear in a file name.
        new_img_name = image_name.replace("/", "_")
        with open(os.path.join(images_dir, new_img_name), "wb") as out_f:
            out_f.write(img_file)
        download_count += 1

    return count, download_count


def main():
    """Load the training-split JSON and download all images it references."""
    with open(os.path.join(dataset_dir, "eccv_train.json"), "r") as fp:
        dataset = json.load(fp)

    count, download_count = download_images(dataset, images_dir)

    print("Number of images in json file: ", count)
    print("Number of valid downloaded images ", download_count)


if __name__ == "__main__":
    main()