from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer
import collections
import json


def create_vocab(json_file_list):
    """Collect the set of caption tokens across a list of MSR-VTT annotation files."""
    vocab = set()
    for json_file in json_file_list:
        with open(json_file) as f:
            data = json.load(f)
        for sent in data['sentences']:
            caption = sent['caption']
            tokens = caption.strip().split()
            # TODO: text processing
            vocab.update(tokens)
    return vocab


def wordnet_categories(vocab, output_json_file):
    """Group vocabulary words by the WordNet lexicographer file (lexname) of their
    most frequent verb or noun sense, stemming each word before storing it."""
    ps = PorterStemmer()
    categories = collections.defaultdict(set)
    pos_list = [wn.VERB, wn.NOUN]
    for word in vocab:
        for pos in pos_list:
            syns = wn.synsets(word, pos=pos)
            if syns:
                # wn.synsets orders senses by frequency, so syns[0] is the most frequent
                lexname = syns[0].lexname()
                categories[lexname].add(ps.stem(word))
    # Sets are not JSON-serializable; rebuild as a plain dict of sorted lists.
    categories = {lexname: sorted(words) for lexname, words in categories.items()}
    with open(output_json_file, 'w') as f:
        json.dump(categories, f)


vocab = create_vocab(["/vulcanscratch/lzhao/data/MSRVTT/test_videodatainfo.json"])
wordnet_categories(vocab, output_json_file="test_wordnet_categories.json")
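
# Optional sanity check (a sketch, not part of the original script): load the
# category file written above and print the largest lexname buckets. The path
# matches the output_json_file used in the call above; the sort order and the
# number of sample stems printed are illustrative choices.
with open("test_wordnet_categories.json") as f:
    cats = json.load(f)
for lexname, stems in sorted(cats.items(), key=lambda kv: len(kv[1]), reverse=True):
    print(f"{lexname}: {len(stems)} stems, e.g. {stems[:3]}")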