from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer
import collections
import json


def create_vocab(json_file_list):
    """Collect the set of caption tokens across a list of MSR-VTT annotation files."""
    vocab = set()
    for json_file in json_file_list:
        with open(json_file) as f:
            data = json.load(f)
        for sent in data['sentences']:
            caption = sent['caption']
            tokens = caption.strip().split()
            # TODO: text processing
            vocab.update(tokens)
    return vocab


def wordnet_categories(vocab, output_json_file):
    """Group vocabulary words by the WordNet lexicographer file (lexname) of their
    most frequent verb or noun sense, stemming each word before storing it."""
    ps = PorterStemmer()
    categories = collections.defaultdict(set)
    pos_list = [wn.VERB, wn.NOUN]
    for word in vocab:
        for pos in pos_list:
            syns = wn.synsets(word, pos=pos)
            if syns:
                # wn.synsets orders senses by frequency, so syns[0] is the most frequent
                lexname = syns[0].lexname()
                categories[lexname].add(ps.stem(word))
    # Sets are not JSON-serializable; rebuild as a plain dict of sorted lists.
    categories = {lexname: sorted(words) for lexname, words in categories.items()}
    with open(output_json_file, 'w') as f:
        json.dump(categories, f)


vocab = create_vocab(["/vulcanscratch/lzhao/data/MSRVTT/test_videodatainfo.json"])
wordnet_categories(vocab, output_json_file="test_wordnet_categories.json")
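
# Optional sanity check (a sketch, not part of the original script): load the
# category file written above and print the largest lexname buckets. The path
# matches the output_json_file used in the call above; the sort order and the
# number of sample stems printed are illustrative choices.
with open("test_wordnet_categories.json") as f:
    cats = json.load(f)
for lexname, stems in sorted(cats.items(), key=lambda kv: len(kv[1]), reverse=True):
    print(f"{lexname}: {len(stems)} stems, e.g. {stems[:3]}")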