import subprocess
from collections import Counter
import math
import re

import nltk
from nltk.corpus.reader import wordnet

def _locate_wordnet():
    """Return the location of the WordNet corpus, fetching it if absent.

    ``nltk.data.find`` signals a missing corpus with ``LookupError``; in
    that case the corpus is downloaded once and the lookup is repeated.
    A second failure propagates to the caller.
    """
    try:
        return nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
        return nltk.data.find('corpora/wordnet')


# Module-level corpus location and reader used by the functions below.
wordnet_path = _locate_wordnet()
wn = wordnet.WordNetCorpusReader(wordnet_path)

def reduce_sense(sense):
    """Collect the distinct lowercase words that describe a sense.

    Words are taken from ``sense.definition`` and from every string in
    ``sense.examples``.  A word is a run of ASCII letters delimited by
    word boundaries, so digits and punctuation are dropped.  Lemma names
    are not part of the bag.

    Returns a new ``set`` of strings.
    """
    word_pattern = r'\b[A-Za-z]+\b'
    texts = [sense.definition]
    texts.extend(sense.examples)
    words = set()
    for text in texts:
        words.update(re.findall(word_pattern, text.lower()))
    return words

def get_idf_scores():
    """Compute an inverse-document-frequency score for every gloss word.

    Every WordNet synset is treated as one document whose words are the
    set returned by ``reduce_sense``.  A word's score is
    ``-log(synsets_containing_word / total_synsets)`` (natural log), so
    words that occur in fewer synsets receive higher scores.

    Returns:
        A ``Counter`` mapping each word to its score.  Being a
        ``Counter``, it yields 0 for words that were never seen.
    """
    doc_counts = Counter()
    total_synsets = 0
    # Count the synsets during the same pass that tallies the words, so the
    # corpus is not traversed (and copied into a list) a second time.
    for synset in wn.all_synsets():
        doc_counts.update(reduce_sense(synset))
        total_synsets += 1
    total = float(total_synsets)
    # Replace each document count with its IDF score in place; only values
    # change, so iterating over the keys while assigning is safe.
    for word in doc_counts:
        doc_counts[word] = -1 * math.log(float(doc_counts[word]) / total)
    return doc_counts

# Prefer a precomputed table from the optional ``idf_scores`` module (running
# this file as a script writes ``idf_scores.py`` next to it).  When that
# module, or its ``scores`` name, cannot be imported, rebuild the table from
# WordNet instead; this visits every synset.
try:
    from idf_scores import scores as IDF
except ImportError:
    IDF = get_idf_scores()


def bag_of_words(sense):
    """Build the extended word bag for a sense.

    The bag starts with the words of the sense itself (see
    ``reduce_sense``) and is then widened with the words of every synset
    directly reachable through the relations listed below.

    Returns a ``set`` of lowercase words.
    """
    relation_names = (
        'hypernyms', 'instance_hypernyms',
        'hyponyms', 'instance_hyponyms',
        'member_holonyms', 'substance_holonyms',
        'part_holonyms', 'member_meronyms',
        'substance_meronyms', 'part_meronyms',
        'topic_domains', 'region_domains',
        'usage_domains', 'attributes',
        'entailments', 'causes',
        'also_sees', 'verb_groups',
        'similar_tos',
    )
    words = reduce_sense(sense)
    for relation in relation_names:
        # Each relation is a zero-argument method returning related synsets.
        for neighbour in getattr(sense, relation)():
            words.update(reduce_sense(neighbour))
    return words

def relatedness(sense_one,sense_two):
    """Score how related two senses are by their weighted word overlap.

    The extended word bags of both senses are intersected and the IDF
    weights of the shared words are summed.  With no shared words the
    result is 0.
    """
    shared_words = bag_of_words(sense_one) & bag_of_words(sense_two)
    return sum(IDF[word] for word in shared_words)

def get_semantic_predictability(word_sense,context_senses,debug=False,style='A'):
    """Score how strongly a sense is related to a collection of context senses.

    Args:
        word_sense: a synset, or a sense string resolved through
            ``to_wordnet_sense``.
        context_senses: sized collection of synsets and/or sense strings;
            entries that are ``None`` or fail to resolve add nothing.
        debug: accepted but not used by this function.
        style: with ``'A'`` the summed relatedness is divided by
            ``len(context_senses)`` (unresolved entries still count toward
            the length); any other value returns the plain sum.

    Returns:
        ``None`` when ``word_sense`` is ``None`` or does not resolve,
        otherwise a float.  An empty ``context_senses`` gives ``0.0``.
    """
    target = word_sense
    if isinstance(target, str):
        target = to_wordnet_sense(target)
    if target is None:
        return None

    total = 0.0
    for candidate in context_senses:
        if isinstance(candidate, str):
            candidate = to_wordnet_sense(candidate)
        if candidate is not None:
            total += relatedness(target, candidate)

    if style != 'A':
        return total
    count = len(context_senses)
    if count == 0:
        # Nothing to average over: keep the untouched sum.
        return total
    return float(total) / float(count)

def to_wordnet_sense(sense_string):
    """Resolve a sense string to a synset via ``wn.synset``.

    Returns ``None`` when the lookup raises ``WordNetError``; other
    exceptions propagate.
    """
    try:
        synset = wn.synset(sense_string)
    except wordnet.WordNetError:
        synset = None
    return synset



def disambiguate_sense(word,cat,context,to_string=False):
    """Pick the sense of ``word`` whose gloss best overlaps ``context``.

    Each candidate synset is scored by summing the IDF weight of every
    item of ``context`` that occurs among the words of the synset's
    definition and examples (a context word that is repeated counts each
    time).

    Args:
        word: lemma to look up.
        cat: part of speech, passed as ``pos`` to ``wn.synsets``.
        context: iterable of context words.  ``reduce_sense`` lowercases
            gloss words, so context words presumably need to be lowercase
            to match -- verify against callers.
        to_string: when true, return the chosen synset's ``name`` instead
            of the synset object.

    Returns:
        ``None`` if ``word`` has no synsets for ``cat``.  Otherwise the
        highest-scoring synset (or its name); when no synset scores above
        zero, or on ties, the earliest synset in the list is kept.
    """
    synsets = wn.synsets(word,pos=cat)
    if not synsets:
        return None
    # Materialize once: the context is scanned again for every synset, which
    # would silently score 0 after the first pass for a one-shot iterator.
    context = list(context)
    best_sense = synsets[0]
    best_score = 0
    for candidate in synsets:
        gloss_words = reduce_sense(candidate)
        score = sum([IDF[x] for x in context if x in gloss_words])
        if score > best_score:
            best_sense = candidate
            # Track the running maximum; without this the last synset with
            # any positive overlap would win regardless of its score.
            best_score = score
    if to_string:
        return best_sense.name
    return best_sense

def perl_get_semantic_relatedness(word,context,debug=False,style='A'):
    """Score ``word`` against ``context`` by running an external Perl script.

    The script is invoked as ``perl SEM_PRED <word> <ctx1,ctx2,...>`` and
    its standard output is parsed as comma-separated numbers.

    NOTE(review): ``SEM_PRED`` is not defined in the visible part of this
    module -- presumably the path to the Perl script; confirm it is
    defined before this function is called.

    Args:
        word: passed to the script as its first argument.
        context: iterable of strings, joined with commas into the second
            argument.
        debug: when true, print the script's stdout and stderr.
        style: with ``'A'`` return the mean of the parsed values, or
            ``0.0`` when their sum is not positive; any other value
            returns the plain sum.

    Returns:
        A float; ``0.0`` when the script prints nothing but whitespace.

    Raises:
        ValueError: if a comma-separated field is not a number.
    """
    com = ["perl",SEM_PRED,word,','.join(context)]
    # universal_newlines=True makes the pipes return text, not bytes, so the
    # string handling below works the same on Python 2 and Python 3.
    p = subprocess.Popen(com,stdout=subprocess.PIPE,stderr=subprocess.PIPE,
                         stdin=subprocess.PIPE,universal_newlines=True)
    stdout, stderr = p.communicate()
    if debug:
        print(stdout)
        print(stderr)
    # Strip first so output consisting only of a newline counts as empty
    # instead of failing in float().
    output = stdout.strip()
    if not output:
        return 0.0
    values = [float(field) for field in output.split(",")]
    total = sum(values)
    if style == 'A':
        if total > 0:
            return total / float(len(values))
        return 0.0
    return total

if __name__ == '__main__':
    # Script mode: recompute the IDF table and cache it as an importable
    # module named idf_scores.py in the same directory as this file.
    import os
    import pprint
    scores = get_idf_scores()
    print(len(scores))
    out_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),'idf_scores.py')
    with open(out_path,'w') as out_file:
        # The module-level import does ``from idf_scores import scores``, so
        # the dict literal must be bound to that name; a bare literal would
        # make the import fail and the cache would never be used.
        out_file.write('scores = ')
        pprint.pprint(dict(scores),out_file,indent=4,width=80)
