import os

from django.db.models import Sum

from .models import Lemma,WordForm,Orthography,Transcription



def lookupFreq(word):
    """Return the combined frequency of every lemma spelled ``word``.

    Selects the distinct ``Lemma`` rows reachable through
    ``wordform__orthographies__spelling == word`` and adds up their
    ``frequency`` attributes in Python.  The result is ``0`` when no
    lemma matches.
    """
    matching_lemmas = Lemma.objects.filter(
        wordform__orthographies__spelling=word
    ).distinct()
    total_freq = 0
    for lemma in matching_lemmas:
        total_freq += lemma.frequency
    return total_freq

#def lookupStress(word,freqDict):
#    qs = Spelling.objects.filter(Label=word).order_by('-Word__Frequency')
#    return [q.StressPattern for q in qs]

def categorize_words(words):
    """Tag each word with a lowercase category as ``'word#category'``.

    ``'a'`` is always tagged ``'art'`` and ``'i'`` always ``'pron'``.
    Every other word takes ``str(lemma.category).lower()`` from the last
    lemma, in ascending ``frequency`` order, whose ``get_spelling_set()``
    contains the word (i.e. the most frequent match), or ``'NA'`` when no
    lemma matches.  The tag ``'adv'`` is rewritten to ``'r'``.

    Words whose resulting category is empty are left out of the returned
    list; the remaining entries keep the order of ``words``.
    """
    # A single queryset covers all requested spellings; it is only
    # evaluated the first time a non-hard-coded word is processed.
    lemmas = (
        Lemma.objects
        .filter(wordform__orthographies__spelling__in=words)
        .distinct()
        .order_by('frequency')
    )
    tagged = []
    for word in words:
        if word == 'a':
            tagged.append((word, 'art'))
            continue
        if word == 'i':
            tagged.append((word, 'pron'))
            continue
        category = 'NA'
        # Ascending frequency order: later matches overwrite earlier ones,
        # so the most frequent matching lemma decides the category.
        for lemma in lemmas:
            if word in lemma.get_spelling_set():
                category = str(lemma.category).lower()
        if category == 'adv':
            category = 'r'
        tagged.append((word, category))
    return ['#'.join(pair) for pair in tagged if pair[1]]


def lookupCat(word):
    """Return the lowercase category tag for ``word``.

    ``'a'`` and ``'i'`` are tagged directly as ``'art'`` and ``'pron'``
    without touching the database, matching the tags that
    ``categorize_words`` assigns to the same two words.  (Previously both
    were returned as ``'pron'``.)

    For any other word the most frequent distinct ``Lemma`` reachable via
    ``wordform__orthographies__spelling == word`` is used: its
    ``category.label`` is lowercased, with ``'ADV'`` mapped to ``'r'``.
    ``'NA'`` is returned when no lemma matches.
    """
    if word == 'a':
        return u'art'
    if word == 'i':
        return u'pron'
    by_frequency = (
        Lemma.objects
        .filter(wordform__orthographies__spelling=word)
        .distinct()
        .order_by('-frequency')
    )
    # Slice before evaluating so only the top row is fetched instead of
    # loading every matching lemma just to read the first one.
    top = list(by_frequency[:1])
    if not top:
        return 'NA'
    cat = top[0].category.label
    if cat == 'ADV':
        cat = 'R'
    return cat.lower()

def filterNGrams(ngram_path):
    """Copy n-gram files, keeping only lines made of known spellings.

    Every file in ``<ngram_path>/original`` is read line by line and a
    file of the same name is written under ``<ngram_path>/trimmed``.
    Each line is stripped and split on tabs; the first field is split on
    single spaces into tokens.  A line is kept only if every token equals
    ``str()`` of some ``Orthography`` object.  Kept lines are written as
    the first field, a tab, the second field and a newline; any further
    tab-separated fields are not copied.

    NOTE(review): a kept line that has no second tab-separated field
    raises ``IndexError``; the ``trimmed`` directory is not created here.
    """
    # Spelling filters that were disabled in the original, kept for reference:
    #   .exclude(spelling__contains="'")
    #   .exclude(spelling__contains=".")
    #   .exclude(spelling__contains=",")
    #   .exclude(spelling__contains='"')
    known_spellings = set()
    for orthography in Orthography.objects.all():
        known_spellings.add(str(orthography))

    source_dir = os.path.join(ngram_path, 'original')
    target_dir = os.path.join(ngram_path, 'trimmed')
    for name in os.listdir(source_dir):
        source_file = os.path.join(source_dir, name)
        target_file = os.path.join(target_dir, name)
        with open(source_file, 'r') as infile, open(target_file, 'w') as outfile:
            for line in infile:
                fields = line.strip().split("\t")
                tokens = fields[0].split(" ")
                # Drop the whole n-gram as soon as one token is unknown.
                if not all(token in known_spellings for token in tokens):
                    continue
                outfile.write('\t'.join([' '.join(tokens), fields[1]]) + '\n')



