# VERSION
from . import mimodd_base
# version number of this module; the changes made per version are listed below
version = mimodd_base.Version((0, 1, 5))

# improved handling of genome file, formatter file and species-detection
# bug fix version 0,1,4 
# new in 0,1,3:
# improved effect grouping in annotated output
# tab-separated text output as alternative to html
# override link definitions in anno_weblinks with user-specified link_formatter dictionary
# new in 0,1,2:
# annotate now provides basic functionality independent of snpeff

import os
import sys
import subprocess
from collections import namedtuple, OrderedDict
from . import config, tmpfiles, pyvcf, anno_weblinks

def annotate (inputfile, genome = None, species = None, ofile=None, oformat = 'html', snpeff_out = None, grouping = None, link_formatter = None, **snpeff_options):
    """Report the variants of a VCF file with their effects as an html or text table.

    If genome is given, SnpEff is run on inputfile first (through snpeff())
    and the VCF file it writes is reported; otherwise inputfile is reported
    directly.

    Arguments:
    inputfile: path to the VCF input file
    genome: name of the SnpEff genome to annotate against
            (default: None, do not run SnpEff)
    species: species name used to pick the link definitions for html output;
             if missing, but genome is given, the organism registered for
             that genome in the SnpEff config file is used (with underscores
             replaced by spaces)
    ofile: path of the output file (default: None, write to sys.stdout)
    oformat: 'html' for an html table (default);
             any other value produces tab-separated text
    snpeff_out: path under which the VCF output of SnpEff is kept
                (default: None, use a temporary file and remove it at the end)
    grouping: selects how the records are iterated:
              None: the expand_samples iterator of the opened VCF
              'by_sample': the by_sample iterator of the opened VCF
              'by_genes': the iterator built by affected_genes()
    link_formatter: mapping of species names to link definitions; overrides
                    the defaults from anno_weblinks. A link definition is
                    used through its 'pos' and 'gene' format strings.
    snpeff_options: passed on to snpeff()

    Raises:
    ValueError for an unsupported grouping, or if no link definitions can be
    found for the species.
    RuntimeError if link_formatter is given, but no species is known."""

    # fail early: an unsupported grouping used to surface as a NameError only
    # after SnpEff had been run and the table header had been written
    if grouping not in (None, 'by_sample', 'by_genes'):
        raise ValueError('Unknown grouping mode: {0!r}'.format(grouping))

    outputfile = snpeff_out if snpeff_out else tmpfiles.unique_tmpfile_name ('snpeff','.vcf')
    # defined up front so that the cleanup code can tell whether it was opened
    ofo = None

    try:
        if genome:
            # call snpeff, then work on resulting vcf
            snpeff (genome, inputfile, outputfile, **snpeff_options)
            vcf_to_annotate = pyvcf.open(outputfile)
        else:
            # work on input file directly
            vcf_to_annotate = pyvcf.open(inputfile)

        if grouping == 'by_sample':
            vcf_iter = vcf_to_annotate.by_sample
        elif grouping == 'by_genes':
            vcf_iter = affected_genes(vcf_to_annotate)
        else:
            vcf_iter = vcf_to_annotate.expand_samples

        if oformat == 'html':
            # html pre-formatting
            header="""\
<html>
<body>
<table border="1">
<tr>
<th>Sample</th>
<th>Chromosome</th>
<th>Position</th>
<th>Change</th>
<th>Affected Transcript</th>
<th>Effect</th>
<th>Genotype</th>
"""

            line_template="""\
<tr>
<td>{sample}</td>
<td>{chromosome}</td>
<td>{position}</td>
<td>{change}</td>
<td>{gene}</td>
<td>{effect}</td>
<td>{genotype}</td>
"""

            footer="""\
</table>
</body>
</html>
"""
        else:
            header = 'Sample\tChromosome\tPosition\tChange\tAffected Transcript\tEffect\tGenotype\n'
            line_template = footer = ''

        if oformat == 'html':
            if genome and not species:
                # if no organism species is specified, try to get it from SnpEff's config file
                snpeff_species = get_organism_from_snpeff_genome(genome)
                # replace underscores with spaces
                species = ' '.join(snpeff_species.split('_'))

            if link_formatter:
                if not species:
                    raise RuntimeError('Need a species name to use with the link formatter file')
                try:
                    link_formatter = link_formatter[species]
                except KeyError:
                    raise ValueError('Species {0} not found in formatter file.'.format(species))
            else:
                # see if that species is in the default dictionary
                species_id = anno_weblinks.species_synonyms.get(species)
                if not species_id:
                    # format the message, not the exception object
                    raise ValueError('Unknown species {0} (not found in default lookup table)'.format(species))
                link_formatter = anno_weblinks.links[species_id]

        ofo = open(ofile, 'w') if ofile else sys.stdout
        ofo.write(header)

        for sample, e in vcf_iter():
            # collect the effects of this record per (gene, transcript),
            # keeping the order in which SnpEff reported them
            per_transcript = OrderedDict()
            for eff in snpeff_effects(e):
                per_transcript.setdefault((eff.gene_name, eff.transcript_id), []).append(eff)

            for (gene, transcript), effects in per_transcript.items():
                change = '{0}->{1}'.format(e.ref, e.alt)
                effect_summary = ' | '.join(effect.func_class or effect.efftype for effect in effects)
                genotype = e.sampleinfo['GT'][sample] if e.sampleinfo else '?'

                if oformat != 'html':
                    # write tab-separated txt
                    # (the newline is appended, not joined, to avoid a trailing tab)
                    ofo.write('\t'.join((sample,
                                         e.chrom,
                                         str(e.pos),
                                         change,
                                         transcript,
                                         effect_summary,
                                         genotype)) + '\n')
                    continue

                if link_formatter:
                    # write html with database links;
                    # the position link spans 500 bases to either side of the variant
                    position = '<a href={0}>{1}</a>'.format(link_formatter['pos'].format(chromosome = e.chrom, start = int(e.pos)-500, stop = int(e.pos)+500), e.pos)
                    gene_cell = '<a href={0}>{1}</a>'.format(link_formatter['gene'].format(gene = gene), transcript)
                else:
                    # write html without links
                    # while this is not particularly useful, users have a right to get what they asked for
                    position = e.pos
                    gene_cell = transcript

                ofo.write(line_template.format(sample = sample,
                                               chromosome = e.chrom,
                                               position = position,
                                               change = change,
                                               gene = gene_cell,
                                               effect = effect_summary,
                                               genotype = genotype))
        ofo.write(footer)
    finally:
        if ofo is not None and ofo is not sys.stdout:
            try:
                ofo.close()
            except OSError:
                # best-effort cleanup; do not mask an exception raised above
                pass
        if not snpeff_out:
            # the temporary file does not exist if SnpEff was not run
            try:
                os.remove(outputfile)
            except OSError:
                pass
    
def affected_genes (vcf):
    """Regroup the records of a VCF by the gene they affect.

    Every record is split into one copy per affected gene; the EFF entry of
    each copy is reduced to the effects on that gene (or removed if there is
    no verbatim effect text). Effects without a gene name are grouped under
    the (chromosome, position) of their record instead.

    Returns a function that, when called, yields (sample, record) tuples
    gene by gene, starting with the gene that has the most records (ties
    are resolved by chromosome and position of a gene's first record).
    Only samples with a genotype of '0/1' or '1/1' are reported."""

    records_per_gene = {}
    for record in vcf:
        effects_per_gene = {}
        for effect in snpeff_effects(record):
            key = effect.gene_name or (record.chrom, record.pos)
            effects_per_gene.setdefault(key, []).append(effect)

        for key, gene_effects in effects_per_gene.items():
            trimmed_record = record.copy()
            eff_value = ','.join(e.verbatim for e in gene_effects if e.verbatim)
            if not eff_value:
                trimmed_record.info.pop('EFF', None)
            else:
                trimmed_record.info['EFF'] = eff_value
            records_per_gene.setdefault(key, []).append(trimmed_record)

    def rank (item):
        # most often affected gene first, then by location of its first record
        records = item[1]
        first = records[0]
        return (-len(records), first.chrom, first.pos)

    gene_list = sorted(records_per_gene.items(), key = rank)

    def records_by_times_affected():
        for _, gene_records in gene_list:
            for gene_record in gene_records:
                genotypes = gene_record.sampleinfo['GT']
                for name in gene_record.samplenames:
                    if genotypes[name] in ('0/1', '1/1'):
                        yield name, gene_record
    return records_by_times_affected

# One effect entry of the EFF tag that SnpEff adds to the INFO field:
# verbatim is the entry as found in the file, efftype the part in front of
# the parentheses, the remaining fields are the '|'-separated details.
SnpEff_Effect = namedtuple('SnpEff_Effect',
                           ['verbatim',
                            'efftype', 'impact', 'func_class',
                            'codon_change', 'aa_change',
                            'aa_len', 'gene_name',
                            'transcript_biotype',
                            'gene_coding', 'transcript_id',
                            'exon', 'genotype_num',
                            'errors', 'warnings'])

def snpeff_effects (vcf_entry):
    """Read out the Eff tag added to the INFO field by SnpEff as a namedtuple.

    Yields one SnpEff_Effect per comma-separated entry of vcf_entry.info['EFF'].
    Entries with only 11 or 12 detail fields get their missing trailing
    fields (errors, warnings) filled with None.
    If vcf_entry has no EFF tag, a single placeholder effect is yielded, in
    which all fields are empty strings except errors and warnings (None).

    Raises:
    ValueError if an entry is not of the form type(details)
    AssertionError if an entry has less than 11 or more than 13 detail fields"""

    if 'EFF' not in vcf_entry.info:
        yield SnpEff_Effect(*['']*13, errors = None, warnings = None)
        return

    for effect in vcf_entry.info['EFF'].split(','):
        effect = effect.strip()
        try:
            eff_type, details = effect.rstrip(')').split('(')
        except ValueError:
            # report the offending entry through the exception instead of
            # printing debug output to stdout, where reports may get written
            raise ValueError('Mal-formatted EFF entry in vcf INFO field: {0}'.format(effect))
        # of each detail field only the part after the last ':' is kept
        eff_details = [d.split(':')[-1] for d in details.split('|')]
        l = len(eff_details)
        assert 11 <= l <= 13, 'Mal-formatted EFF entry in vcf INFO field: {0}'.format(effect)
        # no errors and/or no warnings reported: pad to the full 13 fields
        eff_details.extend([None] * (13 - l))
        yield SnpEff_Effect(effect, eff_type, *eff_details)

def snpeff (genome, inputfile = None, outputfile = None, memory = None, verbose = False, quiet = True, **optionals):
    """Wrapper around SnpEff.

    Provides a function interface for subprocess calls of the form:
    java -Xmx<i>g -jar snpEff.jar [options] genome-version variants-file.
    The call is made with config.snpEff_path as the working directory.

    Arguments:
    inputfile (required): path to the input VCF file
    genome (required): the reference genome version;
    must be the name of an installed SnpEff genome
    (see is_installed_snpeff_genome)
    outputfile: path to outputfile;
    default: None (SnpEff writes to the standard output of this process)
    memory: GB of memory used (-Xmx<i>g, default: config.max_memory)
    verbose: print the SnpEff call before making it if True (default: False)
    quiet: do not print what SnpEff wrote to stderr on success if True
           (default: True)

    Optional keyword arguments:
    stats: write an overview of the results to summary-file
           (-stats <file>, default: None, which turns on -noStats)
    threads: use multiple threads if True (-t, default: off)
    !warning: using multiple threads disables generating a summary file
    chr: prepend 'chr' to chromosome names if True, e.g., 'chr7' instead of '7'
    (-chr chr, default: False)
    minC: filter out sequence changes with coverage lower than min_cov
             (-minC <i>, default: None (off))
    minQ: filter out sequence changes with quality lower than min_qual
              (-minQ <i>, default: None (off))
    no_downstream: do not show downstream changes if True
                  (-no-downstream, default: False)
    no_upstream: do not show upstream changes if True
                 (-no-upstream, default: False)
    no_intron: do not show intron changes if True
               (-no-intron, default: False)
    no_intergenic: do not show intergenic changes if True
                   (-no-intergenic, default: False)
    no_utr: do not show UTR changes if True
            (-no-utr, default: False)
    ud: set upstream downstream interval length
        (-ud, default: None)
    v: show messages and errors if True (-v, default: False)
    settings currently not modifiable through the interface:
    -o vcf: output format is vcf; on by default.
    -noLog: always passed.

    Raises:
    AssertionError if genome is not an installed SnpEff genome
    RuntimeError if SnpEff could not be started or exits with an error"""

    assert is_installed_snpeff_genome(genome), '{0} is not the name of an installed SnpEff genome.'.format(genome)
    #adjust the memory parameter
    memory = '-Xmx{0}g'.format(memory or config.max_memory)
    if not optionals.get('stats'):
        optionals['noStats'] = True

    option_table = {'stats':'-stats',
                    'minC':'-minC',
                    'minQ':'-minQ',
                    'ud':'-ud'}

    bool_table = {'chr':'-chr chr',
                  'threads':'-t',
                  'no_downstream':'-no-downstream',
                  'no_upstream':'-no-upstream',
                  'no_intron':'-no-intron',
                  'no_intergenic':'-no-intergenic',
                  'no_utr':'-no-utr',
                  'v':'-v',
                  'noStats':'-noStats'}

    # build a SnpEff call of the form
    # java -Xmx<i>g -jar snpEff.jar [options] genome-version variants-file
    # as an argument list, so that paths containing spaces or shell
    # metacharacters are passed on unchanged

    switches = []
    for switch, switch_trans in bool_table.items():
        if optionals.get(switch):
            # a translation may consist of a switch plus its value ('-chr chr')
            switches.extend(switch_trans.split())

    options = []
    for option, option_trans in option_table.items():
        value = optionals.get(option)
        # None means "not set"; an empty stats value is covered by -noStats
        if value is None or (option == 'stats' and not value):
            continue
        options.extend((option_trans, str(value)))

    snpeff_call = (['java', memory, '-jar', 'snpEff.jar']
                   + switches + ['-noLog'] + options
                   + ['-o', 'vcf', genome, os.path.abspath(inputfile)])

    if outputfile:
        outputfile = os.path.abspath(outputfile)

    if verbose:
        print ('Calling SnpEff with')
        if outputfile:
            print (' '.join(snpeff_call + ['>', outputfile]))
        else:
            print (' '.join(snpeff_call))
        print ('-' * 20)

    # now make the actual call to SnpEff
    # we have to change the working directory temporarily for this to work
    # and have to make sure we change it back afterwards
    cwd = os.getcwd()
    try:
        os.chdir(config.snpEff_path)
        try:
            if outputfile:
                # redirect SnpEff's stdout to the output file
                with open(outputfile, 'wb') as redirected:
                    proc = subprocess.Popen(snpeff_call, stdout = redirected, stderr = subprocess.PIPE)
                    errors = proc.communicate()[1].decode(errors = 'replace')
            else:
                # output is going to stdout, don't PIPE it !!
                proc = subprocess.Popen(snpeff_call, stderr = subprocess.PIPE)
                errors = proc.communicate()[1].decode(errors = 'replace')
        except OSError as error:
            # e.g. java not found or output file not writable;
            # reported the same way as a failing SnpEff run
            raise RuntimeError ('snpEff failed with the following error:\n{0}'.format(error))
    finally:
        os.chdir(cwd)

    if proc.returncode:
        # keep only what SnpEff wrote before its version/usage information
        err = []
        for line in errors.splitlines(True):
            if line.startswith('snpEff version'):
                break
            err.append(line)
        raise RuntimeError ('snpEff failed with the following error:\n{0}'.format(''.join(err)))
    if errors and not quiet:
        # redirect warnings to stdout
        print ('Stderr output from snpEff:')
        print (errors)


def get_snpeff_config_file ():
    """Return the path of the snpEff.config file in config.snpEff_path.

    A leading '~' in config.snpEff_path is expanded to the user's home
    directory. Fails with an AssertionError if there is no such file."""

    snpeff_dir = os.path.expanduser(config.snpEff_path)
    path = os.path.join(snpeff_dir, 'snpEff.config')
    assert os.path.isfile(path), 'Could not get snpEff config file. Check the snpEff_path variable in the MiModD configuration settings.'
    return path

def get_snpeff_data_dir (config_file = None):
    """Return the data directory specified in a snpEff config file.

    The directory is taken from the first line of config_file that starts
    with 'data_dir' (the text after the last '=') and is interpreted
    relative to config.snpEff_path.
    config_file defaults to the file found by get_snpeff_config_file().

    Fails with an AssertionError if the resulting path is not a directory,
    and with a RuntimeError if the file has no data_dir line."""

    config_file = config_file or get_snpeff_config_file()
    with open(config_file) as settings:
        for entry in settings:
            if not entry.startswith('data_dir'):
                continue
            configured = entry.split('=')[-1].strip()
            data_dir = os.path.normpath(
                os.path.join(os.path.expanduser(config.snpEff_path),
                             os.path.expanduser(configured)))
            assert os.path.isdir(data_dir), 'Could not find snpEff data directory at location {0} specified in snpEff config file.'.format(data_dir)
            return data_dir
    raise RuntimeError ('snpEff config file at {0} does not specify a data directory.'.format(config_file))

def get_snpeff_config_genomes (config_file = None):
    """Yield (genome, organism) tuples for the genomes of a snpEff config file.

    A genome is recognized from a line of the form '<genome>.genome : <organism>'.
    Lines starting with '#' and lines that do not consist of exactly two
    ':'-separated fields are skipped.
    config_file defaults to the file found by get_snpeff_config_file()."""

    suffix = '.genome'
    path = config_file or get_snpeff_config_file()
    with open(path) as settings:
        for entry in settings:
            if entry.startswith('#'):
                continue
            parts = [part.strip() for part in entry.split(':')]
            if len(parts) != 2:
                continue
            key, organism = parts
            if key.endswith(suffix):
                yield key[:-len(suffix)], organism

def is_installed_snpeff_genome (query_genome, config = None):
    """Tell whether query_genome is a registered and installed SnpEff genome.

    The genome must be listed in the snpEff config file (config, or the
    default config file if None) and must have a snpEffectPredictor.bin
    file in its subfolder of the SnpEff data directory."""

    registered = any(genome == query_genome
                     for genome, _ in get_snpeff_config_genomes(config))
    if not registered:
        return False
    predictor_file = os.path.join(get_snpeff_data_dir (config), query_genome, 'snpEffectPredictor.bin')
    return os.path.isfile(predictor_file)

def get_organism_from_snpeff_genome (query_genome, config = None):
    """Return the organism registered for query_genome in the snpEff config file.

    config is the path of the config file (None for the default one).
    Raises a KeyError if the genome is not listed in the file."""

    organisms = (organism
                 for genome, organism in get_snpeff_config_genomes(config)
                 if genome == query_genome)
    # only the first matching entry counts
    match = next(organisms, None)
    if match is None:
        raise KeyError('Genome file {0} not found among the registered SnpEff genomes'.format(query_genome))
    return match
        
def get_installed_snpeff_genomes (output = None, config = None):
    """Write a list of the installed SnpEff genomes.

    For every genome listed in the snpEff config file (config, or the
    default config file if None) that has a snpEffectPredictor.bin file in
    its subfolder of the SnpEff data directory, one line of the form
    '<organism>: <genome>\\t<genome>' is written.

    output: path of the file to write to (default: None, use sys.stdout)"""

    out = open(output, 'w') if output else sys.stdout
    try:
        snpeff_data_dir = get_snpeff_data_dir (config)
        for genome, organism in get_snpeff_config_genomes(config):
            if os.path.isfile(os.path.join(snpeff_data_dir, genome, 'snpEffectPredictor.bin')):
                print ('{0}: {1}\t{1}'.format(organism, genome), file = out)
    finally:
        # close the output file even if reading the SnpEff configuration fails
        if out is not sys.stdout:
            out.close()
