#!/usr/bin/env python
'''
    BESST - Scaffolder for genomic assemblies 
    Copyright (C) 2013  Kristoffer Sahlin

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

'''

import sys
import os
from time import time
from optparse import OptionParser
import optparse
#import gc
#gc.disable()



def ReadInContigseqs(contigfile):
    """Parse FASTA-formatted lines into a dict mapping accession -> sequence.

    Arguments:
    contigfile -- any iterable of text lines (e.g. an open file handle).

    The accession of a record is the first whitespace-delimited token that
    follows '>' on the header line. All following lines, up to the next
    header, are stripped and concatenated to form the sequence.

    Returns a dict {accession: sequence}. An input without any header line
    gives an empty dict. If an accession occurs more than once, the last
    record wins.
    """
    cont_dict = {}
    accession = None    # None until the first header line has been seen
    seq_parts = []      # pieces of the current sequence, joined once per record
    for line in contigfile:
        if line.startswith('>'):
            if accession is not None:
                cont_dict[accession] = ''.join(seq_parts)
            accession = line[1:].strip().split()[0]
            seq_parts = []
        elif accession is not None:
            # Lines before the first header belong to no record and are skipped.
            seq_parts.append(line.strip())
    if accession is not None:
        cont_dict[accession] = ''.join(seq_parts)
    return cont_dict

def main(options):
    import pysam

    from BESST import Parameter
    from BESST import CreateGraph as CG
    from BESST import MakeScaffolds as MS
    from BESST import GenerateOutput as GO
    from BESST import libmetrics
    from BESST.diploid import get_haplotype_regions

    Parameter.MemoryUsage()
    tot_start = time()
    Contigs = {} # contig dict that stores contig objects 
    Scaffolds = {}     # scaffold dict with contig objects for easy fetching of all contigs in a scaffold
    small_contigs = {}
    small_scaffolds = {}

    param = Parameter.parameter() # object containing all parameters (user specified, defaulted and comuted along the way.)
    param.scaffold_indexer = 1 # global indicator for scaffolds, used to index scaffolds when they are created
    param.rel_weight = options.relweight
    param.multiprocess = options.multiprocess
    param.development = options.development
    options.output = options.output + '/BESST_output'
    os.popen('mkdir -p ' + options.output)
    param.information_file = open(options.output + '/Statistics.txt', 'w')
    Information = param.information_file
    open(options.output + '/haplotypes.fa', 'w')
    #Read in the sequences of the contigs in memory
    C_dict = ReadInContigseqs(open(options.contigfile, 'r'))
    print 'Number of initial contigs:', len(C_dict)
    if options.haplotype:
        get_haplotype_regions.main(options.output, C_dict, options.kmer, options.mmer, 0.8)
        sys.exit()
    #iterate over libraries
    param.first_lib = True
    for i in xrange(len(options.bamfiles)):
        start = time()
        param.bamfile = options.bamfiles[i]
        param.mean_ins_size = options.mean[i] if options.mean != None else None
        param.ins_size_threshold = options.threshold[i] if options.threshold != None else None
        param.edgesupport = options.edgesupport[i] if options.edgesupport != None else 5
        param.read_len = options.readlen[i] if options.readlen != None else None
        param.output_directory = options.output
        param.std_dev_ins_size = options.stddev[i] if options.stddev != None else None
        param.contig_threshold = options.minsize[i] if options.minsize != None else None
        param.cov_cutoff = options.covcutoff[i] if options.covcutoff != None else None
        param.hapl_ratio = options.haplratio
        param.hapl_threshold = options.haplthreshold
        param.detect_haplotype = options.haplotype
        param.detect_duplicate = options.duplicate
        param.fosmidpool = options.fosmidpool
        param.extend_paths = options.extendpaths
        print >> Information, '\nPASS ' + str(i + 1) + '\n\n'
        print 'Creating contig graph with library: ', param.bamfile

        #get library statistics
        with pysam.Samfile(param.bamfile, 'rb') as bam_file:
            libmetrics.get_metrics(bam_file, param, Information)

            #create graph
            (G, G_prime) = CG.PE(Contigs, Scaffolds, Information, param.output_directory, C_dict, param, small_contigs, small_scaffolds, bam_file)      #Create graph, single out too short contigs/scaffolds and store them in F

        param.first_lib = False   #not the first lib any more
        if G == None:
            print '0 contigs/super-contigs passed the length criteria of this step. Exiting and printing results.. '
            break
        elapsed = time() - start
        print >> Information, 'Time elapsed for creating graph, iteration ' + str(i) + ': ' + str(elapsed) + '\n'
        start = time()

        Parameter.MemoryUsage()
        print 'Constructed contig graph. Start BESST algorithm for creating scaffolds. '
        MS.Algorithm(G, G_prime, Contigs, small_contigs, Scaffolds, small_scaffolds, Information, param)   # Make scaffolds, store the complex areas (consisting of contig/scaffold) in F, store the created scaffolds in Scaffolds, update Contigs
        elapsed = time() - start

        Parameter.MemoryUsage()

        print >> Information, 'Time elapsed for making scaffolds, iteration ' + str(i) + ': ' + str(elapsed) + '\n'
        print 'Writing out scaffolding results for step', i + 1, ' ...'

        F = [] #list of (ordered) lists of tuples containing (contig_name, direction, position, length, sequence). The tuple is a contig within a scaffold and the list of tuples is the scaffold.
        for scaffold_ in small_scaffolds:
            S_obj = small_scaffolds[scaffold_]
            list_of_contigs = S_obj.contigs   #list of contig objects contained in scaffold object
            F = GO.WriteToF(F, small_contigs, list_of_contigs)
        for scaffold_ in Scaffolds.keys(): #iterate over keys in hash, so that we can remove keys while iterating over it
            ###  Go to function and print to F
            ### Remove Scaf_obj from Scaffolds and Contig_obj from contigs
            S_obj = Scaffolds[scaffold_]
            list_of_contigs = S_obj.contigs   # List of contig objects contained in scaffold object
            F = GO.WriteToF(F, Contigs, list_of_contigs)


        GO.PrintOutput(F, Information, param.output_directory, param, i + 1)
        Parameter.MemoryUsage()

    ### Calculate stats for last scaffolding step    
    scaf_lengths = [Scaffolds[scaffold_].s_length for scaffold_ in Scaffolds.keys()]
    sorted_lengths = sorted(scaf_lengths, reverse=True)
    scaf_lengths_small = [small_scaffolds[scaffold_].s_length for scaffold_ in small_scaffolds.keys()]
    sorted_lengths_small = sorted(scaf_lengths_small, reverse=True)
    NG50, LG50 = CG.CalculateStats(sorted_lengths, sorted_lengths_small, param, Information)
    param.current_LG50 = LG50
    param.current_NG50 = NG50

    elapsed = time() - tot_start
    print >> Information, 'Total time for scaffolding: ' + str(elapsed) + '\n'
    print 'Finished\n\n '
    Parameter.MemoryUsage()



def vararg_callback(option, opt_str, value, parser):
    """optparse callback that lets one option take a variable number of values.

    Consumes the arguments following the option until something that looks
    like another option is found: any '--xyz' token, or a '-x' token that
    cannot be read as a number (so negative numbers such as -3 or -3.0 are
    still consumed as values). Every consumed token that int() accepts is
    stored as an int, all other tokens are stored unchanged as strings.

    The resulting list (possibly empty) is stored on parser.values under
    option.dest and the consumed tokens are removed from parser.rargs.
    """
    assert value is None

    def looks_numeric(token):
        try:
            float(token)
        except ValueError:
            return False
        return True

    def starts_new_option(token):
        # long options: --foo
        if token[:2] == "--" and len(token) > 2:
            return True
        # short options: -a, but not -3 or -3.0
        return token[:1] == "-" and len(token) > 1 and not looks_numeric(token)

    collected = []
    for token in parser.rargs:
        if starts_new_option(token):
            break
        try:
            converted = int(token)
        except ValueError:
            converted = token
        collected.append(converted)

    # Remove what was consumed so that optparse continues after these values.
    del parser.rargs[:len(collected)]
    setattr(parser.values, option.dest, collected)





# Command line interface. Options registered with vararg_callback accept one
# value per BAM library and end up as lists on the parsed options object;
# an option that is not given stays None.
parser = OptionParser()
libstats = optparse.OptionGroup(parser, 'Options for library statistics such as read length, insert size mean and std_dev, coverage etc.')
haplotypes = optparse.OptionGroup(parser, 'Options involving haplotype detection.')
required = optparse.OptionGroup(parser, 'Required input.')
parameters = optparse.OptionGroup(parser, 'Parameters for algorithm.')
other = optparse.OptionGroup(parser, 'Various other options.')


## required
required.add_option("-c", dest="contigfile",
                  help="Fasta file containing contigs.", type="string")

required.add_option("-f", "--bamfile", dest="bamfiles", action="callback", callback=vararg_callback,
                  help="Path(s) to bamfile(s).")

## optional
other.add_option("-o", "--output", dest="output", default='.',
                  help="Path to output directory. BESST will create a folder named 'BESST_output' in the directory given by the path.", type="string")

libstats.add_option("-r", dest="readlen", action="callback", callback=vararg_callback,
                  help="Mean read length of libraries. ")

libstats.add_option("-m", dest="mean", action="callback", callback=vararg_callback,
                  help="Mean insert size of libraries.")

libstats.add_option("-s", "--stddev", dest="stddev", action="callback", callback=vararg_callback,
                  help="Estimated standard deviation of libraries.")

parameters.add_option("-w", dest="relweight", default=3,
                  help="Threshold value for the relative weight of an edge (deprecated)", type="int")

parameters.add_option("-T", dest="threshold", action="callback", callback=vararg_callback,
                  help="Threshold value filter out reads that are mapped too far apart given insert size. ")


parameters.add_option("-e", dest="edgesupport", action="callback", callback=vararg_callback,
                  help="Threshold value for the least nr of links that is needed to create an edge. Default for all libs: 5 ")

# Multi-line help texts are built from adjacent string literals. A backslash
# continuation inside a literal would make the indentation of the next source
# line part of the text and show up as long gaps in the --help output.
parameters.add_option("-k", "--minsize", dest="minsize", action="callback", callback=vararg_callback,
                  help="Contig size threshold for the library (all contigs below this size is discarded from the "
                       "'large contigs' scaffolding, but included in pathfinding). Default: Set to same as -T parameter")

libstats.add_option("-z", "--covcutoff", dest="covcutoff", action="callback", callback=vararg_callback,
                  help="User specified coverage cutoff. (Manually filter out contigs with coverage over this value)")

haplotypes.add_option("-a", "--haplratio", dest="haplratio", default=1.3,
                  help="Maximum length ratio for merging of haplotypic regions.", type="float")

haplotypes.add_option("-b", "--haplthreshold", dest="haplthreshold", default=3,
                  help="Number of standard deviations over mean/2 of coverage to allow for clasification of haplotype. "
                       "Example: contigs with under mean/2 + 3sigma are indicated as potential haplotypes (tested later) "
                       "if -b 3", type="int")

haplotypes.add_option("-g", "--haplotype", dest="haplotype", default=0,
                  help="Haplotype detection function, default = off", type="int")

# The default is 1, which is "on" under the 0/1 convention used by -g and -y.
other.add_option("-d", "--duplicate", dest="duplicate", default=1,
                  help="Sequencing duplicates detection, default = on", type="int")

other.add_option("-K", "--kmer", dest="kmer", default=50,
                  help="k-mer size used in de brujin graph for obtaining contigs in assembly, default 50", type="int")

other.add_option("-M", "--mmer", dest="mmer", default=40,
                  help="m-mer usted for creating connection graph. Should be set lower than k-mer size ", type="int")


other.add_option("-y", "--extendpaths", dest="extendpaths", default=1,
                  help="Enhance the N50 and L50 stats on behalf of increasing nr scaffolding errors. "
                       "This function should be pretty conservative so it's recommended to have the default value, "
                       "default = on. To turn of, specify -y 0.", type="int")

other.add_option("-q", action="store_true", dest="multiprocess",
                  help="Parallellize work load of path finder module in case of multiple processors available.")

other.add_option("-D", "--devel", dest="development",
                  help="Run in development mode (bug checking and memory usage etc.)", type="int", default=0)
#TEMPORARY parameters here, remove after spruce assembly
other.add_option("-t", dest="transcriptalignfile",
                  help="file of contigs", type="string")

other.add_option("-p", dest="fosmidpool", default=None,
                  help="Specify that data comes from a fosmid pool. This parameter sets the number of "
                       "links that the second strongest link must have in order to not do any scaffolding "
                       "with the region.", type="int")

parser.add_option_group(required)
parser.add_option_group(libstats)
parser.add_option_group(parameters)
parser.add_option_group(haplotypes)
parser.add_option_group(other)

if len(sys.argv) == 1:
    sys.argv.append('-h')
(options, args) = parser.parse_args()


if not all([x == None or len(x) == len(options.bamfiles) for x in [options.stddev , options.mean , options.readlen, options.edgesupport, options.covcutoff, options.threshold, options.minsize]]):
    parser.error("Same number of arguments are required")

if (options.mean and not options.stddev) or (not options.mean and options.stddev):
    parser.error("either both or none of -m and -s is required")
if (options.threshold and not options.minsize) or (not options.threshold and options.minsize):
    parser.error("either both or none of -T and -k is required")
if not options.contigfile:
    parser.error("parameter -c (a fasta contig file) is required")
if not options.bamfiles:
    parser.error("parameter -f (BAM files) is required")

#check that bam files exists
for file_ in options.bamfiles:
    try:
        open(file_)
    except IOError as e:
        sys.exit("couldn't find BAM file: " + file_ + " check that the path is correct and that the file exists")
    try:
        open(file_ + '.bai')
    except IOError as e:
        print "couldn't find index file: ", file_ + '.bai', " check that the path is correct and that the file exists"
        sys.exit(0)
#check that contig files exists
try:
    open(options.contigfile)
except IOError as e:
    sys.exit("couldn't open contig file " + options.contigfile + " check that the path is correct and that the file exists")

if options.development:
    import guppy

main(options)

if options.development:
    h = guppy.hpy()
    print h.heap()

