#!/usr/bin/env python
print '''
    BESST - Scaffolder for genomic assemblies 
    Copyright (C) 2013  Kristoffer Sahlin

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

'''

import sys
import os
from time import time
from optparse import OptionParser

import pysam

from BESST import Parameter
from BESST import CreateGraph as CG
from BESST import MakeScaffolds as MS
from BESST import GenerateOutput as GO
from BESST import libmetrics

def ReadInContigseqs(contigfile):
    """Read FASTA records from an iterable of lines into a dictionary.

    The accession of a record is the first whitespace-delimited token that
    follows '>' on its header line.  All following lines, up to the next
    header, are stripped of surrounding whitespace and concatenated to form
    the sequence.

    Args:
        contigfile: an open file (or any iterable of strings) in FASTA format.

    Returns:
        dict mapping accession -> sequence string.  An input without any
        header line yields an empty dict.  If an accession occurs more than
        once, the last record wins.

    Raises:
        IndexError: if a header line contains no accession after '>'.
    """
    cont_dict = {}
    accession = None   # None until the first header has been seen
    chunks = []        # sequence pieces of the record currently being read

    for line in contigfile:
        if line.startswith('>'):
            # A new header terminates the previous record (if any).
            if accession is not None:
                cont_dict[accession] = ''.join(chunks)
            accession = line[1:].strip().split()[0]
            chunks = []
        elif accession is not None:
            chunks.append(line.strip())
        # Lines before the first header belong to no record and are ignored,
        # so they can not leak into the first contig's sequence.

    # Store the last record; nothing is stored for header-less input.
    if accession is not None:
        cont_dict[accession] = ''.join(chunks)
    return cont_dict

def main(options):
    Parameter.MemoryUsage()
    tot_start = time()
    Contigs = {} # contig dict that stores contig objects 
    Scaffolds = {}     # scaffold dict with contig objects for easy fetching of all contigs in a scaffold
    small_contigs = {}
    small_scaffolds = {}

    param = Parameter.parameter() # object containing all parameters (user specified, defaulted and comuted along the way.)
    param.scaffold_indexer = 1 # global indicator for scaffolds, used to index scaffolds when they are created
    param.rel_weight = options.relweight
    param.multiprocess = options.multiprocess
    param.development = options.development
    options.output = options.output + '/BESST_output'
    os.popen('mkdir -p ' + options.output)
    param.information_file = open(options.output + '/Statistics.txt', 'w')
    Information = param.information_file
    open(options.output + '/haplotypes.fa', 'w')
    #Read in the sequences of the contigs in memory
    C_dict = ReadInContigseqs(open(options.contigfile, 'r'))

    #iterate over libraries
    param.first_lib = True
    for i in xrange(len(options.bamfiles)):
        start = time()
        param.bamfile = options.bamfiles[i]
        param.mean_ins_size = options.mean[i] if options.mean != None else None
        param.ins_size_threshold = options.threshold[i] if options.threshold != None else None
        param.edgesupport = options.edgesupport[i] if options.edgesupport != None else 5
        param.read_len = options.readlen[i] if options.readlen != None else None
        param.output_directory = options.output
        param.std_dev_ins_size = options.stddev[i] if options.stddev != None else None
        param.contig_threshold = options.minsize[i] if options.minsize != None else None
        param.cov_cutoff = options.covcutoff[i] if options.covcutoff != None else None
        param.hapl_ratio = options.haplratio
        param.hapl_threshold = options.haplthreshold
        param.detect_haplotype = options.haplotype
        param.detect_duplicate = options.duplicate
        param.fosmidpool = options.fosmidpool
        param.extend_paths = options.extendpaths
        print >> Information, '\nPASS ' + str(i + 1) + '\n\n'
        print 'Creating contig graph with library: ', param.bamfile

        #get library statistics
        with pysam.Samfile(param.bamfile, 'rb') as bam_file:
            libmetrics.get_metrics(bam_file, param, Information)

            #create graph
            (G, G_prime) = CG.PE(Contigs, Scaffolds, Information, param.output_directory, C_dict, param, small_contigs, small_scaffolds, bam_file)      #Create graph, single out too short contigs/scaffolds and store them in F

        param.first_lib = False   #not the first lib any more
        if G == None:
            print '0 contigs/super-contigs passed the length criteria of this step. Exiting and printing results.. '
            break
        elapsed = time() - start
        print >> Information, 'Time elapsed for creating graph, iteration ' + str(i) + ': ' + str(elapsed) + '\n'
        start = time()

        Parameter.MemoryUsage()
        print 'Constructed contig graph. Start BESST algorithm for creating scaffolds. ',
        MS.Algorithm(G, G_prime, Contigs, small_contigs, Scaffolds, small_scaffolds, Information, param)   # Make scaffolds, store the complex areas (consisting of contig/scaffold) in F, store the created scaffolds in Scaffolds, update Contigs
        elapsed = time() - start

        Parameter.MemoryUsage()

        print >> Information, 'Time elapsed for making scaffolds, iteration ' + str(i) + ': ' + str(elapsed) + '\n'
        print 'Writing out scaffolding results for step', i + 1, ' ...'

        F = [] #list of (ordered) lists of tuples containing (contig_name, direction, position, length, sequence). The tuple is a contig within a scaffold and the list of tuples is the scaffold.
        for scaffold_ in small_scaffolds:
            S_obj = small_scaffolds[scaffold_]
            list_of_contigs = S_obj.contigs   #list of contig objects contained in scaffold object
            F = GO.WriteToF(F, small_contigs, list_of_contigs)
        for scaffold_ in Scaffolds.keys(): #iterate over keys in hash, so that we can remove keys while iterating over it
            ###  Go to function and print to F
            ### Remove Scaf_obj from Scaffolds and Contig_obj from contigs
            S_obj = Scaffolds[scaffold_]
            list_of_contigs = S_obj.contigs   # List of contig objects contained in scaffold object
            F = GO.WriteToF(F, Contigs, list_of_contigs)


        GO.PrintOutput(F, Information, param.output_directory, param, i + 1)
        Parameter.MemoryUsage()

    ### Calculate stats for last scaffolding step    
    scaf_lengths = [Scaffolds[scaffold_].s_length for scaffold_ in Scaffolds.keys()]
    sorted_lengths = sorted(scaf_lengths, reverse=True)
    scaf_lengths_small = [small_scaffolds[scaffold_].s_length for scaffold_ in small_scaffolds.keys()]
    sorted_lengths_small = sorted(scaf_lengths_small, reverse=True)
    NG50, LG50 = CG.CalculateStats(sorted_lengths, sorted_lengths_small, param, Information)
    param.current_LG50 = LG50
    param.current_NG50 = NG50

    elapsed = time() - tot_start
    print >> Information, 'Total time for scaffolding: ' + str(elapsed) + '\n'
    print 'Finished\n\n '
    Parameter.MemoryUsage()



def vararg_callback(option, opt_str, value, parser):
    """optparse callback that lets one option take a variable number of values.

    Consumes the remaining command line arguments up to (not including) the
    next token that looks like an option and stores them as a list on
    parser.values under option.dest.  Tokens that parse as integers are stored
    as int, everything else is stored unchanged as a string.
    """
    assert value is None

    def is_number(token):
        # True when the token can be read as a float, e.g. '-3' or '-3.0'.
        try:
            float(token)
        except ValueError:
            return False
        return True

    collected = []
    for token in parser.rargs:
        # '--foo' style options end the value list ...
        looks_like_long_option = token.startswith("--") and len(token) > 2
        # ... and so does '-a', but negative numbers such as -3 or -3.0 do not.
        looks_like_short_option = (token.startswith("-") and len(token) > 1
                                   and not is_number(token))
        if looks_like_long_option or looks_like_short_option:
            break
        try:
            converted = int(token)
        except ValueError:
            converted = token
        collected.append(converted)

    # Remove the consumed values so that optparse does not parse them again.
    del parser.rargs[:len(collected)]
    setattr(parser.values, option.dest, collected)





# Command line interface.  Options using vararg_callback accept one value per
# BAM library (e.g. "-f lib1.bam lib2.bam -m 300 3000"); the remaining options
# take a single value that applies to all libraries.
parser = OptionParser()

## required
parser.add_option("-c", dest="contigfile",
                  help="file of contigs", type="string")

parser.add_option("-o", "--output", dest="output", default='.',
                  help="The output location", type="string")

parser.add_option("-f", "--bamfile", dest="bamfiles", action="callback", callback=vararg_callback,
                  help="Name of bamfile")

## optional
parser.add_option("-r", dest="readlen", action="callback", callback=vararg_callback,
                  help="Mean read length ")

parser.add_option("-m", dest="mean", action="callback", callback=vararg_callback,
                  help="mean of insert library")

parser.add_option("-s", "--stddev", dest="stddev", action="callback", callback=vararg_callback,
                  help="estimated standard deviation of libraries")

parser.add_option("-w", dest="relweight", default=3,
                  help="treshold value for the relative weight of an edge", type="int")

parser.add_option("-T", dest="threshold", action="callback", callback=vararg_callback,
                  help="treshold value ")

parser.add_option("-e", dest="edgesupport", action="callback", callback=vararg_callback,
                  help="treshold value for the least nr of links that is needed to create an edge. Default for all libs: 5 ")

parser.add_option("-k", "--minsize", dest="minsize", action="callback", callback=vararg_callback,
                  help="contig size threshold for the library (all contigs below this size is discarded from the scaffolding). Default: Set to same as -T parameter")

parser.add_option("-z", "--covcutoff", dest="covcutoff", action="callback", callback=vararg_callback,
                  help="User specified coverage cutoff")

parser.add_option("-a", "--haplratio", dest="haplratio", default=1.3,
                  help="Maximum length ratio for merging of haplotypic regions", type="float")

parser.add_option("-b", "--haplthreshold", dest="haplthreshold", default=3,
                  help="Nr of standard deviations over mean/2 of coverage to allow for clasification of haplotype", type="int")

parser.add_option("-g", "--haplotype", dest="haplotype", default=0,
                  help="Haplotype detection function, default = off", type="int")

# The default value is 1, i.e. duplicate detection is enabled unless "-d 0"
# is given; the help text states this accordingly.
parser.add_option("-d", "--duplicate", dest="duplicate", default=1,
                  help="Sequencing duplicates detection, default = on", type="int")

parser.add_option("-y", "--extendpaths", dest="extendpaths", default=1,
                  help="Enhance the N50 and L50 stats on behalf of increasing nr scaffolding errors. This function should be pretty conservative so it's recommended to have the default value, default = on. To turn of, specify -y 0.", type="int")

parser.add_option("-q", "--multiprocess", dest="multiprocess", default=0,
                  help="parallellize work load in case of multiple processes available.", type="int")

parser.add_option("-D", "--devel", dest="development",
                  help="run in development mode (bug schecking and memoru usage etc.)", type="int", default=0)

#TEMPORARY parameters here, remove after spruce assembly
parser.add_option("-t", dest="transcriptalignfile",
                  help="file of contigs", type="string")

parser.add_option("-p", dest="fosmidpool", default=None,
                  help="""Specify that data comes from a fosmid pool. This parameter is sets the number of
                  links that the second strongest link must have in order to not do any scaffolding
                  with the region.""", type="int")


# Without any arguments, show the usage text instead of an error.
if len(sys.argv) == 1:
    sys.argv.append('-h')
(options, args) = parser.parse_args()

# Required parameters are checked first: the per-library length check below
# calls len(options.bamfiles), which would raise a TypeError instead of a
# usage error if -f was not given.
if not options.contigfile:
    parser.error("parameter -c (a fasta contig file) is required")
if not options.bamfiles:
    parser.error("parameter -f (BAM files) is required")

# Every per-library option that was given must have one value per BAM file.
if not all(x is None or len(x) == len(options.bamfiles)
           for x in [options.stddev, options.mean, options.readlen, options.edgesupport,
                     options.covcutoff, options.threshold, options.minsize]):
    parser.error("Same number of arguments are required")

# Some options are only meaningful in pairs.
if (options.mean and not options.stddev) or (not options.mean and options.stddev):
    parser.error("either both or none of -m and -s is required")
if (options.threshold and not options.minsize) or (not options.threshold and options.minsize):
    parser.error("either both or none of -T and -k is required")

# Check that every BAM file and its index ('<bamfile>.bai') can be opened.
# The files are only probed here, so each handle is closed again at once.
# All failures exit with an error message and a non-zero exit status.
for file_ in options.bamfiles:
    try:
        with open(file_):
            pass
    except IOError:
        sys.exit("couldn't find BAM file: " + file_ + " check that the path is correct and that the file exists")
    try:
        with open(file_ + '.bai'):
            pass
    except IOError:
        sys.exit("couldn't find index file: " + file_ + '.bai' + " check that the path is correct and that the file exists")

# Check that the contig file can be opened.
try:
    with open(options.contigfile):
        pass
except IOError:
    sys.exit("couldn't open contig file " + options.contigfile + " check that the path is correct and that the file exists")

if options.development:
    import guppy

main(options)

if options.development:
    h = guppy.hpy()
    print h.heap()

