'''
Utilities to generate sequenza-specific files.
'''
from pileup import parse_pileup

def line_worker(line, depth_sum, qlimit=20, qformat='sanger', hom_t=0.85, het_t=0.35):
   '''
   Turn one synchronized pileup record into an abfreq output row.

   `line` is a 6-item sequence: chromosome, position, reference base,
   first pileup string, second pileup string, GC value. Both pileup strings
   are whitespace-split and handed to `parse_pileup`; the zygosity is called
   on the first pileup and the allele frequencies are measured on the second
   one (normal and tumor respectively, judging by the output column names).

   Returns a 14-item list:
      chromosome, position, p1[0], p1[1], p2[1], p2[1]/p1[1] ratio,
      A frequency, B frequency, 'hom' or 'het', GC value, number of counted
      bases in the second pileup, genotype in the first pileup, variant
      bases with frequency (or '.'), variant strand fractions (or '0').
   Returns None when `line` is empty, when the two leading depth fields sum
   to less than `depth_sum`, when the pileups have a different number of
   fields, when either pileup has no counted bases, or when the first pileup
   is neither homozygous (top frequency >= hom_t) nor heterozygous (second
   frequency >= het_t with exactly two alleles above het_t).

   NOTE(review): indices 0, 1, 2:6 and 6 of the parse_pileup result are
   presumably reference base, depth, A/C/G/T counts and per-base strand
   counts; confirm against parse_pileup.
   '''
   if not line:
      return None

   bases = ['A', 'C', 'G', 'T']
   chromosome, position, ref, p1_str, p2_str, gc = line
   p1_fields = p1_str.split()
   p2_fields = p2_str.split()

   deep_enough = int(p1_fields[0]) + int(p2_fields[0]) >= depth_sum
   if not deep_enough or len(p1_fields) != len(p2_fields):
      return None

   normal = parse_pileup(ref, p1_fields, qlimit, qformat)
   normal_total = float(sum(normal[2:6]))
   if not normal_total > 0:
      return None

   normal_freq = [count / normal_total for count in normal[2:6]]
   ranked = sorted(normal_freq, reverse=True)

   if ranked[0] >= hom_t:
      # Homozygous position: report every other base seen in the second pileup.
      hom_idx = normal_freq.index(ranked[0])
      tumor = parse_pileup(ref, p2_fields, qlimit, qformat)
      tumor_total = float(sum(tumor[2:6]))
      if not tumor_total > 0:
         return None
      tumor_freq = [count / tumor_total for count in tumor[2:6]]
      variant_idx = [k for k, freq in enumerate(tumor_freq)
                     if freq > 0 and k != hom_idx]
      if variant_idx:
         variant_bases = ":".join(
            [bases[k] + str(round(tumor_freq[k], 3)) for k in variant_idx])
         variant_strands = ":".join(
            [bases[k] + str(round(tumor[6][k] / float(tumor[2 + k]), 3))
             for k in variant_idx])
      else:
         variant_bases = '.'
         variant_strands = '0'
      return [chromosome, position, normal[0], normal[1], tumor[1],
              round(tumor[1] / float(normal[1]), 3),
              round(tumor_freq[hom_idx], 3), 0, 'hom', gc, int(tumor_total),
              bases[hom_idx], variant_bases, variant_strands]

   if not ranked[1] >= het_t:
      return None

   # Heterozygous position: exactly two bases must pass the het threshold.
   alleles = [base for base, freq in zip(bases, normal_freq) if freq >= het_t]
   if len(alleles) != 2:
      return None

   tumor = parse_pileup(ref, p2_fields, qlimit, qformat)
   tumor_total = float(sum(tumor[2:6]))
   if not tumor_total > 0:
      return None

   first_idx = bases.index(alleles[0])
   second_idx = bases.index(alleles[1])
   first_freq = tumor[2 + first_idx] / tumor_total
   second_freq = tumor[2 + second_idx] / tumor_total
   # The larger frequency is always reported first; the genotype label keeps
   # the A/C/G/T order of the two alleles regardless.
   if first_freq >= second_freq:
      major, minor = first_freq, second_freq
   else:
      major, minor = second_freq, first_freq
   return [chromosome, position, normal[0], normal[1], tumor[1],
           round(tumor[1] / float(normal[1]), 3),
           round(major, 3), round(minor, 3), 'het', gc, int(tumor_total),
           bases[first_idx] + bases[second_idx], ".", "0"]

class abfreReduce:
   '''
   Creates a binned version of an abfreq file, given the size of binning in nucleotide.

   `abf` is an iterator of tab-separated text lines whose first line is a
   header; `w` is the bin width in nucleotides. Construction consumes the
   header and the first data line, so StopIteration propagates from
   `__init__` if fewer than two lines are available.

   Each call to `next()` consumes ONE input line and returns either
      * None, when the line was folded into the current bin, or
      * a dict describing the bin that was just closed:
           'top'    : the bin's first line when its column 7 (index 6) is
                      below 1.0, otherwise a synthetic average row placed at
                      the bin start,
           'middle' : the later lines of the bin whose column 7 is below 1.0,
           'end'    : the synthetic average row placed at the last position
                      of the bin.
        The GC column (index 9) of 'top' and 'middle' rows is overwritten
        with the bin's mean GC value.
   Iterating the object therefore yields None entries that callers must skip.
   '''
   def __init__(self, abf, w):
      self.abf = abf
      self.w   = w
      # 1 while input lines remain, 0 once the final bin has been emitted.
      self._status = 1
      self._header = next(self.abf).strip()
      line    = next(self.abf).strip()
      line_ls = line.split('\t')
      self.__refresh__(line_ls, line)
   # Value that next() never returns, so iter(callable, sentinel) only stops
   # on StopIteration.
   _sentinel        = object()
   def next(self):
      '''
      Consume one input line; return the closed bin dict, or None when the
      line was added to the current bin. Raises StopIteration after the
      last bin has been returned.
      '''
      if self._status == 1:
         try:
            line    = next(self.abf).strip()
            line_ls = line.split('\t')
         except StopIteration:
            # Input exhausted: flush the bin that is still open.
            self._status = 0
            self.__do_dict__()
            return self.line_dict
         if self.w + self._last_edge < int(line_ls[1]) or self._last_chromosome != line_ls[0]:
            # The line lies beyond the bin width or on another chromosome:
            # close the current bin and start a new one from this line.
            self.__do_dict__()
            line_dict = self.line_dict
            self.__refresh__(line_ls, line)
            return line_dict
         else:
            self.__addline__(line_ls, line)
      elif self._status == 0:
         raise StopIteration
   # Python 3 spelling of the iterator method, so the builtin next() works
   # there as it does on Python 2.
   __next__ = next
   def __refresh__(self, line_ls, line):
      '''
      Start a new bin from the split line `line_ls` (`line` is unused).
      '''
      self._last_chromosome = line_ls[0]
      self._last_edge       = int(line_ls[1])
      self._last_position   = self._last_edge
      self._n_depth         = int(line_ls[3])
      self._t_depth         = int(line_ls[4])
      self._ratio           = float(line_ls[5])
      self._gc              = float(line_ls[9])
      self._n               = 1
      self.line_dict = {'top': '', 'middle': [], 'end': ''}
      if float(line_ls[6]) < 1.0:
         self.line_dict['top'] = line_ls
   def __addline__(self, line_ls, line):
      '''
      Accumulate the split line `line_ls` into the open bin (`line` is unused).
      '''
      self._last_position    = int(line_ls[1])
      self._n_depth         += int(line_ls[3])
      self._t_depth         += int(line_ls[4])
      self._ratio           += float(line_ls[5])
      self._gc              += float(line_ls[9])
      self._n               += 1
      if float(line_ls[6]) < 1.0:
         self.line_dict['middle'].append(line_ls)
   def __do_dict__(self):
      '''
      Finalize `self.line_dict` for the open bin from the accumulated sums.
      '''
      gc = str(round(self._gc/self._n, 3))
      # '//' keeps the mean depths integral on every Python version (plain
      # '/' on two ints only floors on Python 2).
      avg_line = [self._last_chromosome, self._last_position, 'N', self._n_depth//self._n,
                  self._t_depth//self._n, round(self._ratio/self._n, 3), 1.0, 0, 'hom',
                  gc , self._n, 'N', '.']
      # Build real lists right away: a lazy map() would observe the
      # avg_line[1] overwrite below and could not be indexed by callers.
      self.line_dict['end'] = [str(field) for field in avg_line]
      if self.line_dict['top'] == '':
         avg_line[1] = self._last_edge
         self.line_dict['top'] = [str(field) for field in avg_line]
      else:
         self.line_dict['top'][9] = gc
      for mid in self.line_dict['middle']:
         mid[9] = gc
   def close(self):
      '''
      Close the underlying line source.
      '''
      self.abf.close()
   def __iter__(self):
      return (iter(self.next, self._sentinel))

def stream_fasta(fa_file, window_size, out_queue):
   """
   Read a FASTA file line by line and push its content onto `out_queue`.

   Header lines (starting with '>') are put on the queue verbatim (stripped).
   Sequence lines are buffered; whenever the buffer length is a multiple of
   `window_size`, when a new header is met, and at end of input, the
   upper-cased buffer is passed through `grouper(window_size, ...)` and every
   resulting group is put on the queue. The string 'EOF' is put last.

   NOTE(review): `grouper` presumably yields fixed-size chunks and pads the
   last one; confirm in misc.
   """
   from misc import grouper

   def push_windows(sequence):
      # Queue every group grouper() produces for the upper-cased sequence.
      for window in grouper(window_size, sequence.upper()):
         out_queue.put(window)

   pending = ''
   for raw_line in fa_file:
      record = raw_line.strip()
      if record.startswith('>'):
         # Flush whatever is left of the previous sequence before its
         # successor's header goes out.
         if pending != '':
            push_windows(pending)
            pending = ''
         out_queue.put(record)
         continue
      pending = pending + record
      if len(pending) % window_size == 0:
         push_windows(pending)
         pending = ''
   if pending != '':
      push_windows(pending)
   out_queue.put('EOF')

def seq_map(seq_list):
   """
   Count how many times each symbol occurs in `seq_list`.

   `seq_list` is any iterable of single symbols (a string or a sequence of
   one-character strings). The returned dict always contains the keys
   "A", "C", "G", "N" and "T", zero when absent. Any other symbol found in
   the input (e.g. an IUPAC ambiguity code) is counted under its own key
   instead of raising KeyError.
   """
   counter = {"A": 0, "C": 0, "G": 0, "N": 0, "T": 0}
   for nucleotide in seq_list:
      counter[nucleotide] = counter.get(nucleotide, 0) + 1
   return counter

def process_gc_from_pipe(in_queue, window_size):
   """
   Process the stream of the fasta file and compute the
   GC-content given a fixed window size, and print the
   GC-content information in the WIG format.

   Items are read from `in_queue` until the string 'EOF' arrives:
      * a string starting with '>' prints a "variableStep" header line and
        resets the position counter to 1; any other string is ignored;
      * any other item is treated as a chunk of bases. Falsy entries are
        dropped, and unless at least 75% of the remaining bases are 'N',
        a "<start>\\t<GC percent>" line is printed. The position counter
        always advances by the number of remaining bases.

   The percentage is computed over the actual chunk length, so a shorter
   last chunk is handled like a full one. Nothing is returned; all output
   goes to standard output.
   """
   base_counter  = 1
   window_size   = float(window_size)
   while True:
      seq_list = in_queue.get()
      if seq_list == 'EOF':
         break
      if isinstance(seq_list, str):
         if seq_list.startswith('>'):
            chromosome = seq_list.strip('>')
            # Single parenthesized argument: prints the same text under
            # both the Python 2 print statement and the Python 3 function.
            print("variableStep chrom=" + chromosome + " span=" + str(int(window_size)))
            base_counter = 1
         continue
      # A real list is needed: it is both counted and measured with len().
      seq_list = [base for base in seq_list if base]
      seq_len  = len(seq_list)
      if seq_len == 0:
         # Nothing left after dropping the padding; avoid dividing by zero.
         continue
      stats = seq_map(seq_list)
      # Float denominator so the N fraction and the GC percentage are not
      # truncated by integer division on short chunks.
      chunk_len = float(seq_len)
      if stats['N'] / chunk_len < 0.75:
         gc_percent = 100 * (stats['G'] + stats['C']) / chunk_len
         print(str(int(base_counter)) + "\t" + str(gc_percent))
      base_counter = base_counter + seq_len

