#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2013 by Björn Johansson.  All rights reserved.
# This code is part of the Python-dna distribution and governed by its
# license.  Please see the LICENSE.txt file that should have been included
# as part of this package.

import re
import itertools
import datetime
import copy
import os
import textwrap
import StringIO
import warnings

from Bio                    import Alphabet
from Bio                    import SeqIO
from Bio.Alphabet.IUPAC     import IUPACAmbiguousDNA
from Bio.Seq                import Seq
from Bio.Seq                import reverse_complement as rc
from Bio.SeqRecord          import SeqRecord
from Bio.SeqUtils.CheckSum  import seguid

from Bio.SeqFeature         import SeqFeature
from Bio.SeqFeature         import FeatureLocation

from utils                  import eq
from guess_alphabet         import guess_alphabet
from find_sub_strings       import common_sub_strings

class dseq(Seq):

    def __init__(self,
                  watson,
                  crick,
                  ovhg          = None,
                  linear        = None,
                  circular      = None,
                  alphabet      = IUPACAmbiguousDNA):
        '''
        Build a double stranded DNA molecule from its two strands.

        watson and crick are both written 5'->3'. ovhg is the stagger at
        the left end of the figure: a positive value shifts the watson
        strand that many positions to the right, a negative value shifts
        the (reverse complemented) crick strand instead. If ovhg is None
        it is derived from the first hit returned by common_sub_strings
        for the watson strand and the reverse complement of crick.

        At most one of linear / circular may be given; if neither is
        given the molecule is linear.

        Raises TypeError if both linear and circular are given, or if the
        molecule is circular while neither end is blunt.

        >>> import pydna
        >>> a = pydna.dseq(watson="agcta",crick="gct",ovhg=0)
        >>> a
        dseq(-5)
        agcta
        tcg


        >>> b = pydna.dseq(watson="agt",crick="actta",ovhg=2)
        >>> b
        dseq(-5)
          agt
        attca



        >>> c = pydna.dseq(watson="gatcct",crick="ag",ovhg=-4)
        >>> c
        dseq(-6)
        gatcct
            ga


        '''

        self.watson = watson
        self.crick  = crick
        self.ovhg   = ovhg

        if ovhg is None:
            # Anneal the strands: take the first common substring reported
            # and use the offset between its two start positions.
            start_w, start_c, _length = common_sub_strings(
                                            str(watson).lower(),
                                            str(rc(crick).lower()),
                                            min((len(watson),len(crick)))).pop(0)
            self.ovhg = start_c - start_w

        # Both strands laid out left to right in a common frame.
        sns = ((self.ovhg*" ") + str(self.watson))
        asn = ((-self.ovhg*" ") + str(rc(self.crick)))
        pairs = list(itertools.izip_longest(sns, asn, fillvalue=" "))

        # data: one letter per position, taken from watson where present,
        # otherwise from the reverse complemented crick strand.
        data = "".join([a.strip() or b.strip() for a, b in pairs])
        # dsdata: only the positions where both strands carry the same letter.
        self.dsdata = "".join([a for a, b in pairs if a.lower()==b.lower()])

        if circular is None and linear in (True, False,):
            self._linear   = linear
            self._circular = not linear
        elif linear is None and circular in (True, False,):
            self._circular = circular
            self._linear   = not circular
        elif circular is None and linear is None:
            self._circular = False
            self._linear   = True
        else:
            # The original "raise()" produced a message-less TypeError;
            # the type is kept so existing handlers still match.
            raise TypeError("specify either linear or circular, not both")

        if (self.circular and
            self.FivePrimeStickyEnd()[0] != "blunt" and
            self.ThreePrimeStickyEnd()[0] != "blunt"):
            raise TypeError("DNA is circular, but has staggered ends!")

        Seq.__init__(self, data, alphabet)

    def __getitem__(self, slc):
        '''
        Return the part of the molecule selected by slc as a new dseq.

        The returned object is always created with linear=True.
        '''

        # Lay both strands out left to right in a common frame (the space
        # padding stands for the positions missing at a staggered left end)
        # and apply the same slice to both.
        sns = (self.ovhg*" " + self.watson)[slc]
        asn = (-self.ovhg*" " + self.crick[::-1])[slc]

        # Leading spaces remaining on the sliced watson strand count as a
        # positive stagger, leading spaces on the sliced crick strand as a
        # negative one; the value with the larger magnitude is used.
        ovhg= max((len(sns) - len(sns.lstrip()),
                   -len(asn) + len(asn.lstrip())),
                   key=abs)

        return dseq(sns.strip(),asn[::-1].strip(), ovhg=ovhg, linear=True)

    def __eq__( self, other ):
        '''
        Two molecules are equal when both strands match case-insensitively
        and the left stagger (ovhg) is the same. Objects lacking any of
        these attributes compare unequal.
        '''
        try:
            return (other.watson.lower() == self.watson.lower() and
                    other.crick.lower()  == self.crick.lower()  and
                    other.ovhg == self.ovhg)
        except AttributeError:
            return False


    def fig(self):
        '''Return the two-strand text figure produced by __repr__.'''
        return repr(self)

    def __repr__(self):
        """Returns a truncated representation of the sequence for debugging.

        The first line is "<class name>(<- or o><length>)", where "-" marks
        a linear and "o" a circular molecule. It is followed by the watson
        strand and, below it, the crick strand written in reverse so that
        the two lines align. Molecules longer than 40 positions are shown
        as a left part and a right part joined by "...".
        """
        if len(self) > 40:
            #  (a)...(b)
            #  (a)...(b)
            a = 20
            b = 20
            # A positive left stagger is drawn as leading spaces on the
            # watson line, so fewer watson letters are shown on the left.
            if self.ovhg > 0:
                a = a - self.ovhg
            # Stagger at the right end, same sign convention as ovhg.
            ovhg3 = len(self.watson) - len(self.crick)+self.ovhg
            return "{}({}{})\n{}...{}\n{}...{}".format(self.__class__.__name__,
                                                               {True:"-", False:"o"}[self.linear],
                                                               len(self),
                                                               (self.ovhg*" ")+str(self.watson)[:a],
                                                               str(self.watson)[-b:],
                                                               (-self.ovhg*" ")+str(self.crick)[::-1][:a-(-self.ovhg)],
                                                               str(self.crick)[:b-ovhg3][::-1])
        else:
            return "{}({}{})\n{}\n{}".format(self.__class__.__name__,
                                                {True:"-", False:"o"}[self.linear],
                                                len(self),
                                                self.ovhg*" " + self.watson,
                                               -self.ovhg*" "+ self.crick[::-1])

    def rc(self):
        '''Short alias for reverse_complement().'''
        return self.reverse_complement()

    def reverse_complement(self):
        '''
        Return a new dseq with the two strands swapped.

        The stagger at the right end of this molecule becomes the left
        stagger (ovhg) of the result; the circular flag is carried over.
        '''
        right_stagger = len(self.watson) - len(self.crick) + self.ovhg
        return dseq(self.crick,
                    self.watson,
                    ovhg=right_stagger,
                    circular=self.circular)

    def loop(self):
        '''
        Circularise this molecule in place.

        Returns None without doing anything if the molecule is already
        circular. For a linear molecule the two ends must be of the same
        type and the 5' sticky end must equal the reverse complement of
        the 3' sticky end; otherwise TypeError is raised.
        '''
        if self.circular:
            return None
        type5, sticky5 = self.FivePrimeStickyEnd()
        type3, sticky3 = self.ThreePrimeStickyEnd()
        if type5 == type3 and str(sticky5) == str(rc(sticky3)):
            self._circular = True
            self._linear   = False
            # Rotate the crick strand by the left stagger so that it lines
            # up with watson once the stagger is reset to zero below.
            self.crick = self.crick[-self.ovhg:] + self.crick[:-self.ovhg]
            # The watson strand becomes the sequence data of the molecule.
            self._data = self.watson

            self.ovhg = 0
            assert len(self.crick) == len(self.watson)
            return None
        else:
            raise TypeError("5' and 3' sticky ends not compatible!")

    def FivePrimeStickyEnd(self):
        '''
        Describe the left end of the molecule.

        Returns a (type, sticky) tuple where type is "5'", "3'" or "blunt"
        and sticky is the lower-cased protruding sequence ("" when blunt).
        If one strand is empty, the whole of the other strand is returned.
        '''
        # Single stranded molecules: the only strand present is the overhang.
        if self.watson and not self.crick:
            return "5'", self.watson.lower()
        if self.crick and not self.watson:
            return "3'", self.crick.lower()

        stagger = self.ovhg
        if stagger < 0:
            # watson protrudes at the left end
            return "5'", self.watson[:-stagger].lower()
        if stagger > 0:
            # crick protrudes at the left end
            return "3'", self.crick[-stagger:].lower()
        return "blunt", ""

    def ThreePrimeStickyEnd(self):
        '''
        Describe the right end of the molecule.

        Returns a (type, sticky) tuple where type is "5'", "3'" or "blunt"
        and sticky is the lower-cased protruding sequence (an empty Seq
        when blunt).
        '''
        # Stagger at the right end, same sign convention as self.ovhg.
        right_stagger = len(self.watson) - len(self.crick) + self.ovhg
        if right_stagger < 0:
            # crick protrudes at the right end
            return "5'", self.crick[:-right_stagger].lower()
        if right_stagger > 0:
            # watson protrudes at the right end
            return "3'", self.watson[-right_stagger:].lower()
        return "blunt", Seq("")

    def __add__(self, other):
        '''
        Ligate other to the right end of this molecule.

        Both molecules have to be linear, and the right end of self must
        be of the same type as the left end of other with a sticky end
        equal to the reverse complement of the other one. Raises TypeError
        otherwise. The result keeps the left stagger of self.
        '''
        # Circular molecules have no free ends to ligate; objects without
        # a "circular" attribute are let through at this point.
        if self.circular:
            raise TypeError("circular DNA cannot be ligated!")
        if getattr(other, "circular", False):
            raise TypeError("circular DNA cannot be ligated!")

        right_type, right_sticky = self.ThreePrimeStickyEnd()
        left_type,  left_sticky  = other.FivePrimeStickyEnd()

        compatible = (right_type == left_type and
                      str(right_sticky) == str(rc(left_sticky)))
        if not compatible:
            raise TypeError("sticky ends not compatible!")

        return dseq(self.watson + other.watson,
                    other.crick + self.crick,
                    self.ovhg,)

    def fill_in(self,nucleotides="ACGT"):
        '''
        Fill in 5' protruding ends in place.

        The recessed strand is extended with the reverse complement of
        the protruding sequence, one letter at a time, stopping at the
        first letter that is not among nucleotides (case-insensitive).
        3' protruding and blunt ends are left untouched. Returns None.
        '''
        allowed = set(nucleotides.lower()+nucleotides.upper())

        def extension(end_type, sticky):
            # Letters that can be added opposite a 5' overhang.
            added = ''
            if end_type == "5'":
                for letter in rc(sticky):
                    if letter not in allowed:
                        break
                    added += letter
            return added

        # Left end: the crick strand is extended, which moves the stagger.
        left_fill = extension(*self.FivePrimeStickyEnd())
        self.crick += left_fill
        self.ovhg  += len(left_fill)

        # Right end: the watson strand is extended.
        right_fill = extension(*self.ThreePrimeStickyEnd())
        self.watson += right_fill
        return

    def FiveThreeNuclease(self, nucleotides="ACGT"):
        '''
        Trim both strands from their beginning, in place. Returns None.

        NOTE(review): presumably meant to model a 5'->3' exonuclease that
        is stopped by the given nucleotides -- confirm the intended
        behaviour before relying on this method.
        '''
        nucleotides = set(nucleotides.lower()+nucleotides.upper())
        type, se = self.FivePrimeStickyEnd()
        # A 5' overhang at the left end is removed from watson first.
        if type == "5'":
            self.ovhg=0
            self.watson = self.watson[len(se):]
        x=0
        # Scan watson for the first letter that IS in nucleotides; an
        # IndexError is raised if there is no such letter.
        # NOTE(review): with the default "ACGT" this presumably stops at
        # x == 0 for ordinary DNA -- confirm the membership test is not
        # inverted.
        while True:
            if self.watson[x] in nucleotides:
                break
            x+=1
        self.ovhg=x
        self.watson = self.watson[x:]
        type, se = self.ThreePrimeStickyEnd()
        # A 5' overhang at the right end is removed from crick.
        if type == "5'":
            self.crick = self.crick[len(se):]
        x=0
        # Same scan on the crick strand.
        while True:
            if self.crick[x] in nucleotides:
                break
            x+=1
        self.crick=self.crick[x:]
        return


    def ThreeFiveNuclease(self,nucleotides=""):
        '''
        Trim the strands in place. Returns None.

        NOTE(review): presumably meant to model a 3'->5' exonuclease that
        is stopped by the given nucleotides -- confirm the intended
        behaviour; see the notes in the body.
        '''
        nucleotides = set(nucleotides.lower()+nucleotides.upper())
        type, se = self.FivePrimeStickyEnd()
        # A 3' overhang at the left end is removed from crick first.
        if type == "3'":
            self.ovhg=0
            self.crick = self.crick[:-len(se)]
        x=0
        # NOTE(review): with the default nucleotides="" the set is empty,
        # so this loop can only end with an IndexError. Also, index -x is
        # index 0 (the first letter) while x == 0 -- confirm.
        while True:
            if self.watson[-x] in nucleotides:
                break
            x+=1
        self.ovhg=-x
        # NOTE(review): crick[:-x] is empty when x == 0 -- confirm.
        self.crick = self.crick[:-x]
        type, se = self.ThreePrimeStickyEnd()
        if type == "3'":
           self.crick = self.crick[len(se):]
        x=0
        # Second scan, again over the watson strand.
        while True:
            if self.watson[-x] in nucleotides:
                break
            x+=1
        self.watson=self.watson[:-x]
        return


    def cut(self, *enzymes):
        '''
        Digest the molecule with one or more enzymes.

        enzymes may be given as separate arguments and/or as arbitrarily
        nested iterables; they are flattened and applied one after the
        other to all fragments produced so far. Each enzyme object has to
        provide search(), catalyze() and an ovhg attribute.

        Returns a list of dseq fragments; a fragment in which an enzyme
        finds no site is passed on unchanged.
        '''
        frags=[self,]
        ''' flatten enzymes '''
        output = []
        stack = []
        stack.extend(reversed(enzymes))
        # Depth-first flattening that keeps the left to right order.
        while stack:
            top = stack.pop()
            if hasattr(top, "__iter__"):
                stack.extend(reversed(top))
            else:
                output.append(top)
        enzymes = output

        # enzymes is a list at this point, so this branch is not taken.
        if not hasattr(enzymes, '__iter__'):
            enzymes = (enzymes,)


        newfrags=[]
        for enzyme in enzymes:
            for frag in frags:
                # Sites are looked for in the double stranded part only.
                if enzyme.search(Seq(frag.dsdata), linear = frag.linear):

                    # Digest each strand separately and pair the pieces;
                    # the crick pieces are reversed to match watson order.
                    s = zip([str(s) for s in enzyme.catalyze(Seq(frag.watson),linear = frag.linear)],
                            [str(s) for s in enzyme.catalyze(Seq(frag.crick), linear = frag.linear)[::-1]])

                    if frag.linear:
                        # The leftmost piece keeps the original left end.
                        newfrags.append(dseq(*s.pop(0),
                                             ovhg = frag.ovhg,
                                             linear = True))
                        # Every other piece starts at an enzyme cut.
                        for seqs in s:
                            newfrags.append(dseq(*seqs,
                                                 ovhg = enzyme.ovhg,
                                                 linear = True))
                    else:
                        # Circular: every piece starts at an enzyme cut.
                        for seqs in s:
                            newfrags.append(dseq(*seqs,
                                                 ovhg=enzyme.ovhg,
                                                 linear=True))
                else:
                    newfrags.append(frag)
            frags=newfrags
            newfrags=[]
        return frags

    def _get_linear(self):
        '''Return True if the molecule is linear.'''
        return self._linear

    def _set_linear(self, value):
        '''Setting a false value circularises the molecule via loop().'''
        if not value:
            self.loop()
        else:
            self._linear=True
            self._circular=False

    def _get_circular(self):
        '''Return True if the molecule is circular.'''
        return self._circular

    def _set_circular(self, value):
        '''Setting a true value circularises the molecule via loop().'''
        if value:
            # loop() sets both flags itself, or raises TypeError if the
            # ends are not compatible. The former second call to loop()
            # after this branch was always a no-op and has been removed.
            self.loop()
        else:
            self._circular = False
            self._linear   = True

    linear   = property(_get_linear  , _set_linear,   "True if the molecule is linear.")
    circular = property(_get_circular, _set_circular, "True if the molecule is circular.")




class drecord(SeqRecord):

    def __init__(self, record,
                         circular               = None,
                         linear                 = None,
                         filter                 = False,
                         raw_string             = "",
                         parsed_from            = None,
                         *args, **kwargs):
        '''
        Wrap a sequence in a SeqRecord whose seq attribute is a dseq.

        record      a string, a SeqRecord-like object (anything with a
                    "features" attribute), a dseq or a Seq. Strings, Seq
                    objects and records holding a single stranded sequence
                    are turned into blunt double stranded dseq objects.
        circular,
        linear      topology; applied only when exactly one of them is
                    given as a boolean.
        filter      if true, characters that are not ASCII letters are
                    removed from the sequence and reported in
                    self.filtered and self.warnings.
        raw_string  stored as self.raw when non-empty ("not set" otherwise).
        parsed_from stored as self.parsed_from when given ("not defined"
                    otherwise).

        Remaining positional and keyword arguments are passed on to
        SeqRecord.__init__.

        Raises TypeError if record is of none of the accepted types.
        '''

        self.raw                = "not set"
        self.parsed_from        = "not defined"
        self.filtered           = None
        self._circular          = None
        self._linear            = None
        self.guessed_alphabet   = False
        self.warnings           = ""

        if isinstance(record, basestring):
            SeqRecord.__init__(self, dseq(record, rc(record), 0), *args, **kwargs)
        elif hasattr(record, "features"):                    # SeqRecord ?
            # Take over every attribute of the record as it is.
            for key, value in record.__dict__.items():
                setattr(self, key, value )
            if hasattr(self.seq, "watson"):
                self.seq=copy.copy(self.seq)
            else:
                self.seq=dseq(str(self.seq), str(rc(self.seq)), 0)
        elif hasattr(record, "watson"):                 # dseq ?
            SeqRecord.__init__(self, record, *args, **kwargs)
        elif isinstance(record, Seq):                    # Seq ?
            SeqRecord.__init__(self, dseq(str(record),str(record.reverse_complement()),0, alphabet=record.alphabet), *args, **kwargs)
        else:
            raise TypeError(("record argument needs to be a string,"
                              "Seq, SeqRecord or dseq object,"
                              " got {}").format(type(record)))

        # These two arguments used to be accepted but ignored. They are
        # applied after the dispatch above so that explicitly passed values
        # also win over attributes copied from an existing record.
        if raw_string:
            self.raw = raw_string
        if parsed_from is not None:
            self.parsed_from = parsed_from

        if filter:
            IUPAC_single_alphabet_letters = ("ACBEDGFIHKJMLONQPSRUTWVYXZ"
                                             "acbedgfihkjmlonqpsrutwvyxz")

            filtered_out = "".join([c for c in self.seq if c not in IUPAC_single_alphabet_letters])

            if filtered_out:
                filtered = "".join([c for c in self.seq if c in IUPAC_single_alphabet_letters])
                # Rebuild a dseq, not a plain Seq: the linear / circular
                # properties of this class rely on self.seq being a dseq.
                self.seq = dseq(filtered,
                                str(rc(filtered)),
                                0,
                                linear   = self.seq.linear,
                                alphabet = self.seq.alphabet)
                self.filtered = filtered_out
                self.warnings += u"{} non-permitted chars were filtered from the sequence!\n".format(", ".join(set(filtered_out)))

        if not isinstance(self.seq.alphabet, (Alphabet.ProteinAlphabet,Alphabet.DNAAlphabet,Alphabet.RNAAlphabet)):
            self.seq.alphabet = guess_alphabet(self.seq)
            self.guessed_alphabet = True
            self.warnings += str(self.seq.alphabet) + " alphabet guessed from sequence"

        if self.id in ("","."):
            self.id = self.name[:7]

        if self.description ==".":
            self.description = ""

        if 'date' not in self.annotations:
            self.annotations["date"] = datetime.date.today().strftime("%d-%b-%Y").upper()

        # Topology is only touched when exactly one of the two flags is
        # given; the property setters delegate to the underlying dseq.
        if circular is None and linear in (True, False,):
            self.linear = linear

        elif linear is None and circular in (True, False,):
            self.circular = circular

    # The topology is stored on the dseq held in self.seq; the accessors
    # below only delegate to it.
    def get_linear(self):
        '''Return the linear flag of the underlying sequence.'''
        return self.seq.linear
    def set_linear(self, value):
        '''Set the linear flag of the underlying sequence to bool(value).'''
        self.seq.linear = bool(value)
    def get_circular(self):
        '''Return the circular flag of the underlying sequence.'''
        return self.seq.circular
    def set_circular(self, value):
        '''Set the circular flag of the underlying sequence to bool(value).'''
        self.seq.circular = bool(value)
    linear   = property(get_linear  , set_linear,   "I'm the 'linear' property.")
    circular = property(get_circular, set_circular, "I'm the 'circular' property.")

    def loop(self):
        '''Circularise this record in place by setting circular to True.'''
        self.circular = True

    def seguid(self):
        '''Return the SEGUID checksum (Bio.SeqUtils.CheckSum.seguid) of self.seq.'''
        return seguid(self.seq)

    def stamp(self):
        '''
        Make sure the description carries a SEGUID stamp.

        A description that already contains "SEGUID" or "seguid" followed
        by 27 non-blank characters is left alone. Otherwise a stamp of the
        form "SEGUID <checksum>" becomes the description if that is empty,
        or is appended to it after a single space.
        '''
        pattern = "(SEGUID|seguid)\s*\S{27}"
        if re.search(pattern, self.description):
            # already stamped, nothing to do
            return

        fresh_stamp = "SEGUID {}".format(seguid(self.seq))
        if self.description:
            self.description += " "+fresh_stamp
        else:
            self.description = fresh_stamp

    def looped(self):
        '''
        Return a circular deep copy of this record.

        The qualifiers of the copied features are re-pointed at the
        qualifier objects of the original features.
        '''
        clone = copy.deepcopy(self)
        clone.circular = True
        for copied, original in zip(clone.features, self.features):
            copied.qualifiers = original.qualifiers
        return clone

    def verify_stamp(self):
        '''
        Check the SEGUID stamp in the description against the sequence.

        Returns False if the description holds no stamp, otherwise whether
        the last 27 characters of the stamp equal the SEGUID of self.seq.
        '''
        pattern = "(SEGUID|seguid)\s*\S{27}"
        try:
            found = re.search(pattern, self.description).group()
        except AttributeError:
            # no match object, hence no stamp
            verified = False
        else:
            verified = seguid(self.seq) == found[-27:]
        return verified

    def format(self,f="gb"):
        '''
        Return the record as a string in format f (see SeqRecord.format).

        For "genbank" / "gb" output, the text starting at character 55 is
        overwritten with the topology of this record ("circular" or
        "linear"); other formats are returned as produced by SeqRecord.
        '''
        text = SeqRecord.format(self,f)
        if f not in ("genbank","gb"):
            return text

        # "circular" replaces 8 characters, "linear" replaces 6.
        if self.circular:
            topology, resume = "circular", 63
        else:
            topology, resume = "linear", 61
        return text[:55]+topology+text[resume:]

    def write(self, filename="", f="gb"):
        '''
        Write the record to a file in format f.

        filename    a path, or an object usable in a "with" statement that
                    has a write() method. Defaults to
                    "<description>.<f>".

        If filename is a path to an existing file, the sequence in that
        file is compared to this one by SEGUID. If they are identical only
        the modification time of the file is updated; otherwise the old
        file is left alone, a message is printed and the record is written
        to "<name>_NEW<ext>" instead.
        '''
        if not filename:
            filename=self.description+"."+f
        if isinstance(filename, basestring):
            if os.path.isfile(filename):
                seguid_new = self.seguid()
                seguid_old = read(filename).seguid()
                if seguid_new == seguid_old:
                    # same sequence: just touch the file
                    os.utime(filename, None)
                else:
                    name, ext = os.path.splitext(filename)
                    new_filename = "{}_NEW{}".format(name, ext)
                    # Format first, then print: the previous
                    # print(...).format(...) only worked as a Python 2
                    # print statement.
                    print(("\n\nseguid(old) = {} in file {}"
                           "\nseguid(new) = {} in file {}\n").format(seguid_old, filename, seguid_new, new_filename))
                    with open(new_filename, "w") as fp:
                        fp.write(self.format(f))

            else:
                with open(filename, "w") as fp:
                    fp.write(self.format(f))
        else:
            # File-like object; note that it is closed on leaving the block.
            with filename as fp:
                fp.write(self.format(f))

    def __str__(self):
        '''Return a short topology/size header followed by SeqRecord.__str__.'''
        header = ("drecord\n"
                  "circular: {}\n"
                  "size: {}\n").format(self.circular, len(self))
        return header + SeqRecord.__str__(self)

    def __repr__(self):
        '''Return "drecord(-N)" for linear and "drecord(oN)" for circular records of length N.'''
        topology_symbol = {True:"-", False:"o"}[self.linear]
        return "drecord({}{})".format(topology_symbol, len(self))

    def fig(self):
        '''Return the text figure (repr) of the underlying sequence.'''
        return repr(self.seq)

    def __add__(self, other):
        '''
        Join other to the right of this record and return a new drecord.

        If other is a record holding a double stranded sequence, it is
        rebuilt with the id, name, description, annotations and dbxrefs of
        self, and its features are shifted by the left stagger of its
        sequence before SeqRecord.__add__ is applied. Anything else is
        first converted with drecord(other), and the result is explicitly
        made linear.
        '''
        if hasattr(other, "seq") and hasattr(other.seq, "watson"):
            offset = other.seq.ovhg
            other = drecord(other.seq,
                            id = self.id,
                            name = self.name,
                            description = self.description,
                            features = [f._shift(offset) for f in other.features],
                            annotations = self.annotations.copy(),
                            dbxrefs = self.dbxrefs[:])
            answer = drecord(SeqRecord.__add__(self, other))
        else:
            answer = drecord(SeqRecord.__add__(self, drecord(other)))
            answer.circular = False
        return answer

#    def __radd__(self, other):
#        other = copy.copy(other)
#        other = drecord(other)
#        answer = drecord(other.__add__(self))
#        answer.circular = False
#        return answer

    def __getitem__(self, index):
        '''
        Index or slice the record as SeqRecord does, returning a drecord
        whose sequence carries the alphabet of this record.
        '''
        piece = drecord(SeqRecord.__getitem__(self, index))
        piece.seq.alphabet = self.seq.alphabet
        return piece

    def cut(self, *enzymes):
        '''
        Digest the record with one or more enzymes.

        enzymes may be given as separate arguments and/or as arbitrarily
        nested iterables; they are flattened and applied one after the
        other to all fragments produced so far. Each enzyme object has to
        provide search() and an ovhg attribute.

        Returns a list of drecord fragments. Each fragment receives the
        features of the corresponding slice of its parent, and a
        description made of the description of self followed by the names
        of all enzymes, joined by "_".
        '''
        frags=[self,]
        output = []
        stack = []
        stack.extend(reversed(enzymes))
        # Depth-first flattening that keeps the left to right order.
        while stack:
            top = stack.pop()
            if hasattr(top, "__iter__"):
                stack.extend(reversed(top))
            else:
                output.append(top)
        enzymes = output
        # enzymes is a list at this point, so this branch is not taken.
        if not hasattr(enzymes, '__iter__'):
            enzymes = (enzymes,)
        newfrags=[]
        for enz in enzymes:
            for frag in frags:
                wts = Seq(frag.seq.watson)
                crk = Seq(frag.seq.crick)

                # NOTE(review): an extra "N" is appended to both strands of
                # linear fragments before searching -- presumably so that
                # sites at the very end are reported; confirm.
                if frag.linear:
                    wts+="N"
                    crk+="N"

                # Cut positions on each strand, shifted down by one.
                ws = [x-1 for x in enz.search(wts, linear = frag.linear)]
                cs = [x-1 for x in enz.search(crk, linear = frag.linear)]

                # Pair watson cuts with crick cuts (taken in reverse order)
                # and keep the pairs whose positions agree once the
                # staggers of the fragment and of the enzyme are taken
                # into account.
                sitepairs = [(sw, sc) for sw, sc in zip(ws,cs[::-1])
                             if (sw + max(0, frag.seq.ovhg) -
                             max(0, enz.ovhg)
                             ==
                             len(frag.seq.crick)-sc -
                             min(0, frag.seq.ovhg) +
                             min(0, enz.ovhg))]

                # Sentinel pair standing for the right end of the fragment.
                sitepairs = sitepairs + [(len(frag.seq.watson), 0)]

                w2, c1 = sitepairs[0]

                # Leftmost piece: keeps the left stagger of the parent.
                nwat = frag.seq.watson[:w2]
                ncrk = frag.seq.crick[c1:]

                newfrag=drecord(dseq(nwat, ncrk, ovhg=frag.seq.ovhg))
                feature_lim = max(len(newfrag.seq.watson)+newfrag.seq.ovhg , len(newfrag.seq.crick)-newfrag.seq.ovhg)
                newfrag.features = frag[:feature_lim].features
                newfrags.append(newfrag)

                # Remaining pieces: each one starts at an enzyme cut.
                for (w1, c2), (w2, c1)  in zip(sitepairs[:-1], sitepairs[1:]):
                    nwat = frag.seq.watson[w1:w2]
                    ncrk = frag.seq.crick[c1:c2]
                    newfrag = drecord(dseq(nwat,ncrk, ovhg=enz.ovhg))
                    newfrag.features = frag[min(w1, len(frag.seq.crick)-c2):max(w2,len(frag.seq.crick)-c1)].features
                    newfrags.append(newfrag)

                # For a circular parent the last and the first piece of the
                # list are joined into a single fragment.
                if frag.circular:
                    newfrag=newfrags.pop()+newfrags.pop(0)
                    newfrags.append(newfrag)
                if not newfrags:
                    newfrags.append(frag)

            frags=newfrags
            newfrags=[]
            for f in frags:
                f.description = self.description+"_"+"_".join(str(e) for e in enzymes)
        return frags

    def reverse_complement(self):
        '''Return the reverse complement of this record; same as rc().'''
        return self.rc()

    def rc(self):
        '''
        Return the reverse complement of this record as a new drecord
        with the same circular flag.
        '''
        flipped = drecord(SeqRecord.reverse_complement(self))
        flipped.circular = self.circular
        return flipped

    def shifted(self, shift):
        '''
        Return a circular copy of this record whose sequence starts at
        position shift of the original.

        shift has to satisfy 0 <= shift < len(self).

        Features that do not contain position shift are copied over as
        they are; a feature containing it is replaced by a "join" feature
        built from two sub features, one at the start and one at the end
        of the new sequence.

        Raises Exception if the record is linear or shift is out of range.
        '''
        if self.linear:
            raise Exception("Only circular DNA can be synced!")
        length=len(self)
        if not 0<=shift<length:
            # The message used to be followed by ",format(...)" (a comma
            # instead of a dot), so the placeholders were never filled in.
            raise Exception("shift ({}) has to be 0<=shift<length({})".format(shift, length))

        new = copy.deepcopy(self)
        new.circular = True
        for fn, fo in zip(new.features, self.features):
            fn.qualifiers = fo.qualifiers

        # Open the copy, double it and cut out one full length window
        # starting at shift.
        new.linear = True

        new = (new+new)[shift:shift+length]

        new.circular = True

        # The features are rebuilt from those of the original record.
        new.features = []

        for feature in self.features:
            if shift not in feature:
                new.features.append(feature)
            else:
                # The feature spans the new origin and is split in two.
                new_start = length -(shift-feature.location.start)
                new_end   = feature.location.end-shift
                a = SeqFeature(FeatureLocation(0, new_end),
                               type=feature.type,
                               location_operator=feature.location_operator,
                               strand=feature.strand,
                               id=feature.id,
                               qualifiers=feature.qualifiers,
                               sub_features=None)
                b = SeqFeature(FeatureLocation(new_start, length),
                               type=feature.type,
                               location_operator=feature.location_operator,
                               strand=feature.strand,
                               id=feature.id,
                               qualifiers=feature.qualifiers,
                               sub_features=None)
                c = SeqFeature(FeatureLocation(new_start, new_end),
                               type=feature.type,
                               location_operator="join",
                               strand=feature.strand,
                               id=feature.id,
                               qualifiers=feature.qualifiers,
                               sub_features=[a,b])
                sub_features=[]
                # NOTE(review): the loop variable sf is never used; every
                # branch works on the enclosing feature -- confirm.
                for sf in feature.sub_features:
                    if feature.location.end<shift:
                        sub_features.append(SeqFeature(FeatureLocation(length-feature.location.start,
                                                                       length-feature.location.end),
                                            type=feature.type,
                                            location_operator=feature.location_operator,
                                            strand=feature.strand,
                                            id=feature.id,
                                            qualifiers=feature.qualifiers,
                                            sub_features=None))
                    elif feature.location.start>shift:
                        sub_features.append(SeqFeature(FeatureLocation(feature.location.start-shift,
                                                                       feature.location.end-shift),
                                            type=feature.type,
                                            location_operator=feature.location_operator,
                                            strand=feature.strand,
                                            id=feature.id,
                                            qualifiers=feature.qualifiers,
                                             sub_features=None))
                    else:
                        # NOTE(review): extend() without an argument raises
                        # TypeError; the wrap-around handling of sub
                        # features was apparently never written. Left as
                        # found until the intended behaviour is decided.
                        sub_features.extend() #wraparound(sf))
                c.sub_features.extend(sub_features)
                new.features.append(c)
        return new


    def synced(self, ref, limit = 25):
        '''
        Return a copy of this circular record shifted so that it starts
        at the position where it overlaps with ref.

        ref     a string, or an object with a "seq" attribute.
        limit   passed on (capped, see below) to common_sub_strings as the
                limit argument.

        Raises Exception if the record is linear or if no common sub
        string with ref is found on either strand.
        '''
        if self.linear:
            raise Exception("Only circular DNA can be synced!")

        sequence = copy.copy(self.seq)
        sequence.linear=True

        a    = str(sequence.watson).lower()
        a_rc = str(sequence.crick).lower()
        sequence_rc = sequence.reverse_complement()
        # NOTE(review): double_sequence is not used below.
        double_sequence = sequence+sequence

        if hasattr(ref, "seq"):
            b=ref.seq
            # NOTE(review): this tests ref, not b (= ref.seq), for a
            # "watson" attribute -- presumably b was meant; confirm.
            if hasattr(ref, "watson"):
                b = str(b.watson).lower()
            else:
                b = str(b).lower()
        else:
            b = str(ref.lower())

        # The reference is cut down to the length of this sequence.
        b=b[:len(a)]

        # Look for the reference in the doubled sequence (so that matches
        # spanning the origin are found), on both strands.
        c = common_sub_strings(a+a, b, limit = min(limit, limit*(len(a)/limit)+1))
        d = common_sub_strings(a_rc+a_rc, b, limit = min(limit, limit*(len(a)/limit)+1))

        if c:
            starta, startb, length = c.pop(0)
        else:
            starta, startb, length = 0,0,0

        if d:
            starta_rc, startb_rc, length_rc = d.pop(0)
        else:
            starta_rc, startb_rc, length_rc = 0,0,0

        # c and d are tested after pop(), i.e. this triggers when neither
        # list has any entry left.
        if not c and not d:
            raise Exception("There is no overlap between sequences!")

        # Prefer the crick strand hit if it is the longer one.
        # NOTE(review): sequence is reassigned here but the result below is
        # always computed from self -- confirm.
        if length_rc>length:
            starta, startb = starta_rc, startb_rc
            sequence = sequence_rc

        # Translate the two start positions into a shift of this record.
        if starta>startb:
            if len(a)<len(b):
                ofs = starta-startb + len(b)-len(a)
            else:
                ofs = starta-startb
        elif starta<startb:
            ofs = startb-starta + len(a)-len(b)
            ofs = len(a)-ofs
        elif starta==startb:
            ofs=0
        return self.shifted(ofs)


def read(data, filter = False):
    '''
    read(data, filter = False) --> drecord object

    Returns one sequence found in data; at least one sequence is
    required. When data holds several sequences, the last one parsed is
    returned (the list returned by parse is popped from its end).

    data is a string containing:

    1. an absolute path to a local file; the file will be read in text
       mode and parsed for EMBL, FASTA and Genbank sequences
    2. a path to a local directory;
       all files in the directory will be parsed as in 1.
    3. a string containing one or more
       sequences in EMBL, GENBANK, or FASTA format;
       mixed formats are allowed.
    4. data can be a list or other iterable of 1 - 3

    filter is passed on to parse.

    Raises ValueError if no sequence is found.
    '''

    results = parse(data, filter)
    try:
        return results.pop()
    except IndexError:
        shown = 'empty string' if data == '' else data
        try:
            shown = shown[:20]
        except TypeError:
            # data is not sliceable; show it as it is
            pass
        message = "No sequences found in data ({})".format(shown)
        # The message is still printed, as before, and is now also
        # attached to the exception. The call form works as a Python 2
        # print statement as well as a print() function.
        print(message)
        raise ValueError(message)

def parse(data, filter = False):
    '''
    parse(data, filter = False) --> list of drecord objects

    Returns all sequences found in data. If no sequences are found, an
    empty list is returned.

    data is a string containing:

    1. an absolute path to a local file; the file will be read in text
       mode and parsed for EMBL, FASTA and Genbank sequences

    2. a path to a local directory;
       all regular files in the directory will be parsed as in 1.

    3. a string containing one or more
       sequences in EMBL, GENBANK, or FASTA format;
       mixed formats are allowed.

    4. data can be a list or other iterable of 1 - 3; items that are not
       strings are skipped.

    filter is passed on to parse_string_to_formatted_records.
    '''
    frs=[]
    if not hasattr(data, '__iter__'):
        data = (data,)
    for item in data:
        if not isinstance(item,basestring):
            continue
        item = textwrap.dedent(item).strip()
        if os.path.isdir(item):
            # os.listdir returns bare names, so they have to be joined
            # with the directory; sub directories are skipped. Sorting
            # makes the order of the result reproducible.
            for file_ in sorted(os.listdir(item)):
                path = os.path.join(item, file_)
                if not os.path.isfile(path):
                    continue
                with open(path,'r') as f:
                    raw = "\n\n"+f.read()
                frs.extend( parse_string_to_formatted_records(raw, filter) )
        elif os.path.isfile(os.path.join(os.getcwd(),item)):
            with open(item,'r') as f:
                raw = f.read()
            frs.extend( parse_string_to_formatted_records(raw, filter) )
        else:
            # Not a path: treat the string itself as sequence data.
            frs.extend( parse_string_to_formatted_records(item, filter) )
    return frs

def parse_seqs(*args,**kwargs):
    '''
    Alias for parse_string_to_formatted_records.

    All positional and keyword arguments are passed on unchanged and the
    result of that function is returned.
    '''
    return parse_string_to_formatted_records(*args,**kwargs)

def parse_string_to_formatted_records(rawstring, filter = False):
    '''
    parse_string_to_formatted_records(rawstring, filter = False)
    --> list of drecord objects

    rawstring is a string containing one or more
    sequences in EMBL, GENBANK, or FASTA format
    mixed formats are allowed.

    filter is passed on to the drecord constructor. If filter is True,
    the input sequences will be filtered for allowed characters in
    biological alphabets.

    Each candidate record found in rawstring is tried as EMBL, then
    Genbank, then FASTA; candidates that none of the three parsers
    yields a record for are skipped.

    A record is flagged circular when the word "circular" occurs in the
    first line of its raw text (EMBL, FASTA) or in the residue type
    parsed from the record (Genbank).

    The function returns a list of drecord objects, in the order the
    records appear in rawstring.
    '''
    from Bio.GenBank import RecordParser
    pattern =  r"(?:>.+\n^(?:^[^>]+?)(?=\n\n|>|LOCUS|ID))|(?:(?:LOCUS|ID)(?:(?:.|\n)+?)^//)"

    # Normalise Windows and old Mac line endings so the multi-line
    # pattern only has to deal with "\n".
    rawstring = rawstring.replace( '\r\n', '\n')
    rawstring = rawstring.replace( '\r',   '\n')
    rawseqs = re.findall(pattern,textwrap.dedent(rawstring+"\n\n"),re.MULTILINE)
    sequences=[]

    for rawseq in rawseqs:
        circular = False
        parsed = None
        original_format = None
        handle = StringIO.StringIO(rawseq)
        try:
            try:
                parsed = next(SeqIO.parse(handle, "embl"))
                original_format = "embl"
                if "circular" in rawseq.splitlines()[0]:
                    circular = True
            except StopIteration:
                handle.seek(0)
                try:
                    parsed = next(SeqIO.parse(handle, "genbank"))
                    original_format = "genbank"
                    # Second pass over the same text to read the
                    # residue type that RecordParser exposes.
                    handle.seek(0)
                    parser = RecordParser()
                    residue_type = parser.parse(handle).residue_type
                    if "circular" in residue_type:
                        circular = True
                except StopIteration:
                    handle.seek(0)
                    try:
                        parsed = next(SeqIO.parse(handle, "fasta"))
                        original_format = "fasta"
                        if "circular" in rawseq.splitlines()[0]:
                            circular = True
                    except StopIteration:
                        # No parser produced a record; make sure a
                        # partial result from an earlier attempt is
                        # not kept.
                        parsed = None
        finally:
            # Close the in-memory handle on every path, including the
            # one where the candidate is skipped.
            handle.close()

        if parsed is None:
            continue

        sequences.append( drecord( parsed,
                                   parsed_from = original_format,
                                   raw_string  = rawseq,
                                   circular    = circular,
                                   filter      = filter ))

    return sequences




if __name__=="__main__":
    # Ad-hoc self-test, executed only when this module is run as a script.
    # First run the examples embedded in the docstrings of this module.
    import doctest
    doctest.testmod()

    from Bio.Restriction import Acc65I, KpnI, NlaIV, EcoRI, EcoRV
    import pydna


    # dseq(watson, crick, ovhg): the second strand is written left-to-right
    # and reversed with [::-1] before being passed as crick; ovhg is 4.
    a = dseq(  'CACANGGTACCNGGTACCNGCGGATATC',
           'AATTGTGTNCCATGGNCCATGGNCGCCTATAGatgc'[::-1], 4)

    #print a.fig()

    # The return value is discarded here.
    # NOTE(review): ThreeFiveNuclease is defined outside this section;
    # what it does to `a` cannot be told from here - TODO confirm.
    a.ThreeFiveNuclease("a")

    #print a.fig()


    b = dseq(    'CACANGGTACCNGGTACCNGCGGATATC',
             'AATTGTGTNCCATGGNCCATGGNCGCCTATAG'[::-1], 4)
    #print b.fig()
    #b.fillin()
    #print b.fig()

    # NOTE(review): hard-coded absolute path on the author's machine; this
    # line will fail anywhere that file does not exist.
    a=pydna.read("/home/bjorn/Dropbox/python-dna-dev/tests/pUC19.gb")


    # presumably re-origins the sequence relative to the given
    # subsequence - TODO confirm against the synced() definition.
    a=a.synced("cggtgatgacggtgaaaacctctgacacat")

    # Show the first 60 positions of the resulting sequence.
    print a.seq[0:60]




    # Three records built from dseq objects that differ in the ovhg
    # argument: -4, 0 and 4.
    a = (drecord( dseq(  'AATTCACANGGTACCNGGTACCNGCGGATATC',
                             'GTGTNCCATGGNCCATGGNCGCCTATAG'[::-1], -4)),

         drecord( dseq(      'CACANGGTACCNGGTACCNGCGGATATC',
                             'GTGTNCCATGGNCCATGGNCGCCTATAG'[::-1], 0)),

         drecord( dseq(    'CACANGGTACCNGGTACCNGCGGATATC',
                       'AATTGTGTNCCATGGNCCATGGNCGCCTATAG'[::-1], 4)),)

    enzymes = [Acc65I, NlaIV, KpnI]

    # Every enzyme must yield exactly three fragments per record (the
    # unpacking fails otherwise), and adding the fragments back together
    # must reproduce the original sequence, compared case-insensitively.
    for enz in enzymes:
        for f in a:
            b,c,d = f.cut(enz)
            e=b+c+d
            assert str(e.seq).lower() == str(f.seq).lower()



    # Read a single Genbank record from an inline string. The record is
    # 33 bp long and carries nine labelled misc_feature entries.
    a=pydna.read('''

LOCUS       New_DNA                   33 bp ds-DNA     linear       08-NOV-2012
DEFINITION  .
ACCESSION
VERSION
SOURCE      .
  ORGANISM  .
COMMENT
COMMENT     ApEinfo:methylated:1
FEATURES             Location/Qualifiers
     misc_feature    1..11
                     /label=Acc65I-1
                     /ApEinfo_fwdcolor=cyan
                     /ApEinfo_revcolor=green
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     misc_feature    12..18
                     /label=Acc65I-2
                     /ApEinfo_fwdcolor=cyan
                     /ApEinfo_revcolor=green
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     misc_feature    19..33
                     /label=Acc65I-3
                     /ApEinfo_fwdcolor=cyan
                     /ApEinfo_revcolor=green
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     misc_feature    1..15
                     /label=KpnI-1
                     /ApEinfo_fwdcolor=cyan
                     /ApEinfo_revcolor=green
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     misc_feature    16..22
                     /label=KpnI-2
                     /ApEinfo_fwdcolor=cyan
                     /ApEinfo_revcolor=green
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     misc_feature    23..33
                     /label=KpnI-3
                     /ApEinfo_fwdcolor=cyan
                     /ApEinfo_revcolor=green
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     misc_feature    1..13
                     /label=NlaIV-1
                     /ApEinfo_fwdcolor=cyan
                     /ApEinfo_revcolor=green
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     misc_feature    14..20
                     /label=NlaIV-2
                     /ApEinfo_fwdcolor=cyan
                     /ApEinfo_revcolor=green
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     misc_feature    21..33
                     /label=NlaIV-3
                     /ApEinfo_fwdcolor=cyan
                     /ApEinfo_revcolor=green
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
ORIGIN
        1 GAATTCacan ggtaccnGGT ACCngcgGAT ATC
//

    ''')


    # The checksum of the parsed record must match this fixed value.
    assert a.seguid()=="di3hL8t2G4iQQsxlm/CtvnUMBz8"


    # Feature labels must come back in the order they appear in the record.
    assert ([x.qualifiers["label"][0] for x in a.features] ==
    ['Acc65I-1', 'Acc65I-2', 'Acc65I-3', 'KpnI-1', 'KpnI-2',
     'KpnI-3', 'NlaIV-1', 'NlaIV-2', 'NlaIV-3'])

    b,c,d = a.cut(Acc65I)

    # NOTE(review): the feature comparisons below are printed as True/False
    # rather than asserted, so a mismatch does not stop the script.
    print [x.qualifiers["label"][0] for x in b.features] == ['Acc65I-1', 'KpnI-1', 'NlaIV-1']
    print [x.qualifiers["label"][0] for x in c.features] == ['Acc65I-2', 'KpnI-2', 'NlaIV-2']
    print [x.qualifiers["label"][0] for x in d.features] == ['Acc65I-3', 'KpnI-3', 'NlaIV-3']
    e = b+c+d
    print sorted([x.qualifiers["label"][0] for x in e.features])  == [x.qualifiers["label"][0] for x in a.features]
    assert str(a.seq)==str(e.seq)

    # Same checks for KpnI; here the re-joined sequence is not asserted.
    b,c,d = a.cut(KpnI)
    print [x.qualifiers["label"][0] for x in b.features] == ['Acc65I-1', 'KpnI-1', 'NlaIV-1']
    print [x.qualifiers["label"][0] for x in c.features] == ['Acc65I-2', 'KpnI-2', 'NlaIV-2']
    print [x.qualifiers["label"][0] for x in d.features] == ['Acc65I-3', 'KpnI-3', 'NlaIV-3']
    e = b+c+d
    print sorted([x.qualifiers["label"][0] for x in e.features])  == [x.qualifiers["label"][0] for x in a.features]

    # For NlaIV a different set of labels is expected on each fragment.
    b,c,d = a.cut(NlaIV)
    print [x.qualifiers["label"][0] for x in b.features] == ['Acc65I-1', 'NlaIV-1']
    print [x.qualifiers["label"][0] for x in c.features] == ['NlaIV-2']
    print [x.qualifiers["label"][0] for x in d.features] == [ 'KpnI-3', 'NlaIV-3']
    e = b+c+d
    assert str(a.seq)==str(e.seq)

    # EcoRI and EcoRV are each expected to give two fragments.
    b,c = a.cut(EcoRI)
    e = b+c
    assert str(a.seq)==str(e.seq)

    b,c = a.cut(EcoRV)
    e = b+c
    assert str(a.seq)==str(e.seq)

    # Cutting with two enzymes at once: three fragments expected.
    b,c,d = a.cut(EcoRI,EcoRV)
    e = b+c+d

    assert str(a.seq)==str(e.seq)

    # Four fragments expected, regardless of the order the two enzymes
    # are given in.
    b,c,d, f = a.cut(Acc65I,EcoRI)
    e = b+c+d+f
    assert str(a.seq)==str(e.seq)

    b,c,d, f = a.cut(EcoRI,Acc65I)
    e = b+c+d+f
    assert str(a.seq)==str(e.seq)
    print "done!"

    # NOTE(review): relative path, so this only works when the script is
    # started from a directory where ../tests/RefDataBjorn.fas exists.
    seqs = parse('../tests/RefDataBjorn.fas', filter=False)

    # The file must yield 771 records, all of length 901.
    assert len(seqs) == 771
    # NOTE(review): under Python 2 the comprehension variable `a` leaks and
    # rebinds the outer `a`; it is reassigned in the loop below anyway.
    assert list(set([len (a) for a in seqs])) == [901]

    # Rebuild each record id from the "|"-separated description fields:
    # the third field (spaces replaced by underscores) plus the record
    # index. The description is then cleared.
    for i,s in enumerate(seqs):
        a = s.description
        b = a.split("|")
        # NOTE(review): c is built but not used afterwards.
        c =  "|".join([b[0],b[1],b[3]])
        s.id = b[2].replace(" ","_")+"_"+str(i)
        s.description = ""
        # Special case: for this value of the fourth field, the id is
        # derived from the fourth field instead of the third.
        if b[3]=="Zenion hololepis":
            s.id = b[3].replace(" ","_")+"_"+str(i)
        s.seq.alphabet = IUPACAmbiguousDNA()
    print "done! II"