#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
    findsubstrings
    ~~~~~~~~~~~~~~

    The Python-dna package.


'''

import numpy as np
cimport numpy as np
cimport cython
np.import_array()

@cython.boundscheck(False)
@cython.wraparound(False)

def common_sub_strings(char* stringx, char* stringy, int limit=25):
    '''
    common_sub_strings(stringx , stringy , limit=25)

    Finds all the common substrings between stringx and stringy
    longer than limit. This function is case sensitive.

    returns a list of tuples describing the substrings
    The list is sorted longest -> shortest.

    Example:

    [(startx1,starty1,length1),(startx2,starty2,length2), ...]

    startx1 = position in x where substring 1 starts
    starty1 = position in y where substring 1 starts
    length  = lenght of substring

    '''

    cdef int x
    cdef int y

    cdef int maxx = len(stringx)
    cdef int maxy = len(stringy)

    cdef np.ndarray[np.int_t, ndim=2]  M = np.zeros((maxx+1,maxy+1), dtype=np.int )

    matches=[]

    limit=limit-1

    for x in range(1,maxx+1):
        for y in range(1,maxy+1):
            if stringx[x-1] == stringy[y-1]:
                M[x,y] = M[x-1,y-1] + 1
            else:
                M[x,y] = 0
                if M[x-1,y-1]>limit:
                    matches.append((x-1-M[x-1,y-1], y-1-M[x-1,y-1], M[x-1,y-1]))

    matches.extend([( x-M[x,maxy],
             maxy-M[x,maxy],
             M[x,maxy] )
             for x in range(1,maxx) if M[x,maxy]>limit])

    matches.extend([( maxx-M[maxx,y],
             y-M[maxx,y],
             M[maxx,y] )
             for y in range(1,maxy+1) if M[maxx,y]>limit])

    return sorted(matches, key=lambda x: x[2], reverse=True)

if __name__=="__main__":
    import time

    xy=[
    ("zzzzzGGATCCaaa", "uGGATCCxxx",6),
    ("GGATCC","GGATCC",6),
    ("uGGATCC","zGGATCC",6),
    ("zzzzzGGATCCGAATTCaaa", "uGGATCCGAATTCxxx",6),
    ("GGATCCGAATTC","GGATCCGAATTC",6),
    ("GGATCCGAATTCu","GGATCCGAATTCz",6),
    ("uGGATCCGAATTC","zGGATCCGAATTC",6),
    ('''ATGTCGTCGTCAATTACAGATGAGAAAATATCTGGTGAACAGCAACAACCTGCTGGCAGAAAACTATACTATAACACAAGTACATTTGCAGAGCCTCCTCTAGTGGACGGAGAAGGTAACCCTATAAATTATGAGCCGGAAGTTTACAACCCGGATCACGAAAAGCTATACCATAACCCATCACTGCCTGCACAATCAATTCAGGATACAAGAGATGATGAATTGCTGGAAAGAGTTTATAGCCAGGATCAAGGTGTAGAGTATGAGGAAGATGAAGAGGATAAGCCAAACCTAAGCGCTGCGTCCATTAAAAGTTATGCTTTAACGAGATTTACGTCCTTACTGCACATCCACGAGTTTTCTTGGGAGAATGTCAATCCCATACCCGAACTGCGCAAAATGACATGGCAGAATTGGAACTATTTTTTTATGGGTTATTTTGCGTGGTTGTCTGCGGCTTGGGCCTTCTTTTGCGTTTCAGTATCAGTCGCTCCATTGGCTGAACTATATGACAGACCAACCAAGGACATCACCTGGGGGTTGGGATTGGTGTTATTTGTTCGTTCAGCAGGTGCTGTCATATTTGGTTTATGGACAGATAAGTCTTCCAGAAAGTGGCCGTACATTACATGTTTGTTCTTATTTGTCATTGCACAACTCTGTACTCCATGGTGTGACACATACGAGAAATTTCTGGGCGTAAGGTGGATAACCGGTATTGCTATGGGAGGAATTTACGGATGTGCTTCTGCAACAGCGATTGAAGATGCACCTGTGAAAGCACGTTCGTTCCTATCAGGTCTATTTTTTTCTGCTTACGCTATGGGGTTCATATTTGCTATCATTTTTTACAGAGCCTTTGGCTACTTTAGGGATGATGGCTGGAAAATATTGTTTTGGTTTAGTATTTTTCTACCAATTCTACTAATTTTCTGGAGATTGTTATGGCCTGAAACGAAATACTTCACCAAGGTTTTGAAAGCCCGTAAATTAATATTGAGTGACGCAGTGAAAGCTAATGGTGGCGAGCCTCTACCAAAAGCCAACTTTAAACAAAAGATGGTATCCATGAAGAGAACAGTTCAAAAGTACTGGTTGTTGTTCGCATATTTGGTTGTTTTATTGGTGGGTCCAAATTACTTGACTCATGCTTCTCAAGACTTGTTGCCAACCATGCTGCGTGCCCAATTAGGCCTATCCAAGGATGCTGTCACTGTCATTGTAGTGGTTACCAACATCGGTGCTATTTGTGGGGGTATGATATTTGGACAGTTCATGGAAGTTACTGGAAGAAGATTAGGCCTATTGATTGCATGCACAATGGGTGGTTGCTTCACCTACCCTGCATTTATGTTGAGAAGCGAAAAGGCTATATTAGGTGCCGGTTTCATGTTATATTTTTGTGTCTTTGGTGTCTGGGGTATCCTGCCCATTCACCTTGCAGAGTTGGCCCCTGCTGATGCAAGGGCTTTGGTTGCCGGTTTATCTTACCAGCTAGGTAATCTAGCTTCTGCAGCGGCTTCCACGATTGAGACACAGTTAGCTGATAGATACCCATTAGAAAGAGATGCCTCTGGTGCTGTGATTAAAGAAGATTATGCCAAAGTTATGGCTATCTTGACTGGTTCTGTTTTCATCTTCACATTTGCTTGTGTTTTTGTTGGCCATGAGAAATTCCATCGTGATTTGTCCTCTCCTGTTATGAAGAAATATATAAACCAAGTGGAAGAATACGAAGCCGATGGTCTTTCGATTAGTGACATTGTTGAACAAAAGACGGAATGTGCTTCAGTGAAGATGATTGATTCGAACGTCTCAAAGACATATGAGGAGCATATTGAGACCGTTTAA''','''ATGTCGTCGTCAATTACAGATGAGAAAATATCTGGTGAACAGCAACAACCTGCTGGCAGAAAACTATACTATAACACAAGTACATTTGCAGAGCCTCCTCTAGTGGACGGAGAAGGTAACCCTATAAATTATGAGCCGGAAGTTTACAACCCGGATCACGAAAAGCTATACCATAACCCATCACTGCCTGCACAATCAATTCAGGATACAAGAGATGATGAATTGCTGGAAAGAGTTTATAGCCAGGATCAAGGTGTAGAGTATGAGGAAGATGAAGAGGATAAGCCAAACCTAAGCGCTGCGTCCATTAAAAGTTATGCTTTAACGAGATTTACGTCCTTACTGCACATCCACGAGTTTTCTTGGGAGAATGTCAATCCCATACCCGAACTGCGCAAAATGACATGGCAGAATTGGAACTATTTTTTTATGGGTTATTTTGCGTGGTTGTCTGCGGCTTGGGCCTTCTTTTGCGTTTCAGTATCAGTCGCTCCATTGGCTGAACTATATGACAGACCAACCAAGGACATCACCTGGGGGTTGGGATTGGTGTTATTTGTTCGTTCAGCAGGTGCTGTCATATTTGGTTTATGGACAGATAAGTCTTCCAGAAAGTGGCCGTACATTACATGTTTGTTCTTATTTGTCATTGCACAACTCTGTACTCCATGGTGTGACACATACGAGAAATTTCTGGGCGTAAGGTGGATAACCGGTATTGCTATGGGAGGAATTTACGGATGTGCTTCTGCAACAGCGATTGAAGATGCACCTGTGAAAGCACGTTCGTTCCTATCAGGTCTATTTTTTTCTGCTTACGCTATGGGGTTCATATTTGCTATCATTTTTTACAGAGCCTTTGGCTACTTTAGGGATGATGGCTGGAAAATATTGTTTTGGTTTAGTATTTTTCTACCAATTCTACTAATTTTCTGGAGATTGTTATGGCCTGAAACGAAATACTTCACCAAGGTTTTGAAAGCCCGTAAATTAATATTGAGTGACGCAGTGAAAGCTAATGGTGGCGAGCCTCTACCAAAAGCCAACTTTAAACAAAAGATGGTATCCATGAAGAGAACAGTTCAAAAGTACTGGTTGTTGTTCGCATATTTGGTTGTTTTATTGGTGGGTCCAAATTACTTGACTCATGCTTCTCAAGACTTGTTGCCAACCATGCTGCGTGCCCAATTAGGCCTATCCAAGGATGCTGTCACTGTCATTGTAGTGGTTACCAACATCGGTGCTATTTGTGGGGGTATGATATTTGGACAGTTCATGGAAGTTACTGGAAGAAGATTAGGCCTATTGATTGCATGCACAATGGGTGGTTGCTTCACCTACCCTGCATTTATGTTGAGAAGCGAAAAGGCTATATTAGGTGCCGGTTTCATGTTATATTTTTGTGTCTTTGGTGTCTGGGGTATCCTGCCCATTCACCTTGCAGAGTTGGCCCCTGCTGATGCAAGGGCTTTGGTTGCCGGTTTATCTTACCAGCTAGGTAATCTAGCTTCTGCAGCGGCTTCCACGATTGAGACACAGTTAGCTGATAGATACCCATTAGAAAGAGATGCCTCTGGTGCTGTGATTAAAGAAGATTATGCCAAAGTTATGGCTATCTTGACTGGTTCTGTTTTCATCTTCACATTTGCTTGTGTTTTTGTTGGCCATGAGAAATTCCATCGTGATTTGTCCTCTCCTGTTATGAAGAAATATATAAACCAAGTGGAAGAATACGAAGCCGATGGTCTTTCGATTAGTGACATTGTTGAACAAAAGACGGAATGTGCTTCAGTGAAGATGATTGATTCGAACGTCTCAAAGACATATGAGGAGCATATTGAGACCGTTTAA''',25)]

    for x,y,lim in xy:
        x="".join(x.lower().splitlines())
        y="".join(y.lower().splitlines())
        before = time.time()
        result = common_sub_strings(x,y,lim)
        t = (time.time() - before)
        if result:
            for a in result:
                print "common substring length {0}, starting at position {1} in x".format(a[2],a[0])
                print "and at position {0} in y, this took {1} seconds" .format(a[1],t)
                print
        else:
            print "no common substrings found between x {0} chars and y {1} chars, this took {2} seconds".format(len(x),len(y),t)


'''
common substring length 6, starting at position 0 in x
and at position 0 in y, this took 2.09808349609e-05 seconds

common substring length 6, starting at position 1 in x
and at position 1 in y, this took 1.69277191162e-05 seconds

common substring length 12, starting at position 5 in x
and at position 1 in y, this took 2.40802764893e-05 seconds

common substring length 12, starting at position 0 in x
and at position 0 in y, this took 2.00271606445e-05 seconds

common substring length 12, starting at position 0 in x
and at position 0 in y, this took 1.8835067749e-05 seconds

common substring length 12, starting at position 1 in x
and at position 1 in y, this took 1.90734863281e-05 seconds

common substring length 1851, starting at position 0 in x
and at position 0 in y, this took 0.0836689472198 seconds
'''


