#!/usr/bin/env python
#
# Copyright John Reid 2012
#

"""
Reads sequences from a FASTA file and shortens each sequence to match the lengths of sequences in another FASTA file.
"""


import sys, corebio.seq_io.fasta_io as F, corebio.seq
import bioinfutils
from optparse import OptionParser


#
# Parse the options
#
# The positional arguments are spelled out in the usage string so that
# --help documents them as well as the options.
option_parser = OptionParser(
    usage='%prog [options] <FASTA to match> <FASTA to shorten> <output FASTA>'
)
option_parser.add_option(
    '-r',
    '--repeat',
    action='store_true',
    default=False,
    # The flag restarts iteration over the sequences being shortened, not
    # over the sequences whose lengths are matched.
    help="When the sequences to shorten run out, start again from the "
         "first one instead of failing.",
)
# options.repeat is the boolean flag; args holds the positional arguments.
options, args = option_parser.parse_args()


#
# Check args
#
# Exactly three positional arguments are required: the FASTA whose sequence
# lengths are matched, the FASTA whose sequences are shortened, and the
# output FASTA.
if len(args) != 3:
    # Written with sys.stderr.write rather than the Python-2-only
    # ``print >>`` statement, which raises TypeError on Python 3 instead of
    # showing the usage line. The emitted text is unchanged.
    sys.stderr.write(
        'USAGE: %s <FASTA to match> <FASTA to shorten> <output FASTA>\n' % sys.argv[0]
    )
    sys.exit(-1)
fasta_to_match, fasta_to_shorten, fasta_output = args


#
# Read the sequences to match
#
# The same alphabet is used for both input files.
alphabet = corebio.seq.reduced_nucleic_alphabet
sequences_to_match = F.iterseq(bioinfutils.open_input(fasta_to_match), alphabet)
# Target lengths in ascending order, one entry per sequence in the file.
lengths = sorted(len(matched) for matched in sequences_to_match)

#
# Shorten the sequences
#
# Candidate sequences, shortest first, so the longest one is always last.
seqs_to_shorten = list(F.iterseq(bioinfutils.open_input(fasta_to_shorten), alphabet))
seqs_to_shorten.sort(key=len)
# NOTE(review): the output handle is never explicitly closed here; confirm
# whether bioinfutils.open_output can return a shared stream such as stdout
# before adding a close().
output = bioinfutils.open_output(fasta_output)

# Length of the longest candidate; -1 when there are no candidates at all so
# that every requested length (including 0) is reported as unsatisfiable.
longest_available = len(seqs_to_shorten[-1]) if seqs_to_shorten else -1
not_enough_message = 'Did not have %d long enough sequences.' % len(lengths)

# One output sequence is written per requested length. The iterator is shared
# across lengths, so each candidate is considered at most once per pass.
to_shorten = iter(seqs_to_shorten)
for length in lengths:
    # Without this check, --repeat would restart the iterator forever when no
    # candidate can ever satisfy this length. Without --repeat the same
    # ValueError would be raised once the iterator ran dry.
    if length > longest_available:
        raise ValueError(not_enough_message)
    while True:
        try:
            might_shorten = next(to_shorten)
        except StopIteration:
            if not options.repeat:
                raise ValueError(not_enough_message)
            # --repeat: start another pass over the candidates.
            to_shorten = iter(seqs_to_shorten)
            continue
        if len(might_shorten) >= length:
            # presumably bioinfutils.shorten trims the sequence to `length`
            # -- TODO confirm against bioinfutils.
            F.writeseq(output, bioinfutils.shorten(might_shorten, length))
            break

