#!/usr/bin/env python
#
# Copyright John Reid 2012
#

"""
Reads sequences from a FASTA file and shortens each sequence to match the lengths of sequences in another FASTA file.
"""


import sys, corebio.seq_io.fasta_io as F, corebio.seq
import bioinfutils


#
# Check args
#
# Exactly three positional arguments are expected after the script name;
# otherwise emit the usage line on stderr and call sys.exit(-1).
if len(sys.argv) != 4:
    usage = 'USAGE: %s <FASTA to match> <FASTA to shorten> <output FASTA>' % sys.argv[0]
    print >>sys.stderr, usage
    sys.exit(-1)
# The guard above guarantees exactly three elements remain after argv[0].
fasta_to_match, fasta_to_shorten, fasta_output = sys.argv[1:]


#
# Read the sequences to match
#
# Alphabet handed to the FASTA reader for both input files.
alphabet = corebio.seq.reduced_nucleic_alphabet
# Collect the length of every sequence in the file to match, in ascending order.
lengths = sorted(
    len(record)
    for record in F.iterseq(bioinfutils.open_input(fasta_to_match), alphabet)
)

#
# Shorten the sequences
#
# Load every candidate sequence and order them shortest-first, so that zipping
# with the ascending target lengths pairs the i-th shortest candidate with the
# i-th smallest target length.
seqs_to_shorten = list(F.iterseq(bioinfutils.open_input(fasta_to_shorten), alphabet))
seqs_to_shorten.sort(key=len)

# There must be at least one candidate sequence per target length.
if len(lengths) > len(seqs_to_shorten):
    raise ValueError(
        'Want to match lengths of %d sequences in %s but only have %d sequences in %s' % (
            len(lengths), fasta_to_match, len(seqs_to_shorten), fasta_to_shorten
        )
    )

# NOTE(review): the output handle is never explicitly closed; whether closing
# is safe depends on what bioinfutils.open_output returns - confirm.
output = bioinfutils.open_output(fasta_output)

# zip() stops at the shorter list, so any surplus (longest) candidates are
# silently dropped.
for length, to_shorten in zip(lengths, seqs_to_shorten):
    # Shorten the sequence paired with this length. The previous code passed
    # the stale variable `seq` (leaked from a list comprehension), so every
    # record written was derived from the same, last-read sequence.
    # Presumably bioinfutils.shorten returns the sequence trimmed to `length`
    # - verify against bioinfutils.
    F.writeseq(output, bioinfutils.shorten(to_shorten, length))

