#!/usr/bin/env python

import optparse, sys, os, pprint
import MetagenomeDB as mdb

# Command-line interface: general options first, then the "Filtering" group.
# The "Connection" group is created at the end of this section and populated
# further down.
p = optparse.OptionParser(description = """Part of the MetagenomeDB toolkit.
Export nucleotide or aminoacid sequences from the database. Those sequences
can be in any format supported by Biopython (see http://biopython.org/wiki/SeqIO).""")

p.add_option("-C", "--collection", dest = "collection_name", metavar = "STRING",
	help = "Name of the collection to retrieve the sequences from (mandatory).")

p.add_option("-r", "--recursive", dest = "recursive", action = "store_true", default = False,
	help = """By default only the sequences belonging to the collection provided
are exported. If set, this option will force all sequences belonging to sub-collections
to be exported as well.""")

p.add_option("-o", "--output", dest = "output_fn", metavar = "FILENAME",
	help = "Destination for the sequences (optional). Default: standard output.")

p.add_option("-f", "--format", dest = "output_format", metavar = "STRING", default = "fasta",
	help = """Format of the sequences (optional). Default: %default (see
http://biopython.org/wiki/SeqIO for a list of the formats supported)""")

# The placeholder is a bare '%' (not '%s'): the setter is compiled by replacing
# every '%' with the lambda argument, and a backslash-escaped '\%' is turned
# back into a literal percent sign.
p.add_option("--id-setter", dest = "sequence_id_setter", metavar = "PYTHON CODE", default = "%",
	help = """Python code to reformat sequence identifiers (optional); '%' will
be replaced by a sequence name (use '\\%' for a literal percent sign). Default: %default.""")

p.add_option("-v", "--verbose", dest = "verbose", action = "store_true", default = False)
p.add_option("--no-progress-bar", dest = "display_progress_bar", action = "store_false", default = True)
p.add_option("--dry-run", dest = "dry_run", action = "store_true", default = False)
p.add_option("--version", dest = "display_version", action = "store_true", default = False)

g = optparse.OptionGroup(p, "Filtering")

# nargs = 2 with action = "append": every -p occurrence adds one (key, value) tuple
g.add_option("-p", "--property-filter", dest = "property_filter", action = "append", nargs = 2, metavar = "KEY VALUE",
	help = """Filter the sequences according to a given key and value. If several
filters are declared, only sequences satisfying them all will be returned (optional).""")

g.add_option("-w", "--white-list", dest = "white_list_fn", metavar = "FILENAME",
	help = """Text file to read sequence names from (one name per line). Only
sequences with names found in this file will be returned (optional).""")

g.add_option("-b", "--black-list", dest = "black_list_fn", metavar = "FILENAME",
	help = """Text file to read sequence names from (one name per line). Only
sequences with names not found in this file will be returned (optional).""")

p.add_option_group(g)

# From here on 'g' refers to the connection option group
g = optparse.OptionGroup(p, "Connection")

# Connection settings explicitly given on the command line, keyed by the
# option name without its leading dashes; later expanded as keyword
# arguments to mdb.connect()
connection_parameters = {}

def declare_connection_parameter (option, opt, value, parser):
	"""optparse callback recording a connection option in connection_parameters.

	opt is the option string that triggered the callback (e.g. '--host');
	its first two characters are dropped to obtain the key. value is stored
	as received. option and parser are unused.
	"""
	parameter_name = opt[2:]
	connection_parameters[parameter_name] = value

# The five connection options only differ by name, metavar and help text.
# Each is a string-valued callback option: the value is captured by
# declare_connection_parameter, so only options actually given on the
# command line end up being recorded.
for (_option_name, _option_metavar, _option_help) in (
	("host", "HOSTNAME", """Host name or IP address of the MongoDB server (optional). Default:
'host' property in ~/.MetagenomeDB, or 'localhost' if not found."""),

	("port", "INTEGER", """Port of the MongoDB server (optional). Default: 'port' property
in ~/.MetagenomeDB, or 27017 if not found."""),

	("db", "STRING", """Name of the database in the MongoDB server (optional). Default:
'db' property in ~/.MetagenomeDB, or 'MetagenomeDB' if not found."""),

	("user", "STRING", """User for the MongoDB server connection (optional). Default:
'user' property in ~/.MetagenomeDB, or none if not found."""),

	("password", "STRING", """Password for the MongoDB server connection (optional). Default:
'password' property in ~/.MetagenomeDB, or none if not found."""),
):
	g.add_option("--" + _option_name,
		dest = "connection_" + _option_name,
		metavar = _option_metavar,
		type = "string",
		action = "callback",
		callback = declare_connection_parameter,
		help = _option_help)

p.add_option_group(g)

# NOTE: 'p' is rebound here, from the parser to the parsed option values;
# 'a' receives the leftover positional arguments
(p, a) = p.parse_args()

def error (msg):
	"""Report a fatal error on standard error, then exit with status 1.

	msg may be any object (typically a string or an exception); it is
	converted with str(). A single trailing period is dropped because the
	output template adds its own.
	"""
	text = str(msg)
	text = text[:-1] if text.endswith('.') else text

	sys.stderr.write("ERROR: %s." % text + "\n")
	sys.exit(1)

if (p.display_version):
	print mdb.version
	sys.exit(0)

if (not p.collection_name):
	error("A collection name must be provided")

if (p.white_list_fn) and (not os.path.exists(p.white_list_fn)):
	error("File '%s' not found" % p.white_list_fn)

if (p.black_list_fn) and (not os.path.exists(p.black_list_fn)):
	error("File '%s' not found" % p.black_list_fn)

try:
	set_sequence_id = eval("lambda x: " + p.sequence_id_setter.replace('%', 'x').replace("\\x", '%'))

except SyntaxError as e:
	error("Invalid setter: %s\n%s^" % (e.text, ' ' * (e.offset + 22)))

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

if (p.verbose):
	mdb.max_verbosity()

# Only the connection options given on the command line are forwarded
try:
	mdb.connect(**connection_parameters)
except Exception as msg:
	error(msg)

try:
	collection = mdb.Collection.find_one({"name": p.collection_name})

except Exception as msg:
	error(msg)

if (collection == None):
	error("Unknown collection '%s'" % p.collection_name)

# Destination of the sequences. output_fn is the human-readable name of
# that destination and is now set in every branch.
if (p.output_fn == None):
	output_fh = sys.stdout
	output_fn = "<standard output>"

elif (p.dry_run):
	# A dry run writes nothing, so the destination must not be opened:
	# mode 'w' would create it, or truncate an existing file
	output_fh = None
	output_fn = p.output_fn

else:
	output_fh = open(p.output_fn, 'w')
	output_fn = p.output_fn

# Collections to export, the requested one first
collections = [collection]

if (p.recursive):
	def crawl (c):
		"""Append all descendants of collection c to 'collections', depth-first."""
		for subc in c.list_sub_collections():
			collections.append(subc)
			crawl(subc)

	crawl(collection)

# Property filter handed to list_sequences(): one entry per -p KEY VALUE
# pair (a repeated key keeps its last value), or None if no filter was given
if (p.property_filter):
	try:
		filter = {}
		for (key, value) in p.property_filter:
			filter[key] = mdb.tools.parse_value_and_modifier(value)

	except Exception as msg:
		error(msg)
else:
	filter = None

def read_list (fn):
	"""Read a list of names (one per line) from the text file fn.

	Leading and trailing whitespace is removed from each line, and lines
	that are empty after this are ignored.

	Returns a dictionary with the names as keys (all values are True), to
	be used for membership tests.
	"""
	# The 'with' block guarantees that the file is closed. splitlines()
	# accepts \n, \r\n and \r line endings, so the deprecated 'U' mode
	# flag is not needed.
	with open(fn, 'r') as handle:
		content = handle.read()

	names = {}
	for line in content.splitlines():
		line = line.strip()
		if (line != ''):
			names[line] = True

	return names

whitelist = read_list(p.white_list_fn) if (p.white_list_fn) else None
blacklist = read_list(p.black_list_fn) if (p.black_list_fn) else None

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

try:
	from Bio import SeqIO
	from Bio.Seq import Seq
	from Bio.SeqRecord import SeqRecord
except:
	error("The BioPython library is not installed.\nTry 'easy_install biopython'")

print "Exporting sequences to '%s' (%s format) ..." % (p.output_fn, p.output_format)

class ProgressBar:
	"""Single-line text progress bar drawn on standard output.

	The bar covers the range 0..upper. Each call to display() redraws the
	line in place (the output ends with a carriage return, not a newline).
	"""

	def __init__ (self, upper = None):
		"""upper is the value corresponding to 100%. None (the default) is
		treated as 0, i.e. an empty range."""
		self.__min = 0.0
		# 'None + 0.0' used to raise a TypeError with the default argument
		self.__max = 0.0 if (upper is None) else (upper + 0.0)

	def display (self, value):
		"""Draw the bar for the given value: up to 80 dots followed by
		the percentage."""
		span = self.__max - self.__min

		if (span > 0):
			f = (value - self.__min) / span # fraction
			# Keep the bar within the width erased by clear()
			f = min(max(f, 0.0), 1.0)
		else:
			# Empty range: nothing is left to do (and no division by zero)
			f = 1.0

		p = 100 * f # percentage
		s = int(round(80 * f)) # bar size

		sys.stdout.write(' ' * 2 + ('.' * s) + " %4.2f%%\r" % p)
		sys.stdout.flush()

	def clear (self):
		"""Overwrite the bar with spaces (margin + 80 dots + percentage)."""
		sys.stdout.write(' ' * (2 + 80 + 8) + "\r")
		sys.stdout.flush()

n_sequences_total = 0

try:
	for collection in collections:
		n_sequences = collection.count_sequences()

		print "  from %scollection '%s' (%s sequence%s before filtering)" % (
			{True: '', False: "sub"}[n_sequences_total == 0],
			collection["name"],
			n_sequences,
			{True: 's', False: ''}[n_sequences > 1]
		)

		pb = ProgressBar(n_sequences)
		n_sequences_exported = 0

		def sequences():
			global n_sequences_exported

			for sequence in collection.list_sequences(filter):
				name = set_sequence_id(sequence["name"])

				if (whitelist != None) and (name not in whitelist):
					continue

				if (blacklist != None) and (name in blacklist):
					continue

				record = SeqRecord(
					id = name,
					seq = Seq(sequence["sequence"]),
					description = sequence.get_property("description", ''),
				)

				yield record

				n_sequences_exported += 1
				if (p.display_progress_bar):
					pb.display(n_sequences_exported)

			if (p.display_progress_bar):
				pb.clear()

		if (p.dry_run):
			[sequence for sequence in sequences()]
		else:
			SeqIO.write(sequences(), output_fh, p.output_format)

		print "    %s sequence%s exported." % (n_sequences_exported, {True: 's', False: ''}[n_sequences_exported > 1])
		n_sequences_total += n_sequences_exported

	print "Done (%s sequence%s exported total)." % (n_sequences_total, {True: 's', False: ''}[n_sequences_total > 1])

	if (p.dry_run):
		print "(dry run)"

except Exception as msg:
	error(msg)
