#!/usr/bin/env python

import optparse, sys, os, pprint

# Command-line interface. Options are declared as (flags, settings) tables and
# registered group by group, in the order they are listed below.
p = optparse.OptionParser(description = """Part of the MetagenomeDB toolkit.
Imports nucleotide or aminoacid sequences into the database. Those sequences
can be in any format supported by Biopython (see http://biopython.org/wiki/SeqIO).""")

# Options describing the sequences to import
g = optparse.OptionGroup(p, "Sequences")

for (flags, settings) in (
	(("-i", "--input"), dict(
		dest = "input_fn", metavar = "FILENAME",
		help = "Sequences to import (mandatory).")),

	(("-f", "--format"), dict(
		dest = "input_format", metavar = "STRING", default = "fasta",
		help = "Format of the sequences file (optional). Default: %default")),

	# May be repeated; each occurrence contributes one (KEY, VALUE) pair
	(("-s", "--sequence-property"), dict(
		dest = "sequence_properties", nargs = 2, action = "append", metavar = "KEY VALUE",
		help = "Sequence property (optional).")),
):
	g.add_option(*flags, **settings)

p.add_option_group(g)

# Options describing the collection receiving the sequences
g = optparse.OptionGroup(p, "Collection")

for (flags, settings) in (
	(("-C", "--collection-name"), dict(
		dest = "collection_name", metavar = "STRING",
		help = "Name of the collection the sequences belong to (mandatory).")),

	# May be repeated; each occurrence contributes one (KEY, VALUE) pair
	(("-c", "--collection-property"), dict(
		dest = "collection_properties", nargs = 2, action = "append", metavar = "KEY VALUE",
		help = "Collection property (optional).")),

	# May be repeated; each occurrence contributes one (KEY, VALUE) pair
	(("-r", "--relationship-property"), dict(
		dest = "relationship_properties", nargs = 2, action = "append", metavar = "KEY VALUE",
		help = "Sequence to collection relationship property (optional).")),
):
	g.add_option(*flags, **settings)

p.add_option_group(g)

# Boolean switches, attached directly to the parser (no group, no help text)
for (flags, dest) in (
	(("-v", "--verbose"), "verbose"),
	(("--dry-run",), "dry_run"),
):
	p.add_option(*flags, dest = dest, action = "store_true", default = False)

# Options describing how to reach the MongoDB server
g = optparse.OptionGroup(p, "Connection")

g.add_option("--host", dest = "connection_host", metavar = "HOSTNAME", default = "localhost",
	help = "Host name or IP address of the MongoDB server (optional). Default: %default")

# The port is declared with type "int": without it optparse stores a value
# given on the command line as a string, whereas the default is an integer.
# A non-numeric port is now rejected by the parser itself.
g.add_option("--port", dest = "connection_port", metavar = "INTEGER", type = "int", default = 27017,
	help = "Port of the MongoDB server (optional). Default: %default")

g.add_option("--db", dest = "connection_db", metavar = "STRING", default = "MetagenomeDB",
	help = "Name of the database in the MongoDB server (optional). Default: '%default'")

g.add_option("--user", dest = "connection_user", metavar = "STRING", default = '',
	help = "User for the MongoDB server connection (optional). Default: '%default'")

g.add_option("--password", dest = "connection_password", metavar = "STRING", default = '',
	help = "Password for the MongoDB server connection (optional). Default: '%default'")

p.add_option_group(g)

# From this point on 'p' no longer refers to the parser but to the parsed
# option values; 'a' receives the remaining positional arguments.
(p, a) = p.parse_args()

def error (msg):
	"""Report an error on the standard error stream, then exit.

	msg -- the message to report. Any object is accepted (an exception, for
	       example); it is converted with str() before being formatted.

	The message is written as "ERROR: <message>." followed by a newline. A
	trailing period in the message is dropped first, so the output never
	ends with two periods.

	This function does not return: it calls sys.exit(1).
	"""
	# Convert once, up front, so that the formatting below always receives a
	# string (a tuple passed to '%' would be taken as a list of arguments)
	text = str(msg)
	if text.endswith('.'):
		text = text[:-1]

	# Same output as "print >>sys.stderr, ...", without relying on the
	# Python 2-only print statement
	sys.stderr.write("ERROR: %s.\n" % text)
	sys.exit(1)

# Sanity checks on the mandatory options. error() terminates the script, so
# at most one of these messages is ever reported.
if (p.input_fn is None):
	error("A sequence file must be provided")

elif (not os.path.exists(p.input_fn)):
	error("File '%s' not found" % p.input_fn)

# The collection is designated either by its name or by a set of properties
if not (p.collection_name or p.collection_properties):
	error("A collection name or description must be provided")

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

import MetagenomeDB as mdb

# Switch the MetagenomeDB library to its maximum verbosity on request
if (p.verbose):
	mdb.max_verbosity()

# Connection parameters, in the order mdb.connect() receives them
connection_settings = (
	p.connection_host,
	p.connection_port,
	p.connection_db,
	p.connection_user,
	p.connection_password,
)

# Connect explicitly as soon as at least one parameter is set; any failure
# is reported through error(), which exits the script
if any(connection_settings):
	try:
		mdb.connect(*connection_settings)
	except Exception as msg:
		error(msg)

# Creation of a collection, described by the --collection-property options
if (p.collection_properties):
	m = {}
	for (key, value) in p.collection_properties:
		m[key] = mdb.tools.parse_value_and_modifier(value)

	# A user-provided '_id' is discarded, with a warning
	if ("_id" in m):
		sys.stderr.write("WARNING: A '_id' field was found in the collection description and ignored.\n")
		del m["_id"]

	# Refuse to create a second collection with an already used name
	if ("name" in m) and (mdb.Collection.find_one({"name": m["name"]}) is not None):
		error("Duplicate collection '%s'" % m["name"])

	try:
		collection = mdb.Collection(m)

		# A dry run must leave the database untouched: the collection object
		# is still built (so that a malformed description is reported), but
		# it is not committed. Committing it here would also make a later,
		# real run fail on the duplicate check above.
		if (not p.dry_run):
			collection.commit()

	except ValueError as msg:
		error("Malformed collection description: %s" % msg)

# Retrieval of an existing collection, designated by its name
elif (p.collection_name):
	collection = mdb.Collection.find_one({"name": p.collection_name})

	if (collection is None):
		error("Unknown collection '%s'" % p.collection_name)

# Relationship properties: the (KEY, VALUE) pairs collected by optparse are
# turned into a dictionary of parsed values; no pair at all gives an empty
# dictionary.
p.relationship_properties = dict(
	(key, mdb.tools.parse_value_and_modifier(value))
	for (key, value) in (p.relationship_properties or [])
)

# Sequence properties: kept as a list of (key, parsed value) pairs, and left
# untouched when none was provided.
if (p.sequence_properties):
	p.sequence_properties = [
		(key, mdb.tools.parse_value_and_modifier(value))
		for (key, value) in p.sequence_properties
	]

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

# Biopython is imported only now, after the options have been validated.
# Only a failed import is turned into the installation hint below: a bare
# 'except' would also intercept SystemExit and KeyboardInterrupt, and would
# hide any unrelated error behind a misleading message.
try:
	from Bio import SeqIO
except ImportError:
	error("The BioPython library is not installed.\nTry 'easy_install biopython'")

def read():
	"""Iterate over the records of the input file.

	The file named by the --input option is opened and handed to
	SeqIO.parse() together with the format given by --format; the records
	it produces are yielded one by one.

	The file is closed once the iteration ends, whether it runs to
	completion, is abandoned by the caller or fails.

	A ValueError raised while parsing is reported through error(), which
	exits the script.
	"""
	handle = open(p.input_fn, 'rU')

	try:
		# The iteration itself is kept inside the 'try': SeqIO.parse() hands
		# back an iterator, so a ValueError may presumably surface only once
		# records are requested rather than at the call -- TODO confirm
		# against the installed Biopython version.
		for record in SeqIO.parse(handle, p.input_format):
			yield record

	except ValueError as msg:
		error(msg)

	finally:
		handle.close()

# Announce which file is about to be imported, and in which format
sys.stdout.write("Importing '%s' (%s) ...\n" % (p.input_fn, p.input_format))

class ProgressBar:
	"""Single-line textual progress bar written to the standard output.

	The bar is drawn as two spaces, a run of dots (80 at most) and a
	percentage. Each drawing ends with a carriage return instead of a
	newline, so that the next one overwrites it.
	"""

	def __init__ (self, upper = None):
		"""Create a progress bar for values going from 0 to 'upper'.

		upper -- value standing for 100%. None (the default) and 0 both
		         describe an empty range, for which the bar is always full.
		"""
		self.__min = 0.0
		# None cannot take part in arithmetic; it is treated as 0
		self.__max = float(upper or 0)

	def display (self, value):
		"""Draw the bar for the given value.

		value -- current position. It is clamped to the range of the bar,
		         so that the drawing never exceeds the width clear() erases.
		"""
		span = self.__max - self.__min

		if (span > 0):
			fraction = (value - self.__min) / span
			fraction = min(max(fraction, 0.0), 1.0)
		else:
			# Empty range: nothing to wait for, the bar is complete
			fraction = 1.0

		percentage = 100 * fraction
		size = int(round(80 * fraction)) # number of dots

		sys.stdout.write(' ' * 2 + ('.' * size) + " %4.2f%%\r" % percentage)
		sys.stdout.flush()

	def clear (self):
		"""Erase the bar by overwriting its line with spaces."""
		# 2 leading spaces + 80 dots + 8 characters for the percentage
		sys.stdout.write(' ' * (2 + 80 + 8) + "\r")
		sys.stdout.flush()

# First pass over the input: make sure no sequence identifier is used twice,
# and count the sequences so that the progress bar knows its upper bound
seen = set()
for record in read():
	identifier = record.id

	if (identifier in seen):
		error("Duplicate sequence '%s'" % identifier)

	seen.add(identifier)

N = len(seen) # number of sequences in the input
n = 0 # number of sequences imported so far
pb = ProgressBar(N)

# Second pass over the input: one Sequence object is built per record, then
# either printed (dry run) or attached to the collection and committed
for record in read():
	entry = dict(
		name = record.id,
		sequence = str(record.seq),
		length = len(record.seq),
	)

	if (hasattr(record, "description")):
		entry["description"] = record.description

	# User-provided properties must not overwrite a field that is already set
	for (key, value) in (p.sequence_properties or ()):
		if (key in entry):
			error("Reserved field '%s'" % key)

		entry[key] = value

	sequence = mdb.Sequence(entry)

	if (not p.dry_run):
		sequence.add_to_collection(collection, p.relationship_properties)
		sequence.commit()

		n += 1
		pb.display(n)
	else:
		# Dry run: show the properties of the sequence and move on
		sys.stdout.write(pprint.pformat(sequence.get_properties()) + "\n")

pb.clear()

sys.stdout.write("  %s sequences imported.\n" % n)
