#!/usr/bin/env python

import optparse, sys, os, pprint
import MetagenomeDB as mdb

# Command-line interface. The parser is built first, then one option group
# per theme is declared and attached to it.
p = optparse.OptionParser(
	description = (
		"Part of the MetagenomeDB toolkit.\n"
		"Imports nucleotide or aminoacid sequences into the database. Those sequences\n"
		"can be in any format supported by Biopython (see http://biopython.org/wiki/SeqIO)."
	)
)

# Settings shared by every repeatable 'KEY VALUE' option: each occurrence
# appends one (key, value) pair to the destination list.
_key_value = dict(nargs = 2, action = "append", metavar = "KEY VALUE")

# -- Input: what to import, and how to annotate it
g = optparse.OptionGroup(p, "Input")

g.add_option(
	"-i", "--input",
	dest = "input_fn",
	metavar = "FILENAME",
	help = "Sequences to import (mandatory).",
)

g.add_option(
	"-f", "--format",
	dest = "input_format",
	metavar = "STRING",
	default = "fasta",
	help = "Format of the sequences file (optional). Default: %default",
)

g.add_option(
	"-s", "--sequence-property",
	dest = "sequence_properties",
	help = (
		"Property to annotate all sequences with (optional); this option\n"
		"can be used multiple times."
	),
	**_key_value
)

g.add_option(
	"-C", "--collection",
	dest = "collection_name",
	metavar = "STRING",
	help = "Name of the collection the sequences belong to (mandatory).",
)

g.add_option(
	"-c", "--collection-property",
	dest = "collection_properties",
	help = (
		"Property to annotate the collection with (optional); this option\n"
		"can be used multiple times."
	),
	**_key_value
)

g.add_option(
	"-r", "--relationship-property",
	dest = "relationship_properties",
	help = (
		"Property to annotate all relationships between sequences and the\n"
		"collection (optional); this option can be used multiple times."
	),
	**_key_value
)

p.add_option_group(g)

# -- Errors handling
g = optparse.OptionGroup(p, "Errors handling")

g.add_option(
	"--ignore-duplicates",
	dest = "ignore_duplicates",
	action = "store_true",
	default = False,
	help = "If set, ignore duplicate objects errors.",
)

p.add_option_group(g)

# -- Connection: the group is created here; its options are declared below
g = optparse.OptionGroup(p, "Connection")

# Connection settings explicitly provided on the command line, keyed by the
# option string minus its first two characters (e.g. 'host' for '--host').
connection_parameters = {}

def declare_connection_parameter (option, opt, value, parser):
	"""optparse callback recording one connection setting.

	Stores `value` in the module-level `connection_parameters` dictionary
	under the option string `opt` stripped of its first two characters (the
	leading '--'). The `option` and `parser` arguments are part of the
	optparse callback signature and are not used.
	"""
	name = opt[2:]
	connection_parameters.update({name: value})

# The five connection options differ only by their name, metavar and help
# text; each of them hands its value over to declare_connection_parameter.
for (_name, _metavar, _help) in (
	("host", "HOSTNAME",
		"Host name or IP address of the MongoDB server (optional). Default:\n"
		"'host' property in ~/.MetagenomeDB, or 'localhost' if not found."),
	("port", "INTEGER",
		"Port of the MongoDB server (optional). Default: 'port' property\n"
		"in ~/.MetagenomeDB, or 27017 if not found."),
	("db", "STRING",
		"Name of the database in the MongoDB server (optional). Default:\n"
		"'db' property in ~/.MetagenomeDB, or 'MetagenomeDB' if not found."),
	("user", "STRING",
		"User for the MongoDB server connection (optional). Default:\n"
		"'user' property in ~/.MetagenomeDB, or none if not found."),
	("password", "STRING",
		"Password for the MongoDB server connection (optional). Default:\n"
		"'password' property in ~/.MetagenomeDB, or none if not found."),
):
	g.add_option(
		"--" + _name,
		dest = "connection_" + _name,
		metavar = _metavar,
		type = "string",
		action = "callback",
		callback = declare_connection_parameter,
		help = _help,
	)

p.add_option_group(g)

# -- Other options
g = optparse.OptionGroup(p, "Other options")

g.add_option(
	"-v", "--verbose",
	dest = "verbose",
	action = "store_true",
	default = False,
	help = "Increase the verbosity of MetagenomeDB.",
)

g.add_option(
	"--no-progress-bar",
	dest = "display_progress_bar",
	action = "store_false",
	default = True,
	help = "Hide the progress bar.",
)

g.add_option(
	"--dry-run",
	dest = "dry_run",
	action = "store_true",
	default = False,
	help = "Display current operation on screen rather than performing it.",
)

g.add_option(
	"--version",
	dest = "display_version",
	action = "store_true",
	default = False,
	help = "Display the version of the MetagenomeDB toolkit.",
)

p.add_option_group(g)

# From here on 'p' no longer designates the parser but the parsed option
# values; 'a' receives the leftover positional arguments.
p, a = p.parse_args()

def error (msg):
	"""Report a fatal error on the standard error stream and exit.

	`msg` may be any object (typically a string or an exception); it is
	converted with str(). A single trailing period is dropped so that the
	emitted line, formatted as 'ERROR: <msg>.', never ends with two periods.

	This function does not return: it terminates the program through
	sys.exit(1), i.e. by raising SystemExit.
	"""
	msg = str(msg)
	if msg.endswith('.'):
		msg = msg[:-1]

	# sys.stderr.write() is used instead of the 'print >>stream' statement:
	# the latter only exists in Python 2 (under Python 3 it is evaluated as
	# a shift expression and raises TypeError), whereas write() produces the
	# same output under both.
	sys.stderr.write("ERROR: %s.\n" % msg)
	sys.exit(1)

# --version short-circuits everything else, including argument validation.
if p.display_version:
	# single-argument print(...) behaves identically under Python 2 and 3
	print(mdb.version)
	sys.exit(0)

# The input file is mandatory and must exist.
if p.input_fn is None:
	error("A sequence file must be provided")

if not os.path.exists(p.input_fn):
	error("File '%s' not found" % p.input_fn)

# The target collection must be designated either by its name (-C) or by a
# set of properties describing a collection to create (-c).
if (not p.collection_name) and (not p.collection_properties):
	error("A collection name or description must be provided")

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

if p.verbose:
	mdb.max_verbosity()

# Only the connection settings explicitly given on the command line are
# forwarded to mdb.connect().
try:
	mdb.connect(**connection_parameters)
except Exception as msg:
	error(msg)

# Creation of a collection
if p.collection_properties:
	m = {}
	for (key, value) in p.collection_properties:
		m[key] = mdb.tools.parse_value_and_modifier(value)

	if "_id" in m:
		sys.stderr.write("WARNING: A '_id' field was found in the collection description and ignored.\n")
		del m["_id"]

	if ("name" in m) and (mdb.Collection.find_one({"name": m["name"]}) is not None):
		error("Duplicate collection '%s'" % m["name"])

	try:
		collection = mdb.Collection(m)

		# A dry run must leave the database untouched: the collection object
		# is still built (so that a malformed description gets reported),
		# but it is not committed.
		if not p.dry_run:
			collection.commit()

	except ValueError as msg:
		error("Malformed collection description: %s" % msg)

# Retrieval of an existing collection
elif p.collection_name:
	collection = mdb.Collection.find_one({"name": p.collection_name})

	if collection is None:
		error("Unknown collection '%s'" % p.collection_name)

# Relationship properties: every value is passed through
# mdb.tools.parse_value_and_modifier(), and the result is always a
# dictionary (empty when the option was not used).
p.relationship_properties = dict(
	(key, mdb.tools.parse_value_and_modifier(value))
	for (key, value) in (p.relationship_properties or ())
)

# Sequence properties: kept as a list of (key, parsed value) pairs; the
# option is left untouched when it was not used.
if p.sequence_properties:
	p.sequence_properties = [
		(key, mdb.tools.parse_value_and_modifier(value))
		for (key, value) in p.sequence_properties
	]

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

# Biopython is imported late so that a missing installation is reported
# through error() rather than as a traceback. Only ImportError is caught: a
# bare 'except' would also intercept SystemExit and KeyboardInterrupt.
try:
	from Bio import SeqIO
except ImportError:
	error("The BioPython library is not installed.\nTry 'easy_install biopython'")

def read():
	"""Return a fresh Biopython parser over the input file.

	Calls SeqIO.parse() with the file name and format received on the
	command line. A ValueError raised by that call is reported through
	error(), which terminates the program.
	"""
	try:
		return SeqIO.parse(p.input_fn, p.input_format)
	except ValueError as msg:
		error(msg)

print("Importing '%s' (%s format) ..." % (p.input_fn, p.input_format))

print("  checking the input file ...")

# First pass over the input file: make sure every sequence identifier is
# unique, and count the sequences.
seen = set()
for record in read():
	if record.id in seen:
		error("Duplicate sequence '%s'" % record.id)

	seen.add(record.id)

n = len(seen)

if not n:
	error("The input file contains no sequence")

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

print("  importing sequences ...")

class ProgressBar:
	"""Minimal textual progress bar drawn on one line of the standard output.

	The bar covers the range [0, upper]. Each call to display() rewrites the
	current line (the output ends with a carriage return, not a newline)
	with two spaces, up to 80 dots and the percentage completed.
	"""

	def __init__ (self, upper = None):
		"""Create a progress bar for values ranging from 0 to `upper`.

		`upper` is a number; when it is None (unknown) the range is
		considered empty, as with an upper bound of zero.
		"""
		self.__min = 0.0
		self.__max = 0.0 if (upper is None) else (upper + 0.0)

	def display (self, value):
		"""Draw the bar for `value`, a number expected within [0, upper].

		Values outside of the range are clamped so that the bar never
		exceeds its 80 columns; an empty range is displayed as 0%.
		"""
		span = self.__max - self.__min

		if span > 0:
			f = (value - self.__min) / span # fraction
			f = min(max(f, 0.0), 1.0)
		else:
			# empty or unknown range: no meaningful fraction (and a division
			# by zero to avoid)
			f = 0.0

		percent = 100 * f # percentage
		s = int(round(80 * f)) # bar size

		sys.stdout.write(' ' * 2 + ('.' * s) + " %4.2f%%\r" % percent)
		sys.stdout.flush()

	def clear (self):
		"""Erase the bar by overwriting its line with spaces."""
		# 2 leading spaces + 80 dots at most + 8 columns for the percentage
		sys.stdout.write(' ' * (2 + 80 + 8) + "\r")
		sys.stdout.flush()

# Second pass over the input file: the sequences are turned into database
# objects and committed. 'n' (so far the number of sequences found in the
# file) sets the upper bound of the progress bar, then is reused to count
# the sequences actually imported.
pb = ProgressBar(n)
n = 0
processed = 0 # records handled so far, imported or not; drives the progress bar

for record in read():
	entry = {
		"name": record.id,
		"sequence": str(record.seq),
		"length": len(record.seq),
	}

	if hasattr(record, "description"):
		entry["description"] = record.description

	# see http://en.wikipedia.org/wiki/FASTQ_format#Variations
	# for an explanation of the different quality scales
	if "phred_quality" in record.letter_annotations:
		entry["quality"] = {
			"values": record.letter_annotations["phred_quality"],
			"scale": "PHRED"
		}

	elif "solexa_quality" in record.letter_annotations:
		entry["quality"] = {
			"values": record.letter_annotations["solexa_quality"],
			"scale": "Solexa"
		}

	# User-provided properties must not overwrite the fields set above
	if p.sequence_properties:
		for (key, value) in p.sequence_properties:
			if key in entry:
				error("Reserved field '%s'" % key)

			entry[key] = value

	try:
		sequence = mdb.Sequence(entry)

		# Dry run: show what would be stored, and store nothing
		if p.dry_run:
			print(pprint.pformat(sequence.get_properties()))
			continue

		sequence.add_to_collection(collection, p.relationship_properties)
		sequence.commit()

		# Counted only once committed, so that duplicates skipped with
		# --ignore-duplicates are not reported as imported
		n += 1

	except (mdb.errors.DBConnectionError, mdb.errors.DBOperationError) as msg:
		error(msg)

	except mdb.errors.DuplicateObjectError as msg:
		if p.ignore_duplicates:
			sys.stderr.write("WARNING: %s\n" % str(msg))
		else:
			error(msg)

	processed += 1
	if p.display_progress_bar:
		pb.display(processed)

if p.display_progress_bar:
	pb.clear()

print("  %s sequence%s imported." % (n, "s" if (n > 1) else ""))

if p.dry_run:
	print("(dry run)")
