#!/usr/bin/env python

import optparse, sys, os, re, time, pprint

# Command-line interface.
# NOTE: `p` is the option parser only until parse_args() is called at the end
# of this section; from then on `p` holds the parsed option values.
p = optparse.OptionParser(description = """Part of the MetagenomeDB toolkit.
Imports ACE-formatted mapping between reads and contigs into the database.""")

# Options describing the ACE file and the collections it refers to.
g = optparse.OptionGroup(p, "ACE")

g.add_option("-i", "--input", dest = "input_fn", metavar = "FILENAME",
	help = "ACE file (mandatory).")

g.add_option("-R", "--reads-collection", dest = "reads_collection", metavar = "STRING",
	help = "Name of the collection the reads belong to (mandatory).")

# The placeholder substituted further down in this script is a bare '%'
# (not '%s'), and it is replaced by the name of the read; the help text
# states exactly that so users do not write expressions that cannot compile.
g.add_option("--read-id-getter", dest = "read_id_getter", metavar = "PYTHON CODE", default = "%",
	help = "Python code to reformat read identifiers (optional); '%' will be replaced by the name of the Biopython ACE read. Default: %default")

g.add_option("--ignore-missing-reads", dest = "ignore_missing_reads", action = "store_true", default = False,
	help = "If set, ignore reads that are not found in the reads collection.")

g.add_option("-C", "--contigs-collection", dest = "contigs_collection", metavar = "STRING",
	help = "Name of the collection the contigs belong to (mandatory).")

# Same placeholder convention as --read-id-getter, applied to contig names.
g.add_option("--contig-id-getter", dest = "contig_id_getter", metavar = "PYTHON CODE", default = "%",
	help = "Python code to reformat contigs identifiers (optional); '%' will be replaced by the name of the Biopython ACE contig. Default: %default")

g.add_option("--ignore-missing-contigs", dest = "ignore_missing_contigs", action = "store_true", default = False,
	help = "If set, ignore contigs that are not found in the contigs collection.")

p.add_option_group(g)

# General behavior switches.
p.add_option("-v", "--verbose", dest = "verbose", action = "store_true", default = False)
p.add_option("--dry-run", dest = "dry_run", action = "store_true", default = False)

# Options describing how to reach the MongoDB server.
g = optparse.OptionGroup(p, "Connection")

g.add_option("--host", dest = "connection_host", metavar = "HOSTNAME", default = "localhost",
	help = "Host name or IP address of the MongoDB server (optional). Default: %default")

# type = "int" makes a user-supplied port an integer, like the default value;
# without it optparse would hand over the raw command-line string.
g.add_option("--port", dest = "connection_port", metavar = "INTEGER", type = "int", default = 27017,
	help = "Port of the MongoDB server (optional). Default: %default")

g.add_option("--db", dest = "connection_db", metavar = "STRING", default = "MetagenomeDB",
	help = "Name of the database in the MongoDB server (optional). Default: '%default'")

g.add_option("--user", dest = "connection_user", metavar = "STRING", default = '',
	help = "User for the MongoDB server connection (optional). Default: '%default'")

g.add_option("--password", dest = "connection_password", metavar = "STRING", default = '',
	help = "Password for the MongoDB server connection (optional). Default: '%default'")

p.add_option_group(g)

# Rebinds `p` to the parsed option values; `a` receives the leftover
# positional arguments.
(p, a) = p.parse_args()

def error (msg):
	"""Report a fatal error on standard error and exit with status 1.

	msg -- the message to display; any object is accepted (string,
	       exception, tuple, ...) and is converted with str(). A single
	       trailing period is dropped because one is always appended.

	Never returns: raises SystemExit(1) through sys.exit().
	"""
	# Convert once up front; formatting the raw object with '%' would fail
	# for tuples, which the '%' operator treats as an argument list.
	text = str(msg)
	if text.endswith('.'):
		text = text[:-1]

	sys.stderr.write("ERROR: %s.\n" % text)
	sys.exit(1)

# Sanity checks on the command-line arguments; each failure is fatal.

# The ACE file is mandatory and must exist.
if p.input_fn is None:
	error("An ACE file must be provided")

if not os.path.exists(p.input_fn):
	error("File '%s' not found" % p.input_fn)

# Both collection names are mandatory.
if None in (p.reads_collection, p.contigs_collection):
	error("A collection must be provided for both reads and contigs")

# Compile the identifier getters: every '%' in the user-provided expression
# is replaced by an attribute access on the lambda argument, and the result
# is evaluated into a callable.
# NOTE(review): eval() executes arbitrary Python code taken from the command
# line; only acceptable because the caller already controls this process.
try:
	read_getter_source = "lambda x: " + p.read_id_getter.replace('%', 'x.rd.name')
	get_read_id = eval(read_getter_source)

	contig_getter_source = "lambda x: " + p.contig_id_getter.replace('%', 'x.name')
	get_contig_id = eval(contig_getter_source)

except SyntaxError as e:
	# Second line of the message: a caret pointing at the offending column.
	# The padding presumably accounts for the "ERROR: Invalid getter: "
	# prefix that precedes the expression on the first line.
	caret_padding = ' ' * (e.offset + 22)
	error("Invalid getter: %s\n%s^" % (e.text, caret_padding))

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

import MetagenomeDB as mdb

# Biopython provides the ACE parser. Only a failed import is reported as a
# missing installation; any other exception raised while importing is left
# to propagate so that the real problem stays visible.
try:
	from Bio.Sequencing import Ace
except ImportError:
	error("The Biopython library must be installed\nTry 'easy_install Biopython'")

if (p.verbose):
	mdb.max_verbosity()

# Open the database connection as soon as any connection parameter is set.
# NOTE(review): the host, port and database options have non-empty defaults,
# so this looks like it is true unless they are explicitly emptied - confirm.
if (p.connection_host or p.connection_port or p.connection_db or p.connection_user or p.connection_password):
	try:
		mdb.connect(p.connection_host, p.connection_port, p.connection_db, p.connection_user, p.connection_password)
	except Exception as msg:
		# any connection failure is fatal and reported through error()
		error(msg)

print("Importing '%s' ..." % p.input_fn)
print("  validating read and contig sequences ...")

# Lookup tables: sequence objects indexed by name, and the names that were
# seen more than once in their collection (the value True is just a marker).
Reads, DuplicateReads = {}, {}
Contigs, DuplicateContigs = {}, {}

try:
	# The reads collection is processed first, then the contigs collection.
	for collection_name, sequences_by_name, duplicate_names in (
		(p.reads_collection, Reads, DuplicateReads),
		(p.contigs_collection, Contigs, DuplicateContigs),
	):
		collection = mdb.Collection.find_one({"name": collection_name})
		if collection is None:
			error("Unknown collection '%s'" % collection_name)

		for sequence in collection.list_sequences():
			sequence_name = str(sequence["name"])

			# A name already in the table is flagged as duplicate ...
			if sequence_name in sequences_by_name:
				duplicate_names[sequence_name] = True

			# ... and the table keeps the sequence listed last.
			sequences_by_name[sequence_name] = sequence

except Exception as msg:
	# any failure while querying the database is fatal
	error(msg)

# First pass over the ACE file: check every contig and read against the
# lookup tables before anything is imported, and count in `c` the read-to-
# contig mappings that will be imported (used to size the progress bar).
c = 0

# The 'with' block guarantees the file is closed, including when error()
# terminates the script from inside the loop.
with open(p.input_fn, 'r') as ace_handle:
	for contig in Ace.parse(ace_handle):
		contig_id = get_contig_id(contig)

		# Contigs absent from the contigs collection: skipped with a warning
		# when --ignore-missing-contigs is set, fatal otherwise.
		if (contig_id not in Contigs):
			msg = "Unknown contig '%s'" % contig_id
			if (p.ignore_missing_contigs):
				print >>sys.stderr, "WARNING: " + msg
				continue
			else:
				error(msg)

		# A name shared by several sequences of the collection is ambiguous.
		if (contig_id in DuplicateContigs):
			error("Duplicate contig sequence '%s'" % contig_id)

		for read in contig.reads:
			read_id = get_read_id(read)

			# Same policy as for contigs, driven by --ignore-missing-reads.
			if (read_id not in Reads):
				msg = "Unknown read '%s' (mapped to contig '%s')" % (read_id, contig_id)
				if (p.ignore_missing_reads):
					print >>sys.stderr, "WARNING: " + msg
					continue
				else:
					error(msg)

			if (read_id in DuplicateReads):
				error("Duplicate read sequence '%s'" % read_id)

			c += 1

print("  importing mapping ...")

class ProgressBar:
	"""Single-line textual progress bar written to standard output.

	The bar is made of up to 80 dots followed by a percentage, and is
	redrawn in place: each update ends with a carriage return rather than
	a newline.
	"""

	def __init__ (self, upper = None):
		"""Create a progress bar covering the range 0 to `upper`.

		upper -- value corresponding to 100%; None (the default) is treated
		         as 0, i.e. an empty range.
		"""
		self.__min = 0.0
		if (upper is None):
			self.__max = 0.0
		else:
			self.__max = upper + 0.0

	def display (self, value):
		"""Draw the bar for `value`.

		Values outside of the range are clamped to 0% or 100%, so the bar
		never grows beyond the width erased by clear(). An empty range is
		displayed as complete, since there is nothing left to do.
		"""
		span = self.__max - self.__min
		if (span > 0):
			f = (value - self.__min) / span # fraction
			f = min(max(f, 0.0), 1.0)
		else:
			# avoids a division by zero when there is nothing to process
			f = 1.0

		p = 100 * f # percentage
		s = int(round(80 * f)) # bar size

		sys.stdout.write(' ' * 2 + ('.' * s) + " %4.2f%%\r" % p)
		sys.stdout.flush()

	def clear (self):
		"""Overwrite the bar with spaces and return to the start of the line."""
		# 2 (left margin) + 80 (full bar) + 8 (widest percentage, " 100.00%")
		sys.stdout.write(' ' * (2 + 80 + 8) + "\r")
		sys.stdout.flush()

# Second pass over the ACE file: create the read-to-contig relationships.
# The progress bar is sized with the number of mappings counted during the
# validation pass; `c` is then reset to count the mappings processed here.
pb = ProgressBar(c)
c = 0

# The 'with' block guarantees the file is closed once the import is over,
# or if an exception interrupts it.
with open(p.input_fn, 'r') as ace_handle:
	for contig in Ace.parse(ace_handle):
		contig_id = get_contig_id(contig)

		# unknown contigs were already reported by the validation pass
		if (contig_id not in Contigs):
			continue

		contig_o = Contigs[contig_id]

		for read_idx, read in enumerate(contig.reads):
			read_id = get_read_id(read)

			# unknown reads were already reported by the validation pass
			if (read_id not in Reads):
				continue

			read_o = Reads[read_id]

			# Description of the relationship between the read and the contig.
			# NOTE(review): assumes contig.af and contig.reads are parallel
			# lists, so that af[read_idx] describes this read - confirm
			# against the Biopython ACE parser documentation.
			r = {
				"type": "part-of",
				"position": contig.af[read_idx].padded_start
			}

			if (p.dry_run):
				# only show what would be stored
				print("    read '%s' to contig '%s'" % (read_id, contig_id))
				for line in pprint.pformat(r).split('\n'):
					print("      %s" % line)
			else:
				read_o.relate_to_sequence(contig_o, r)
				read_o.commit()

			c += 1
			pb.display(c)

pb.clear()

print("    done.")

if (p.dry_run):
	print("(dry run)")
