#!/usr/bin/env python

import optparse, sys, os, re, shlex, datetime, pprint
import MetagenomeDB as mdb

# Command-line interface. 'p' is the option parser and 'g' is the option group
# currently being populated.
p = optparse.OptionParser(
	description = """Part of the MetagenomeDB toolkit.
Imports sequences deduplication information generated by CD-HIT-454 into the database."""
)

# Options describing the CD-HIT-454 run to import
g = optparse.OptionGroup(p, "Input")

g.add_option(
	"-i", "--input",
	dest = "input_clstr_fn",
	metavar = "FILENAME",
	help = """File with the clusters assigned by CD-HIT-454 (mandatory). Do not
use the file with a '.clstr' extension but the one with '.bak.clstr'."""
)

g.add_option(
	"-l", "--input-log",
	dest = "input_log_fn",
	metavar = "FILENAME",
	help = "File with the log from the CD-HIT-454 run (mandatory)."
)

g.add_option(
	"-C", "--collection",
	dest = "sequences_collection_name",
	metavar = "STRING",
	help = """Name of the collection that contains the sequences that have been
processed by CD-HIT-454 (mandatory)."""
)

g.add_option(
	"--id-getter",
	dest = "id_getter",
	metavar = "PYTHON CODE",
	default = "%",
	help = """Python code to reformat sequence identifers (optional); '%' will
be replaced by the sequence identifier. Default: %default"""
)

p.add_option_group(g)

# General switches, declared as (option strings, destination, action, default)
for switch_names, switch_dest, switch_action, switch_default in (
	(("-v", "--verbose"), "verbose", "store_true", False),
	(("--no-progress-bar",), "display_progress_bar", "store_false", True),
	(("--dry-run",), "dry_run", "store_true", False),
	(("--version",), "display_version", "store_true", False),
):
	p.add_option(*switch_names, dest = switch_dest, action = switch_action, default = switch_default)

# Group that receives the database connection options
g = optparse.OptionGroup(p, "Connection")

# Connection settings given on the command line, keyed by bare option name
# (filled by the declare_connection_parameter callback)
connection_parameters = {}
def declare_connection_parameter (option, opt, value, parser):
	"""optparse callback storing a connection option in connection_parameters.

	opt -- option string the callback is invoked with (e.g. '--host'); its
	       first two characters are dropped to obtain the dictionary key.
	value -- value given for this option.

	The 'option' and 'parser' arguments are not used.
	"""
	parameter_name = opt[len("--"):]
	connection_parameters.update({parameter_name: value})

# Connection options. They all share the same shape: a string-valued callback
# option '--<name>' (destination 'connection_<name>') whose value is handed to
# declare_connection_parameter. Declared as (name, metavar, help text).
# NOTE(review): --port is parsed as a string although its metavar says
# INTEGER; confirm that mdb.connect() accepts a string port.
for connection_name, connection_metavar, connection_help in (
	("host", "HOSTNAME", """Host name or IP address of the MongoDB server (optional). Default:
'host' property in ~/.MetagenomeDB, or 'localhost' if not found."""),

	("port", "INTEGER", """Port of the MongoDB server (optional). Default: 'port' property
in ~/.MetagenomeDB, or 27017 if not found."""),

	("db", "STRING", """Name of the database in the MongoDB server (optional). Default:
'db' property in ~/.MetagenomeDB, or 'MetagenomeDB' if not found."""),

	("user", "STRING", """User for the MongoDB server connection (optional). Default:
'user' property in ~/.MetagenomeDB, or none if not found."""),

	("password", "STRING", """Password for the MongoDB server connection (optional). Default:
'password' property in ~/.MetagenomeDB, or none if not found."""),
):
	g.add_option(
		"--" + connection_name,
		dest = "connection_" + connection_name,
		metavar = connection_metavar,
		type = "string",
		action = "callback",
		callback = declare_connection_parameter,
		help = connection_help
	)

p.add_option_group(g)

# From this point on 'p' is rebound to the parsed option values (it no longer
# refers to the parser) and 'a' holds the positional arguments.
(p, a) = p.parse_args()

def error (msg):
	"""Report a fatal error on the standard error stream and stop the script.

	msg -- message to display; any object is accepted and converted with
	       str(). One trailing period, if present, is dropped so that the
	       period appended here is not doubled.

	The output has the form 'ERROR: <msg>.' followed by a newline. This
	function never returns: it calls sys.exit(1), i.e. raises SystemExit.
	"""
	msg = str(msg)
	if msg.endswith('.'):
		msg = msg[:-1]

	# written through sys.stderr.write() rather than 'print >>sys.stderr': the
	# chevron form only exists in Python 2; under Python 3 it is evaluated as
	# an expression and raises TypeError before sys.exit() is reached, so both
	# the message and the exit status would be lost
	sys.stderr.write("ERROR: %s.\n" % msg)
	sys.exit(1)

if (p.display_version):
	print mdb.version
	sys.exit(0)

if (p.input_clstr_fn == None):
	error("A CD-HIT-454 output file must be provided")

if (not os.path.exists(p.input_clstr_fn)):
	error("File '%s' not found" % p.input_clstr_fn)

if (p.input_log_fn == None):
	error("A CD-HIT-454 log file must be provided")

if (not os.path.exists(p.input_log_fn)):
	error("File '%s' not found" % p.input_log_fn)

if (not p.sequences_collection_name):
	error("A collection name must be provided")

try:
	get_sequence_id = eval("lambda x: " + p.id_getter.replace('%', 'x').replace("\\x", '%'))

except SyntaxError as e:
	error("Invalid getter: %s\n%s^" % (e.text, ' ' * (e.offset + 22)))

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

if p.verbose:
	mdb.max_verbosity()

# Connection to the database, using the parameters given on the command line
# (if any); any failure is reported as a fatal error
try:
	mdb.connect(**connection_parameters)
except Exception as msg:
	error(msg)

# Retrieval of the collection holding the sequences; only the database lookup
# is kept inside the try block
try:
	collection = mdb.Collection.find_one({"name": p.sequences_collection_name})
except mdb.errors.DBConnectionError as msg:
	error(msg)

# identity test: '== None' would go through any __eq__ defined by the object
if collection is None:
	error("Unknown collection '%s'" % p.sequences_collection_name)

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

def readlines (fn):
	"""Iterate over the non-blank lines of a text file.

	fn -- path of the file to read.

	Yields each line with leading and trailing whitespace removed; lines that
	are empty once stripped are skipped. This is a generator: the file is
	opened when the first line is requested and closed as soon as the
	generator is exhausted, closed or garbage-collected.
	"""
	# universal newlines must be requested explicitly with 'U' in Python 2;
	# they are the default in Python 3, where the 'U' flag was removed (3.11)
	mode = 'rU' if (sys.version_info[0] < 3) else 'r'

	with open(fn, mode) as fh:
		for line in fh:
			line = line.strip()
			if line:
				yield line

print "Extracting metainformation from '%s' ..." % p.input_log_fn

is_header = False
header = []
for line in readlines(p.input_log_fn):
	if (line.startswith('=')):
		if (is_header == False):
			is_header = True
			continue
		else:
			break

	header.append(line.strip())

if (is_header == False) or (len(header) == 0):
	error("Invalid log file: no header found")

template = re.compile("Program: CD-HIT, V([0-9\.]+).*?Command: .*?cd-hit-454\s(.*?) Started: ([a-zA-Z0-9 :]+)")
match = template.match(' '.join(header))

if (match == None):
	error("Invalid log file: malformed header")

# CD-HIT version number
cd_hit_version = match.group(1)

# CD-HIT command line options. The command line found in the log is parsed
# again with a dedicated parser, declared as (name, type, default value).
parser = optparse.OptionParser()

options = (
	("i", "string", None),
	("o", "string", None),
	("c", "float", 0.98),
	("b", "int", 10),
	("M", "int", None),
	("T", "int", None),
	("n", "int", 10),
	("aL", "float", 0.0),
	("AL", "int", 99999999),
	("aS", "float", 0.0),
	("AS", "int", 99999999),
	("B", "int", None),
	("g", "int", 0),
	("D", "int", 1),
	("match", "int", 2),
	("mismatch", "int", -1),
	("gap", "int", -3),
	("gap-ext", "int", -1),
)

# the loop variable is deliberately not called 'type': at module level this
# would rebind the builtin for the remainder of the script
for (option, option_type, default) in options:
	parser.add_option("--" + option, dest = option, type = option_type, default = default)

# the command line is expected to be a sequence of (option, value) pairs
items = shlex.split(match.group(2))
if (len(items) % 2 != 0):
	error("Invalid command line options for CD-HIT-454: %s" % match.group(2))

# the options are declared above as long ('--name') options while the log
# shows them with a single dash; a dash is added to every option name, i.e.
# to the items found at even positions
for i, item in enumerate(items):
	if (i % 2 == 0):
		items[i] = '-' + item

try:
	options, dummy = parser.parse_args(items)

# optparse reports invalid input by exiting, hence SystemExit; a bare except
# would also swallow KeyboardInterrupt
except (SystemExit, Exception):
	error("Invalid command line options for CD-HIT-454: %s" % match.group(2))

# retrieving of non-null option values
cd_hit_options = {}
for key, value in vars(options).items():
	if value is not None:
		cd_hit_options[key] = value

# removal of non algorithm-specific options
for option in "ioMTB":
	cd_hit_options.pop(option, None)

# CD-HIT run date
try:
	cd_hit_run_date = datetime.datetime.strptime(match.group(3), "%a %b %d %H:%M:%S %Y")

# strptime raises ValueError when the text does not follow the format
except ValueError:
	error("Invalid run date: '%s'" % match.group(3))

print "Importing '%s' ..." % p.input_clstr_fn

print "  validating sequence identifiers ..."

entry = re.compile("([0-9]+)\s+[0-9]+nt, >(.*?)\.\.\.\s+(?:(\*)|(?:at ([0-9\:]+)+/([+\-])/([0-9\.]+)%))")

representatives = {}
n = 0
for line in readlines(p.input_clstr_fn):
	match = entry.match(line)
	if (match == None):
		error("Malformed line: \"%s\"" % line)

	cluster_id = match.group(1)
	sequence_id = get_sequence_id(match.group(2))

	candidates = list(collection.list_sequences({"name": sequence_id}))

	if (len(candidates) == 0):
		error("Unknown sequence '%s'" % sequence_id)

	if (len(candidates) > 1):
		error("Ambiguous sequence '%s'" % sequence_id)

	if (match.group(3) == '*'):
		if (cluster_id in representatives):
			error("Cluster #%s has more than one representative" % cluster_id)

		representatives[cluster_id] = sequence_id

	n += 1

if (n == 0):
	error("No cluster assignment in the input")

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

print "  importing clusters ..."

class ProgressBar:
	"""Text progress bar drawn on a single line of the standard output.

	Each update is terminated by a carriage return rather than a newline, so
	that the next update (or clear()) overwrites it.
	"""

	INDENT = 4 # number of spaces printed before the bar
	WIDTH = 80 # number of dots of a complete bar
	LABEL = 8 # width of the widest percentage label, ' 100.00%'

	def __init__ (self, upper = None):
		"""Create a progress bar for values ranging from 0 to upper.

		upper -- value that corresponds to 100%; must be a number. TypeError
		         is raised if it is omitted or None.
		"""
		if upper is None:
			raise TypeError("ProgressBar requires a numeric upper bound")

		self.__min = 0.0
		self.__max = upper + 0.0

	def display (self, value):
		"""Draw the bar for a given value.

		value -- current value; values outside of the range of the bar are
		         drawn as 0% or 100%, so that the bar never grows wider than
		         what clear() erases.
		"""
		span = self.__max - self.__min

		if span > 0:
			f = (value - self.__min) / span # fraction
			f = min(max(f, 0.0), 1.0)
		else:
			# empty range: there is nothing to wait for
			f = 1.0

		p = 100 * f # percentage
		s = int(round(self.WIDTH * f)) # bar size

		sys.stdout.write(' ' * self.INDENT + ('.' * s) + " %4.2f%%\r" % p)
		sys.stdout.flush()

	def clear (self):
		"""Erase the bar by overwriting its line with spaces."""
		sys.stdout.write(' ' * (self.INDENT + self.WIDTH + self.LABEL) + "\r")
		sys.stdout.flush()

pb = ProgressBar(n)
n = 0

for line in readlines(p.input_clstr_fn):
	match = entry.match(line)

	cluster_id = match.group(1)
	sequence_id = get_sequence_id(match.group(2))

	sequence_o = list(collection.list_sequences({"name": sequence_id}))[0]

	# first case: the sequence is a representative
	if (match.group(3) == '*'):
		if (p.dry_run):
			print "    sequence '%s' is representative of cluster #%s" % (sequence_id, cluster_id)

	# second case: the sequence belong to a cluster
	else:
		if (not cluster_id in representatives):
			error("Cluster #%s has no representative; unable to add '%s' to it" % (cluster_id, sequence_id))

		representative = list(collection.list_sequences({"name": representatives[cluster_id]}))[0]
		sequence_start, sequence_stop, representative_start, representative_stop = [int(v) for v in match.group(4).split(':')]
		strand = match.group(5)

		r = {
			"type": "similar-to",
			"run": {
				"date": {
					"year": cd_hit_run_date.year,
					"month": cd_hit_run_date.month,
					"day": cd_hit_run_date.day
				},
				"algorithm": {
					"name": "cd-hit-454",
					"version": cd_hit_version,
					"parameters": cd_hit_options
				}
			},
			"score": {
				"percent_identity": float(match.group(6))
			},
			"alignment": {
				"source_coordinates": (sequence_start, sequence_stop),
				"target_coordinates": (representative_start, representative_stop)
			}
		}

		if (p.dry_run):
			print "    sequence '%s' is a member of cluster #%s" % (sequence_id, cluster_id)
			for line in pprint.pformat(r).split('\n'):
				print "      %s" % line
		else:
			sequence_o.relate_to_sequence(representative, r)

			sequence_o.commit()

	if (p.display_progress_bar):
		pb.display(n)

	n += 1

if (p.display_progress_bar):
	pb.clear()
