#!/usr/bin/env python

# Parameters for the script (edit these values before running it):

# Name of the MetagenomeDB collection that contains the contigs.
contigs_collection_name = "my collection"

# Name of the output file that receives the graph's edges, written in
# Cytoscape's Simple Interaction Format (SIF); see
# http://cytoscape.org/manual/Cytoscape2_8Manual.html#SIF%20Format
network_edges_fn = "network.sif"

# Name of the output file that receives the graph's edge annotations,
# written in Cytoscape's attribute file format; see
# http://www.cytoscape.org/manual/Cytoscape2_8Manual.html#Cytoscape%20Attribute%20File%20Format
network_edges_annotations_fn = "network.eda"

# Name of the output file that receives the graph's node annotations,
# written in the same Cytoscape attribute file format as the edge annotations.
network_nodes_annotations_fn = "network.noa"

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

# first step, we import the MetagenomeDB API, as well as some other libraries
import MetagenomeDB as mdb
import sys

# Look up the collection that holds the contigs. The script treats a None
# result from find_one() as "no such collection" and aborts with an error.
contigs_collection_o = mdb.Collection.find_one({"name": contigs_collection_name})

if contigs_collection_o is None:
	sys.stderr.write("ERROR: no collection found with name '%s'\n" % contigs_collection_name)
	sys.exit(1)

# Retrieve all the contigs of this collection. Here we only request the
# sequences having the value 'contig' for the property 'class'; this filter
# is optional if you know the collection contains nothing but contigs.
# The sequences must be listed from the collection object (not from its
# name, which is a plain string), and are materialized in a list because
# they are indexed and traversed several times later on.
contigs = list(contigs_collection_o.list_sequences({"class": "contig"}))

print("number of contigs to consider: %s" % len(contigs))

# Dictionary storing the graph's edges: the key is an ordered pair of
# contigs, the value is the Jaccard similarity of their sets of reads.
edges = {}

def order (a, b):
	"""Return the pair (a, b) with its smaller element first.

	The arguments are swapped only when a compares strictly greater than b;
	otherwise they are returned in the order they were given.
	"""
	return (b, a) if (a > b) else (a, b)

# We now iterate through all non-redundant pairs of contigs; the number
# of such pairs is given by the formula below. Floor division is used so
# that the count always remains an integer.
n = len(contigs) * (len(contigs) - 1) // 2

print("number of pairs of contigs to consider: %s" % n)

# For each contig we request, once, the reads that have been used for its
# assembly. Those are sequences that are related to the contig and have
# (for example) the value 'read' for the property 'class'. The reads are
# stored as sets so that:
# - the database is queried once per contig rather than once per pair,
# - the result can safely be reused for every pair the contig is part of,
#   even if the API returns a one-shot iterator,
# - intersections are computed quickly.
# NOTE: this keeps the reads of all contigs in memory at the same time.
read_sets = [
	set(contig.list_related_sequences(mdb.Direction.INGOING, {"class": "read"}))
	for contig in contigs
]

c = 0

# next percentage (multiple of 5) at which progress will be reported
next_report = 5

for i, contig_a in enumerate(contigs):
	reads_a = read_sets[i]

	for j in range(i + 1, len(contigs)):
		contig_b = contigs[j]
		reads_b = read_sets[j]

		# We check if the two sets of reads overlap, and calculate their
		# Jaccard similarity (as a percentage): the cardinal of their
		# intersection divided by the cardinal of their union.
		n_shared = len(reads_a & reads_b)

		# If we do have some overlap, then we store this edge. The union
		# cannot be empty when the intersection is not, so the division is
		# always safe here, even for contigs without any read.
		if (n_shared > 0):
			n_union = len(reads_a) + len(reads_b) - n_shared
			jaccard_similarity = 100.0 * n_shared / n_union
			edges[order(contig_a, contig_b)] = jaccard_similarity

		# We display some information about our progress, each time a
		# new multiple of 5% has been reached or passed. Integer
		# arithmetic is used so that no step is missed because of an
		# inexact floating-point comparison.
		c += 1
		p = 100 * c // n
		if (p >= next_report):
			print("%d%% done" % p)
			next_report = p - (p % 5) + 5

# We now save the edges and their annotations (Jaccard similarity). The
# files are opened with 'with' statements so that they are flushed and
# closed even if an error occurs while writing.
nodes = set()

with open(network_edges_fn, 'w') as edges_fh:
	with open(network_edges_annotations_fn, 'w') as edges_annotations_fh:
		# First line of the attribute file: the name of the attribute.
		# NOTE(review): the attribute is named 'NumberOfSharedReads' but the
		# value stored for each edge is the Jaccard similarity (as a
		# percentage); the name is kept as is so that the output is unchanged.
		edges_annotations_fh.write("NumberOfSharedReads\n")

		for (contig_a, contig_b) in sorted(edges.keys()):
			name_a, name_b = contig_a["name"], contig_b["name"]

			# SIF format: source node, interaction type and target node,
			# separated by tabulations
			edges_fh.write("%s\tshared_reads\t%s\n" % (name_a, name_b))

			edges_annotations_fh.write("%s (shared_reads) %s = %s\n" % (name_a, name_b, edges[(contig_a, contig_b)]))

			# we keep track of the contigs that are part of at least one edge
			nodes.add(contig_a)
			nodes.add(contig_b)

# Finally, we store the nodes annotations (contigs' length)
with open(network_nodes_annotations_fn, 'w') as nodes_annotations_fh:
	nodes_annotations_fh.write("ContigLength\n")

	for contig in sorted(nodes):
		nodes_annotations_fh.write("%s = %s\n" % (contig["name"], contig["length"]))
