#!/usr/bin/env python

# Copyright (C) 2011-2012 CRS4.
#
# This file is part of Seal.
#
# Seal is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# Seal is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License along
# with Seal.  If not, see <http://www.gnu.org/licenses/>.

#########################################################
# You need a working Hadoop cluster to use this.
#########################################################

import fileinput
import glob
import os
import re
import site
import sys

IntegrationTestDir = os.path.realpath(os.path.join(os.path.dirname(__file__), "..", ".."))
site.addsitedir(IntegrationTestDir)

from seal_integration_test import SealIntegrationTest

class TestUnpairedWarning(SealIntegrationTest):
	"""
	Integration test for ``seal prq`` run with
	``seal.prq.warning-only-if-unpaired=true``.

	The test runs the program, checks that the ``Unpaired`` counter printed
	in its output equals 1, and then verifies that every record in the
	``part-*`` output files pairs read 1 and read 2 of the same sequence.

	The attributes ``seal_dir``, ``logger``, ``test_name`` and ``output_dir``
	and the method ``run_cmd_and_output_if_failure`` are not defined here;
	presumably they are provided by SealIntegrationTest.
	"""

	def __init__(self):
		# The test directory is the directory that contains this script.
		SealIntegrationTest.__init__(self, os.path.realpath(os.path.dirname(__file__)))

	def run_program(self, hdfs_input, hdfs_output):
		"""
		Run ``seal prq`` on hdfs_input, writing to hdfs_output, and check
		the ``Unpaired`` counter reported in the command's output.

		Raises RuntimeError (carrying the test name) if the counter cannot
		be found in the output or if its value is not 1.
		"""
		cmd = ["%s/scripts/seal" % self.seal_dir, "prq",
		  "--traditional-ids",
		  "-D", "seal.prq.warning-only-if-unpaired=true",
		  "-D", "seal.prq.min-bases-per-read=0",
		  "-D", "seal.prq.drop-failed-filter=false",
		  "-D", "hbam.qseq-input.base-quality-encoding=sanger",
		  "--num-reducers", "1",
		  hdfs_input, hdfs_output]
		self.logger.debug("Command: %s", " ".join(cmd))

		stdout = self.run_cmd_and_output_if_failure(cmd)

		# A missing counter must be reported as a test failure rather than
		# crash with an AttributeError on a None match object.
		counter = re.search(r"Unpaired=(\d+)$", stdout, re.MULTILINE)
		if counter is None:
			self.logger.error("Unpaired counter not found in the program output")
			# RuntimeError is a subclass of StandardError in Python 2, so
			# existing handlers still catch it; it also exists in Python 3.
			raise RuntimeError(self.test_name)

		unpaired = int(counter.group(1))
		if unpaired != 1:
			self.logger.error("Unpaired reads not counted properly! (expected 1 but got %d)", unpaired)
			raise RuntimeError(self.test_name)

	def process_output(self):
		"""
		Validate the records in the ``part-*`` files under self.output_dir.

		Returns nothing; failure is signalled by raising SealTestException.

		Expected output (tab-separated), e.g.:
		CRESSIA_129:1:1:1003:1171#0	READ_1_SEQUENCE_1	READ_1_QUALITY_1	READ_2_SEQUENCE_1	READ_2_QUALITY_1
		CRESSIA_129:1:1:1010:1209#0	READ_1_SEQUENCE_3	READ_1_QUALITY_3	READ_2_SEQUENCE_3	READ_2_QUALITY_3

		Each line must mention reads 1, 1, 2, 2 (in that order) and exactly
		four SEQUENCE/QUALITY ids, all of them equal.

		NOTE(review): SealTestException is not imported in the visible part
		of this file -- confirm it is available at runtime.
		"""
		read_pattern = re.compile(r"READ_(\d+)")
		expected_reads = ['1', '1', '2', '2']
		sequence_pattern = re.compile(r"(?:SEQUENCE|QUALITY)_(\d+)")

		output_files = sorted(glob.glob(os.path.join(self.output_dir, "part-*")))

		# With an empty file list fileinput would fall back to sys.argv and
		# then stdin, so fail explicitly instead.
		if not output_files:
			raise SealTestException("no output files found in %s" % self.output_dir)

		# Check the files that are actually read below (not sys.argv).
		if any(os.path.getsize(path) == 0 for path in output_files):
			raise SealTestException("we have at least one empty output file")

		try:
			for line in fileinput.input(output_files):
				# ensure we have the two different reads
				reads = read_pattern.findall(line)
				if reads != expected_reads:
					raise SealTestException("got unexpected reads pattern '%s' (expected %s)" % (reads, expected_reads))

				# ensure the line only references one sequence
				matches = sequence_pattern.findall(line)

				if len(matches) != 4:
					raise SealTestException("expected 4 sequence ids but found %d" % len(matches))

				if len(set(matches)) != 1:
					raise SealTestException("output record contains mismatched sequence fragments: %s" % (matches,))
		finally:
			# Release the module-level fileinput state even when a check
			# fails part-way through the files.
			fileinput.close()

if __name__ == '__main__':
	# Run the test and turn its result into the process exit status:
	# 0 when test_method() returns a true value, 1 otherwise.
	passed = TestUnpairedWarning().test_method()
	if passed:
		exit_status = 0
	else:
		exit_status = 1
	sys.exit(exit_status)

