#!/usr/bin/env python

# Description: Detects which positions of a VCF file are in a FASTA file. It creates another VCF file by adding extra information showing whether the base in that position coincides with the nucleotide at the FASTA or not.
#
# Usage: vcfPosfasta.py [-h] -v VCF -f FASTA [-c COUNT] -o OUTPUT
#	 vcfPosfasta.py [-h] [-v VCF] -f FASTA [-c COUNT] | STDOUT
#	 STDIN | vcfPosfasta.py [-h] -f FASTA [-c COUNT] [-o OUTPUT]
#	 STDIN | vcfPosfasta.py [-h] -f FASTA [-c COUNT] | STDOUT
#
# Required software: Samtools
#
# Developed by J. Leno-Colorado

import sys, os, getopt, re, argparse, gzip
from argparse import RawTextHelpFormatter
import time
from time import clock

start = time.time()

def main(argv):

	#####################
	## Command options ##
	#####################

	text = "Description:\nDetects which positions of a VCF file are in a FASTA file (say an outgroup).\nIt adds an extra column ANC to INFO field in VCF file: \n\tANC=0\tThe nucleotide in the FASTA file matches with the reference allele of this position in the VCF file\n\tANC=1\tThe nucleotide in the FASTA file matches with the alternative allele of this position in the VCF file\n\tANC=N\tThe nucleotide in the FASTA file does not match with any of alleles of this position in the VCF file\nRequired software: Samtools\nDeveloped by J. Leno-Colorado"
	msg_usage = "\tvcfPosfasta.py [-h] -v VCF -f FASTA [-c COUNT] -o OUTPUT\n\tSTDIN | vcfPosfasta.py [-h] -f FASTA [-c COUNT] | STDOUT"
	parser = argparse.ArgumentParser(description=text, usage=msg_usage, formatter_class=RawTextHelpFormatter)
	parser.add_argument("-v", "--vcf", help="Input VCF file (compressed or uncompressed). Can be STDIN.")
	parser.add_argument("-f", "--fasta", help="Fasta file with wich to compare (bgzip compressed or uncompressed).", required=True)
	parser.add_argument("-c", "--count", help="Counts the number of positions that coincides with the reference or the alternative allele and saves it in an output file.")
	parser.add_argument("-o", "--output", help="Output VCF file; STDOUT if undefined")
	args = parser.parse_args()

	# If command -v is present in the command line, the input is obtained from a VCF file (compressed (gzip) or uncompressed), else it is obtained from the STDIN:
	if args.vcf:
		inputfile = args.vcf
		if re.search(".gz", inputfile):
			vcf = gzip.open(inputfile, 'r')
		else:
			vcf = open(inputfile, 'r')
	else:
		if not sys.stdin.isatty():
			vcf = sys.stdin
		else:
			print "\nError! Input is required (STDIN or VCF file)\n\nUsage:\tvcfPosfasta.py [-h] [-v VCF] -f FASTA [-c COUNT] [-o OUTPUT]\n\tvcfPosfasta.py [-h] [-v VCF] -f FASTA [-c COUNT] | STDOUT\n\tSTDIN | vcfPosfasta.py [-h] -f FASTA [-c COUNT] [-o OUTPUT]\n\tSTDIN | vcfPosfasta.py [-h] -f FASTA [-c COUNT] | STDOUT\n"
			sys.exit(2)

	if args.fasta:
		fastafile = args.fasta
	else:
		sys.exit(2)

	####################################################################
	## Comparison SNPs of VCF file with the nucleotides of FASTA file ##
	####################################################################

	count_ref=0
	count_alt=0
	count_not=0

	# If command -o is present in the command line, the output is saved in an external file:
	if args.output:
		outputfile = args.output	
		result = open(outputfile, 'w')
		for x in vcf:
			if re.search('#', x):
				result.write(x + "\n")
			else:
				chrom = x.split("\t")[0]
				pos = x.split("\t")[1]
				ref = x.split("\t")[3]
				alt = x.split("\t")[4]
				command = "samtools faidx " + fastafile + " " + chrom + ":" + pos + "-" + pos + " | grep -v '>'"
				nt = os.popen(command)
				nucl = nt.read()
				nucl = nucl.rstrip()
				alt = alt.rstrip()
				if nucl == ref:
					count_ref = count_ref + 1
					result.write(chrom + "\t" + pos + "\t" + x.split("\t")[2] + "\t" + ref + "\t" + alt + "\t" + x.split("\t")[5] + "\t" + x.split("\t")[6] + "\t" + x.split("\t")[7] + ";ANC=0\t" + x.split("\t")[8] + "\t" + x.split("\t")[9] + "\n")
				elif nucl == alt:
					count_alt = count_alt + 1
					result.write(chrom + "\t" + pos + "\t" + x.split("\t")[2] + "\t" + ref + "\t" + alt + "\t" + x.split("\t")[5] + "\t" + x.split("\t")[6] + "\t" + x.split("\t")[7] + ";ANC=1\t" + x.split("\t")[8] + "\t" + x.split("\t")[9] + "\n")
				else:
					count_not = count_not + 1
					result.write(chrom + "\t" + pos + "\t" + x.split("\t")[2] + "\t" + ref + "\t" + alt + "\t" + x.split("\t")[5] + "\t" + x.split("\t")[6] + "\t" + x.split("\t")[7] + ";ANC=N\t" + x.split("\t")[8] + "\t" + x.split("\t")[9] + "\n")
		if args.count:
			countfile = args.count
			countf = open(countfile, 'w')
			countf.write("Reference alleles: " + str(count_ref) + "\n")
			countf.write("Alternative alleles: " + str(count_alt) + "\n")
			countf.write("Different nucleotide: " + str(count_not) + "\n")
	
		result.close()
		vcf.close()

	# If command -o is not present in the command line, the output is displayed:
	else:
		for x in vcf:
			if re.search('#', x):
				x = x.rstrip()
				if re.search("#CHROM", x):
					print '##INFO=<ID=ANC,Type=Integer,Description="Coincidence with the FASTA ' + "'" + fastafile + "'" + '">'
				print x
			else:
				chrom = x.split("\t")[0]
				pos = x.split("\t")[1]
				ref = x.split("\t")[3]
				alt = x.split("\t")[4]
				geno = "\t".join(x.split("\t")[9:-1])
				command = "samtools faidx " + fastafile + " " + chrom + ":" + pos + "-" + pos + " | grep -v '>'"
				nt = os.popen(command)
				nucl = nt.read()
				nucl = nucl.rstrip()
				alt = alt.rstrip()
				if nucl == ref:
					count_ref = count_ref + 1
					if x.split("\t")[7] == ".":
						print chrom + "\t" + pos + "\t" + x.split("\t")[2] + "\t" + ref + "\t" + alt + "\t" + x.split("\t")[5] + "\t" + x.split("\t")[6] + "\t" + "ANC=0\t" + x.split("\t")[8] + "\t" + geno
					else:
						print chrom + "\t" + pos + "\t" + x.split("\t")[2] + "\t" + ref + "\t" + alt + "\t" + x.split("\t")[5] + "\t" + x.split("\t")[6] + "\t" + x.split("\t")[7] + ";ANC=0\t" + x.split("\t")[8] + "\t" + geno
				elif nucl == alt:
					count_alt = count_alt + 1
					if x.split("\t")[7] == ".":
						print chrom + "\t" + pos + "\t" + x.split("\t")[2] + "\t" + ref + "\t" + alt + "\t" + x.split("\t")[5] + "\t" + x.split("\t")[6] + "\t" + "ANC=1\t" + x.split("\t")[8] + "\t" + geno
					else:
						print chrom + "\t" + pos + "\t" + x.split("\t")[2] + "\t" + ref + "\t" + alt + "\t" + x.split("\t")[5] + "\t" + x.split("\t")[6] + "\t" + x.split("\t")[7] + ";ANC=1\t" + x.split("\t")[8] + "\t" + geno
				else:
					count_not = count_not + 1
					if x.split("\t")[7] == ".":
						print chrom + "\t" + pos + "\t" + x.split("\t")[2] + "\t" + ref + "\t" + alt + "\t" + x.split("\t")[5] + "\t" + x.split("\t")[6] + "\t" + "ANC=N\t" + x.split("\t")[8] + "\t" + geno
					else:
						print chrom + "\t" + pos + "\t" + x.split("\t")[2] + "\t" + ref + "\t" + alt + "\t" + x.split("\t")[5] + "\t" + x.split("\t")[6] + "\t" + x.split("\t")[7] + ";ANC=N\t" + x.split("\t")[8] + "\t" + geno

		# If command -c is present in the command line, an extra output is generated with the count of alleles (alternative or reference) that coincides with the FASTA file:
		if args.count:
			countfile = args.count
			countf = open(countfile, 'w')
			count_total = count_ref + count_alt + count_not
			perc_ref = float("{0:.2f}".format((float(count_ref) * 100)/float(count_total)))
			perc_alt = float("{0:.2f}".format((float(count_alt) * 100)/float(count_total)))
			perc_not = float("{0:.2f}".format((float(count_not) * 100)/float(count_total)))
			countf.write("Reference alleles: " + str(count_ref) + " (" + str(perc_ref) + "%)" + "\n")
			countf.write("Alternative alleles: " + str(count_alt) + " (" + str(perc_alt) + "%)"  + "\n")
			countf.write("Different nucleotide: " + str(count_not) + " (" + str(perc_not) + "%)"  + "\n")

if __name__ == "__main__":
	main(sys.argv[1:])