#!/usr/bin/env python # Description: Detects which positions of a VCF file are in a FASTA file. It creates another VCF file by adding extra information showing whether the base in that position coincides with the nucleotide at the FASTA or not. # # Usage: vcfPosfasta.py [-h] -v VCF -f FASTA [-c COUNT] -o OUTPUT # vcfPosfasta.py [-h] [-v VCF] -f FASTA [-c COUNT] | STDOUT # STDIN | vcfPosfasta.py [-h] -f FASTA [-c COUNT] [-o OUTPUT] # STDIN | vcfPosfasta.py [-h] -f FASTA [-c COUNT] | STDOUT # # Required software: Samtools # # Developed by J. Leno-Colorado import sys, os, getopt, re, argparse, gzip from argparse import RawTextHelpFormatter import time from time import clock start = time.time() def main(argv): ##################### ## Command options ## ##################### text = "Description:\nDetects which positions of a VCF file are in a FASTA file (say an outgroup).\nIt adds an extra column ANC to INFO field in VCF file: \n\tANC=0\tThe nucleotide in the FASTA file matches with the reference allele of this position in the VCF file\n\tANC=1\tThe nucleotide in the FASTA file matches with the alternative allele of this position in the VCF file\n\tANC=N\tThe nucleotide in the FASTA file does not match with any of alleles of this position in the VCF file\nRequired software: Samtools\nDeveloped by J. Leno-Colorado" msg_usage = "\tvcfPosfasta.py [-h] -v VCF -f FASTA [-c COUNT] -o OUTPUT\n\tSTDIN | vcfPosfasta.py [-h] -f FASTA [-c COUNT] | STDOUT" parser = argparse.ArgumentParser(description=text, usage=msg_usage, formatter_class=RawTextHelpFormatter) parser.add_argument("-v", "--vcf", help="Input VCF file (compressed or uncompressed). Can be STDIN.") parser.add_argument("-f", "--fasta", help="Fasta file with wich to compare (bgzip compressed or uncompressed).", required=True) parser.add_argument("-c", "--count", help="Counts the number of positions that coincides with the reference or the alternative allele and saves it in an output file.") parser.add_argument("-o", "--output", help="Output VCF file; STDOUT if undefined") args = parser.parse_args() # If command -v is present in the command line, the input is obtained from a VCF file (compressed (gzip) or uncompressed), else it is obtained from the STDIN: if args.vcf: inputfile = args.vcf if re.search(".gz", inputfile): vcf = gzip.open(inputfile, 'r') else: vcf = open(inputfile, 'r') else: if not sys.stdin.isatty(): vcf = sys.stdin else: print "\nError! Input is required (STDIN or VCF file)\n\nUsage:\tvcfPosfasta.py [-h] [-v VCF] -f FASTA [-c COUNT] [-o OUTPUT]\n\tvcfPosfasta.py [-h] [-v VCF] -f FASTA [-c COUNT] | STDOUT\n\tSTDIN | vcfPosfasta.py [-h] -f FASTA [-c COUNT] [-o OUTPUT]\n\tSTDIN | vcfPosfasta.py [-h] -f FASTA [-c COUNT] | STDOUT\n" sys.exit(2) if args.fasta: fastafile = args.fasta else: sys.exit(2) #################################################################### ## Comparison SNPs of VCF file with the nucleotides of FASTA file ## #################################################################### count_ref=0 count_alt=0 count_not=0 # If command -o is present in the command line, the output is saved in an external file: if args.output: outputfile = args.output result = open(outputfile, 'w') for x in vcf: if re.search('#', x): result.write(x + "\n") else: chrom = x.split("\t")[0] pos = x.split("\t")[1] ref = x.split("\t")[3] alt = x.split("\t")[4] command = "samtools faidx " + fastafile + " " + chrom + ":" + pos + "-" + pos + " | grep -v '>'" nt = os.popen(command) nucl = nt.read() nucl = nucl.rstrip() alt = alt.rstrip() if nucl == ref: count_ref = count_ref + 1 result.write(chrom + "\t" + pos + "\t" + x.split("\t")[2] + "\t" + ref + "\t" + alt + "\t" + x.split("\t")[5] + "\t" + x.split("\t")[6] + "\t" + x.split("\t")[7] + ";ANC=0\t" + x.split("\t")[8] + "\t" + x.split("\t")[9] + "\n") elif nucl == alt: count_alt = count_alt + 1 result.write(chrom + "\t" + pos + "\t" + x.split("\t")[2] + "\t" + ref + "\t" + alt + "\t" + x.split("\t")[5] + "\t" + x.split("\t")[6] + "\t" + x.split("\t")[7] + ";ANC=1\t" + x.split("\t")[8] + "\t" + x.split("\t")[9] + "\n") else: count_not = count_not + 1 result.write(chrom + "\t" + pos + "\t" + x.split("\t")[2] + "\t" + ref + "\t" + alt + "\t" + x.split("\t")[5] + "\t" + x.split("\t")[6] + "\t" + x.split("\t")[7] + ";ANC=N\t" + x.split("\t")[8] + "\t" + x.split("\t")[9] + "\n") if args.count: countfile = args.count countf = open(countfile, 'w') countf.write("Reference alleles: " + str(count_ref) + "\n") countf.write("Alternative alleles: " + str(count_alt) + "\n") countf.write("Different nucleotide: " + str(count_not) + "\n") result.close() vcf.close() # If command -o is not present in the command line, the output is displayed: else: for x in vcf: if re.search('#', x): x = x.rstrip() if re.search("#CHROM", x): print '##INFO=' print x else: chrom = x.split("\t")[0] pos = x.split("\t")[1] ref = x.split("\t")[3] alt = x.split("\t")[4] geno = "\t".join(x.split("\t")[9:-1]) command = "samtools faidx " + fastafile + " " + chrom + ":" + pos + "-" + pos + " | grep -v '>'" nt = os.popen(command) nucl = nt.read() nucl = nucl.rstrip() alt = alt.rstrip() if nucl == ref: count_ref = count_ref + 1 if x.split("\t")[7] == ".": print chrom + "\t" + pos + "\t" + x.split("\t")[2] + "\t" + ref + "\t" + alt + "\t" + x.split("\t")[5] + "\t" + x.split("\t")[6] + "\t" + "ANC=0\t" + x.split("\t")[8] + "\t" + geno else: print chrom + "\t" + pos + "\t" + x.split("\t")[2] + "\t" + ref + "\t" + alt + "\t" + x.split("\t")[5] + "\t" + x.split("\t")[6] + "\t" + x.split("\t")[7] + ";ANC=0\t" + x.split("\t")[8] + "\t" + geno elif nucl == alt: count_alt = count_alt + 1 if x.split("\t")[7] == ".": print chrom + "\t" + pos + "\t" + x.split("\t")[2] + "\t" + ref + "\t" + alt + "\t" + x.split("\t")[5] + "\t" + x.split("\t")[6] + "\t" + "ANC=1\t" + x.split("\t")[8] + "\t" + geno else: print chrom + "\t" + pos + "\t" + x.split("\t")[2] + "\t" + ref + "\t" + alt + "\t" + x.split("\t")[5] + "\t" + x.split("\t")[6] + "\t" + x.split("\t")[7] + ";ANC=1\t" + x.split("\t")[8] + "\t" + geno else: count_not = count_not + 1 if x.split("\t")[7] == ".": print chrom + "\t" + pos + "\t" + x.split("\t")[2] + "\t" + ref + "\t" + alt + "\t" + x.split("\t")[5] + "\t" + x.split("\t")[6] + "\t" + "ANC=N\t" + x.split("\t")[8] + "\t" + geno else: print chrom + "\t" + pos + "\t" + x.split("\t")[2] + "\t" + ref + "\t" + alt + "\t" + x.split("\t")[5] + "\t" + x.split("\t")[6] + "\t" + x.split("\t")[7] + ";ANC=N\t" + x.split("\t")[8] + "\t" + geno # If command -c is present in the command line, an extra output is generated with the count of alleles (alternative or reference) that coincides with the FASTA file: if args.count: countfile = args.count countf = open(countfile, 'w') count_total = count_ref + count_alt + count_not perc_ref = float("{0:.2f}".format((float(count_ref) * 100)/float(count_total))) perc_alt = float("{0:.2f}".format((float(count_alt) * 100)/float(count_total))) perc_not = float("{0:.2f}".format((float(count_not) * 100)/float(count_total))) countf.write("Reference alleles: " + str(count_ref) + " (" + str(perc_ref) + "%)" + "\n") countf.write("Alternative alleles: " + str(count_alt) + " (" + str(perc_alt) + "%)" + "\n") countf.write("Different nucleotide: " + str(count_not) + " (" + str(perc_not) + "%)" + "\n") if __name__ == "__main__": main(sys.argv[1:])