#!/usr/bin/env python

'''
Replace the leading and trailing gaps of all sequences in a fasta file with N's

You have to set two things, the alignment filename and the output filename.
If you want both of these files are going to be in the same directory as the script, then you can
just specify names. If they're not, you'll need to specify file paths too, e.g.

"/Users/Rob/Desktop/alignment_file.fasta"

To run this script, first make sure you have python installed (should be done already on a mac)
then cd to the directory with this script in it, and at the command prompt in terminal type:

python replace_gaps.py

# Copyright (c) 2010, Robert Lanfear rob.lanfear ---AT--- gmail.com 
I make no guarantees that this script will do anything useful, or that it won't do anything bad.
Although it works on my machine.
Feel free to modify this and pass it on, but please do acknowledge me when doing so.
'''

import re



alignment_filename = "align.fasta" 
output_filename = "align_clean.fasta"

######################        Import fasta file as dict        #################################
alignment_file = open(alignment_filename, 'r')
#and set up the alignment as a dictionary
seqdict = {}
#get the seqs from the alignment file
start = 0 # a cheap trick to do this simply
for line in alignment_file:
	if line.startswith(">"): #it's a new sequence
		#first, stash the old sequence into the dictionary
		if start == 1:
			seqdict[seq_name] = seq
		seq_name = line
		seq = '' #an empty sequence
	else: #we must have a sequence, maybe with a newline on the end of it
		seq_part = line.strip("\n") #remove newlines as we go
		seq = ''.join([seq, seq_part])
	start=1
#add the last sequence, which you'll have missed because I'm too lazy to write this properly
seqdict[seq_name] = seq	
alignment_file.close()

###############################   Deal with sequences    #######################################
output_file = open(output_filename, 'w')
for name in seqdict:
	
	seq = seqdict[name]
	
	start_gaps = re.search("^-*", seq).group(0)
	start_Ns   = start_gaps.replace("-", "N")
	end_gaps   = re.search("-*$", seq).group(0)
	end_Ns     = end_gaps.replace("-", "N")
	
	seq = re.sub("^-*", start_Ns, seq)
	seq = re.sub("-*$", end_Ns, seq)

	seqdict[name] = seq	
	
	#now write that to the output file
	output_file.write("%s%s\n" %(name, seq))	

output_file.close()	
	
	