# A biopython script to calculate sequence identity score
# input is an aligned multiple protein, or DNA, or RNA sequences
# with identical length including gaps
import sys
import numpy as np
from Bio import AlignIO
alignment = AlignIO.read("seq.faa", "fasta")
align_array = np.array([list(rec) for rec in alignment], np.character)
l=len(alignment)
seq2=np.arange(l)
seq=np.arange(l)
a=np.arange(l*l).reshape(l,l)
b1=a
for m in range(0,len(alignment)):
seq[m]=0
for n in range(0,alignment.get_alignment_length()):
if align_array[m,n]!="-":
seq[m]=seq[m]+1
continue
pass
pass
#for i in range(0,len(alignment)): #without parallel running
for i in range(800,1000): #parallel running, give start and finish sequence ID
for k in range(0,len(alignment)):
seq2[k]=0
b1[i,k]=0
for j in range(0,alignment.get_alignment_length()):
if align_array[k,j]!="-":
seq2[k]=seq2[k]+1
if (align_array[i,j]==align_array[k,j]) and align_array[k,j]!="-":
b1[i,k]=b1[i,k]+1
continue
continue
pass
seq1=seq[i]
if seq1 < seq2[k]:
sequ=seq2[k]
else:
sequ=seq1
p=1.0*b1[i,k]/sequ
print "%3d %3d %3d %3d %3d %3d %9.6f" %(i,k,seq1,seq2[k],sequ,b1[i,k],p)
pass
pass
No comments:
Post a Comment