Saturday, November 16, 2013

A biopython script to count the sequence identity from aligned sequences input

# A biopython script to calculate sequence identity score
# input is an aligned multiple protein, or DNA, or RNA sequences 
# with identical length including gaps
import sys
import numpy as np
from Bio import AlignIO
alignment = AlignIO.read("seq.faa", "fasta")
align_array = np.array([list(rec) for rec in alignment], np.character)
l=len(alignment)
seq2=np.arange(l)
seq=np.arange(l)
a=np.arange(l*l).reshape(l,l)
b1=a
for m in range(0,len(alignment)):
    seq[m]=0
    for n in range(0,alignment.get_alignment_length()):
        if align_array[m,n]!="-":
           seq[m]=seq[m]+1
        continue
    pass
pass  
#for i in range(0,len(alignment)):   #without parallel running
for i in range(800,1000): #parallel running, give start and finish sequence ID
    for k in range(0,len(alignment)):
        seq2[k]=0
        b1[i,k]=0
        for j in range(0,alignment.get_alignment_length()):
           if align_array[k,j]!="-":
             seq2[k]=seq2[k]+1
             if (align_array[i,j]==align_array[k,j]) and align_array[k,j]!="-":
                  b1[i,k]=b1[i,k]+1
             continue
           continue
        pass
        seq1=seq[i]
        if seq1 < seq2[k]:
           sequ=seq2[k]
        else:
           sequ=seq1
        p=1.0*b1[i,k]/sequ
        print "%3d %3d %3d %3d %3d %3d %9.6f" %(i,k,seq1,seq2[k],sequ,b1[i,k],p)
    pass
pass