Tuesday, June 16, 2009

Motif Discovery 2

To help with exploration of the Gibbs sampler, I wrote a small module that generates new motifs, mutates them, and embeds them in random sequence. The critical variables are N (the number of sequences and motifs), M ( the length of the motif), and SZ (the length of the sequence in which the motif is embedded at a random position). The frequencies at which each position varies can be adjusted. This code needs to be on the Python search path under the filename motif.py for use as described in the next post.


# generate a new motif length = SZ
# howmany = N; average match = f
import random
nt = 'ACGT'

# given a master motif s and
# a list of frequencies of conservation
# generate a variant
def mutateMotif(m,freqL):
retL = list()
for c,f in zip(m,freqL):
if random.random() > f:
retL.append(random.choice(nt))
else:
retL.append(c)
return ''.join(retL)

def getMotifList(N,M,SZ):
L = list()
# make a new random master motif s
for i in range(M):
L.append(random.choice(nt))
m = ''.join(L)

# positions are conserved at these freqs
freqL = [0.9,0.8,0.7,0.6,0.5]
freqL *= M/len(freqL) + 1
# do mutagenesis
L = list()
for i in range(N):
L.append(mutateMotif(m,freqL))
return m,L

def embedMotif(m,M,SZ):
# embed motif in random sequence of len SZ
L = [random.choice(nt) for i in range(SZ)]
i = random.choice(range(SZ-M))
L[i:i+M] = list(m)
return i,''.join(L)

def show(m,seq):
i = seq.find(m)
j = i+len(m)
print i
print seq[:i] + '\n'
print seq[i:j] + '\n'
print seq[j:] + '\n'

# we generate a list of motif, seq pairs
# each motif derived from a master motif

def getTestSet(D=None):
if not D:
D = {'N':10,'M':10,'SZ':50 }
N=D['N'] # number of seqs
M=D['M'] # length of motif
SZ=D['SZ'] # length of seqs

s,L = getMotifList(N,M,SZ)
retL = list()
for m in L:
i,seq = embedMotif(m,M,SZ)
retL.append((i,m,seq))
return retL

if __name__ == '__main__':
#random.seed(157)
L = getTestSet()
for i,m,seq in L:
#show(motif,seq)
print str(i).rjust(3),m,seq
print
i,m,seq = L[0]
show(m,seq)

Sample output:

 21 CGATGCCCCA GCATATAGTTGATTGCGAACCCGATGCCCCAGTCTCCGACGACTTGCCAA
23 CGCTGCCTTG CACGCTCGTACGATTAGGTAGGGCGCTGCCTTGACTTAAGGAGTAAGCCG
26 CGGTGCCTCT TGTTAACTAATAGGGAGCAGCGAGTTCGGTGCCTCTCCAGCGTTTCTGTA
15 CGGCGCCTCG TTGTGCTGAAACTCACGGCGCCTCGCTGGATGCGAAGTGCCTACTCGATA
37 CGGTTCCTCT TAGTTTCTAATAGCCAGCGTCCTAGTGGCATTCGGGACGGTTCCTCTACG
36 CGGACCGTCT CTAGCTAAGCACTGCATTGGAAGGACAGCGGAATCGCGGACCGTCTATGC
22 CGTTGCCTTA CGATAAGGCGACCCGGATGCACCGTTGCCTTAGCAATGCCTTGTCGCCCA
37 CGCTGCCTCG GCGTCGAATCGCAATCTATAGTTTTTAGTATCATTGACGCTGCCTCGCGT
15 CGGACTCTCC CTCCCTGGCCGGACACGGACTCTCCCTTAGTCGGCCAAACTGCTAAAGAG
24 CGGTGCCTCG TTACATTAGGTTGTAGTCATTAGACGGTGCCTCGGTACAGGATTTACCAG

21
GCATATAGTTGATTGCGAACC

CGATGCCCCA

GTCTCCGACGACTTGCCAA