Monday, February 8, 2010

Unifrac (3): Simulating sequences



As part of my exploration of Unifrac (first and second posts), I'm going to need some sequences.

I posted about this before, then pulled that post. In fact, I pushed out this very post yesterday, and am now editing heavily. I'm finding a conflict between what's needed to make a good example for showing how the calculations work, and what's needed for a good example in terms of the final result. I've decided to keep this here, but be aware that we may not use these sequences going forward.

This script fetches a bunch of prototype sequences from Genbank. Then it distributes them to three "samples" according to a hard-coded distribution. In the process, the sequences are mutagenized a bit. Finally, the sequences are aligned, rooted (by hand) and then, as shown above, a plot is made in R of the phylogenetic tree.

[UPDATE: I modified the code slightly to give a bit more heterogeneity to the sequences. The code is updated, and here is a plot of the new phylogenetic tree.

Also, I fixed an awkward point about the old code: I have made sure that the tree we use is a real, rooted tree by incorporating Thermotoga as an outgroup.]



Here's the code (R code at the end):


import random, os, sys
random.seed(137)
from cogent import LoadSeqs, DNA, LoadTree
from cogent.db.ncbi import EFetch
from cogent.app.muscle import align_unaligned_seqs
from cogent.app.fasttree import build_tree_from_alignment

def fetch_ncbi_data(ofile,s):
# get the seqs from Genbank
input = [e.split() for e in s.strip().split('\n')]
id_list = [t[0] for t in input]
names = [t[1] for t in input]
ef = EFetch(id=','.join(id_list), rettype='fasta')
data = ef.read().strip()

# title lines are too long, replace by genus_species
rL = list()
for i,e in enumerate(data.split('\n\n')):
old_title, seq = e.strip().split('\n',1)
new_title = '>' + names[i]
seq = seq[:500]
rL.append('\n'.join([new_title,seq]))
FH = open(ofile,'w')
FH.write('\n\n'.join(rL))
FH.close()

def mutagenize(seq, mrate=1):
L = list(seq)
D = { 'A':'CGT', 'C':'AGT', 'G':'ACT', 'T':'ACG' }
N = int(mrate / 100.0 * len(seq))
X = len(seq)
for i in range(N):
j = random.choice(range(X))
nt = L[j]
if not nt in 'ACGT': continue
L[j] = random.choice(D[nt])
return ''.join(L)

def distribute_seqs(ifile,ofile):
# set up our samples
FH = open(ifile,'r')
data = FH.read().strip().split('\n\n')
FH.close()
seqs = list()
for e in data:
title,seq = e.split('\n',1)
seqs.append(''.join(seq.split()))

outgroup = '>Thermotoga\n' + seqs.pop()

A = {0:5,1:5,2:0,3:1,4:0,5:1,6:1,7:1} # A has lots of Firmicutes
B = {0:0,1:1,2:5,3:5,4:1,5:0,6:1,7:1} # B has Bacteroidetes
C = {0:1,1:0,2:1,3:0,4:5,5:5,6:1,7:1} # C has enterics
dL = [A,B,C]
L = list()

for distr, sample in zip(dL,list('ABC')):
counter = 1
for k in distr:
seq = seqs[k]
n = distr[k]
for i in range(n):
if n == 1: mrate = 5
else: mrate = random.choice((1,2,3))
copy = mutagenize(seq[:],mrate)
name = sample + str(counter)
L.append(DNA.makeSequence(copy,name))
counter += 1
FH = open(ofile,'w')
L = [seq.toFasta() for seq in L]
L.insert(0,outgroup)
FH.write('\n\n'.join(L))
FH.close()

def align_seqs(ifile,ofile):
seqs = LoadSeqs(ifile, moltype=DNA, aligned=False)
aln = align_unaligned_seqs(seqs, DNA)
aln.writeToFile(ofile)
return aln

def get_tree(ifile):
aln = LoadSeqs(ifile, moltype=DNA, aligned=True)
tr = build_tree_from_alignment(aln,moltype=DNA)
return tr

#===============================================
s = '''
AY005045.1 Streptococcus_mitis_bv2
D83363.1 Staphylococcus_epidermidis_14990
L14639.1 Capnocytophaga_gingivalis
AB053940.1 Tannerella_forsythensis_HA3
EU009197.1 Shigella_sonnei_FBD023
AB435616.1 Serratia_marcescens_JCM24201
AB302401.1 Pseudomonas_cinnamophila
AF411020.1 Achromobacter_xylosoxidans_AU1011
AJ401017.1 Thermotoga_maritima_SL7
'''

fn1 = 'rRNA_gb.fasta'
fn2 = 'samples.fasta'
fn3 = 'samples.aln.fasta'
fn4 = 'samples.tree'

if not os.path.exists(fn1) or False:
fetch_ncbi_data(fn1,s)
if not os.path.exists(fn2) or False:
distribute_seqs(fn1,fn2)

if not os.path.exists(fn3) or True:
aln = align_seqs(fn2,fn3)
tr = get_tree(fn3)
# re-root manually
print tr.asciiArt()
n = tr.getNodeMatchingName('Thermotoga')
for a in n.ancestors(): print a.Name

tr2 = tr.rootedAt(n.ancestors()[0].Name)
tree_str = tr2.getNewick(with_distances=True)
FH = open(fn4,'w')
FH.write(tree_str + '\n')
FH.close()

tr = LoadTree(fn4)
print tr.asciiArt()

'''
R code:
library(ape)
setwd('Desktop')
tr = read.tree('temp/samples.tree')
colors = rep('black',length(tr$tip.label))
o = grep('A',tr$tip.label)
colors[o] = 'red'
o = grep('B',tr$tip.label)
colors[o] = 'blue'
o = grep('C',tr$tip.label)
colors[o] = 'darkgreen'
plot(tr,tip.color=colors)
axisPhylo()
'''