Wednesday, February 3, 2010

Unifrac analysis: simulating sequences

I want to show an example of phylogenetic analysis using Unifrac (website (old), PMID 16893466). Before we do that, we need to get some sequences to work with. I am going to do that here, and I'll talk about Unifrac next time.

We're going to simulate sequences from two different populations. The simulation is based on 4 authentic rRNA sequences that we get from Genbank:


AY005045.1     Streptococcus_mitis_bv2
AB302401.1 Pseudomonas_cinnamophila
L14639.1 Capnocytophaga_gingivalis
AF411020.1 Achromobacter_xylosoxidans_AU1011


We will simulate the two samples by picking sequences with some distribution. To start with, I'm going to try this one:


    A_distr = {0:1,1:3,2:5,3:1}
B_distr = {0:0,1:3,2:1,3:5}


So, sample A will have 1 sequence from S. mitis, 3 from P. cinnamophila, etc. As I work through the Unifrac analysis, I might go back and change this to make the results more interesting, but this is a reasonable start.

Also, we'll mutagenize each sequence. The script to set this up is at the end of the post. Note that you must have PyCogent installed to follow along at home. You will also need muscle and FastTree installed somewhere accessible on your $PATH.

Here is a tree based on the sequences as printed out at the Terminal by the script.


          /-A3
|
| /-A4
| /0.042---|
| | \-B2
|-0.744---|
| | /-B1
---------| \0.623---|
| \-B3
|
| /-A2
| |
| | /-A1
| | |
| | | /-A7
\0.578---| /0.966---| /0.644---|
| | | | \-A6
| | | |
| | \1.000---| /-A9
| | | /0.389---|
| | | | \-B4
\0.996---| \0.225---|
| | /-A5
| \0.822---|
| \-A8
|
| /-A10
| |
\1.000---| /-B8
| |
\0.591---| /-B5
| /0.950---|
| | \-B6
\0.618---|
| /-B7
\0.742---|
\-B9



import random, os, sys
random.seed(137)
from cogent import LoadSeqs, DNA
from cogent.db.ncbi import EFetch
from cogent.app.muscle import align_unaligned_seqs
from cogent.app.fasttree import build_tree_from_alignment

def fetch_ncbi_data(ofile,s):
# get the seqs from Genbank
input = [e.split() for e in s.strip().split('\n')]
id_list = [t[0] for t in input]
names = [t[1] for t in input]
ef = EFetch(id=','.join(id_list), rettype='fasta')
data = ef.read().strip()

# title lines are too long, replace by genus_species
rL = list()
for i,e in enumerate(data.split('\n\n')):
old_title, seq = e.strip().split('\n',1)
new_title = '>' + names[i]
seq = seq[:500]
rL.append('\n'.join([new_title,seq]))
FH = open(ofile,'w')
FH.write('\n\n'.join(rL))
FH.close()

def mutagenize(seq, percent=5):
L = list(seq)
D = { 'A':'CGT', 'C':'AGT', 'G':'ACT', 'T':'ACG' }
N = int(percent / 100.0 * len(seq))
X = len(seq)
for i in range(N):
j = random.choice(range(X))
nt = L[j]
if not nt in 'ACGT': continue
L[j] = random.choice(D[nt])
return ''.join(L)

def distribute_seqs(ifile,ofile,mut_freq=10):
# set up our two samples
FH = open(ifile,'r')
data = FH.read().strip().split('\n\n')
FH.close()

A_distr = {0:1,1:3,2:5,3:1}
B_distr = {0:0,1:3,2:1,3:5}
A = list()
B = list()
x = y = 0
for i,e in enumerate(data):
title,seq = e.split('\n',1)
seq = ''.join(seq.split())
for j in range(A_distr[i]):
x += 1
copy = mutagenize(seq[:],mut_freq)
new_seq = DNA.makeSequence(copy,'A' + str(x))
A.append(new_seq)
for j in range(B_distr[i]):
y += 1
copy = mutagenize(seq[:],mut_freq)
new_seq = DNA.makeSequence(copy,'B' + str(y))
B.append(new_seq)

FH = open(ofile,'w')
L = [seq.toFasta() for seq in A + B]
FH.write('\n\n'.join(L))
FH.close()

def align_seqs(ifile,ofile):
seqs = LoadSeqs(ifile, moltype=DNA, aligned=False)
aln = align_unaligned_seqs(seqs, DNA)
aln.writeToFile(ofile)
return aln

def get_tree(ifile):
aln = LoadSeqs(ifile, moltype=DNA, aligned=True)
tr = build_tree_from_alignment(aln,moltype=DNA)
return tr

#===============================================
s = '''
AY005045.1 Streptococcus_mitis_bv2
AB302401.1 Pseudomonas_cinnamophila
L14639.1 Capnocytophaga_gingivalis
AF411020.1 Achromobacter_xylosoxidans_AU1011
'''

fn1 = 'rRNA_gb.fasta'
fn2 = 'samples.fasta'
fn3 = 'samples.aln.fasta'
fn4 = 'samples.tree'

if not os.path.exists(fn1): fetch_ncbi_data(fn1,s)
if not os.path.exists(fn2): distribute_seqs(fn1,fn2)
if not os.path.exists(fn3): aln = align_seqs(fn2,fn3)
tr = get_tree(fn3)
print tr.asciiArt()

tree_str = tr.getNewick(with_distances=True)
FH = open(fn4,'w')
FH.write(tree_str + '\n')
FH.close()