# analyses figure for lactobacillus no translational selection category 1) plotted for highly expressed genes and tRNA 2) Oeoen Lmellis Lmellifer plotted separately
# pdf("six_codon_no_selection.pdf",width=16,height=8)
# par(mfrow=c(4,8))
#
#
# for (i in 1:3){
#   codon_aa_table <- read.table(six_codon_aa[i],header=FALSE)
#
#   lacto_table <- codon_aa_table[c(18,20,21,23:27,29,30:34,35,46,48,49,50,53:64),]
#   lacto_mean <- sapply(lacto_table,mean,na.rm=TRUE)
#
#   oeoen_table <- codon_aa_table[c(22),]
#   hon2_table <- codon_aa_table[c(51),]
#   bin4_table <- codon_aa_table[c(52),]
#
#   lacto_high <- c(lacto_mean[[2]],lacto_mean[[3]],lacto_mean[[6]],lacto_mean[[4]],lacto_mean[[5]],lacto_mean[[7]])
#   barplot(lacto_high,col=color,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=y_lim,las=2,ylab=y_lab[i],cex.lab=font_label)
#   if(i==1)title(main=top_title[1],line=3)
#   if(i==1)title(main=main_title[1],line=2)
#   lacto_tRNA <- c(lacto_mean[[14]],lacto_mean[[15]],lacto_mean[[18]],lacto_mean[[16]],lacto_mean[[17]],lacto_mean[[19]])
#   barplot(lacto_tRNA,col=color_tRNA,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,y_max_tRNA),las=2)
#   if(i==1)title(main=top_title[2],line=3)
#   if(i==1)title(main=main_title[2],line=2)
#
#   oeoen_high <- c(oeoen_table[1,2],oeoen_table[1,3],oeoen_table[1,6],oeoen_table[1,4],oeoen_table[1,5],oeoen_table[1,7])
#   barplot(oeoen_high,col=color,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=y_lim,las=2)
#   if(i==1)title(main=top_title[3],line=3)
#   if(i==1)title(main=main_title[3],line=2)
#   oeoen_tRNA <- c(oeoen_table[1,14],oeoen_table[1,15],oeoen_table[1,18],oeoen_table[1,16],oeoen_table[1,17],oeoen_table[1,19])
#   barplot(oeoen_tRNA,col=color_tRNA,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,y_max_tRNA),las=2)
#   if(i==1)title(main=top_title[4],line=3)
#   if(i==1)title(main=main_title[4],line=2)
#
#   hon2_high <- c(hon2_table[1,2],hon2_table[1,3],hon2_table[1,6],hon2_table[1,4],hon2_table[1,5],hon2_table[1,7])
#   barplot(hon2_high,col=color,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=y_lim,las=2)
#   if(i==1)title(main=top_title[5],line=3)
#   if(i==1)title(main=main_title[5],line=2)
#   hon2_tRNA <- c(hon2_table[1,14],hon2_table[1,15],hon2_table[1,18],hon2_table[1,16],hon2_table[1,17],hon2_table[1,19])
#   barplot(hon2_tRNA,col=color_tRNA,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,y_max_tRNA),las=2)
#   if(i==1)title(main=top_title[6],line=3)
#   if(i==1)title(main=main_title[6],line=2)
#
#   bin4_high <- c(bin4_table[1,2],bin4_table[1,3],bin4_table[1,6],bin4_table[1,4],bin4_table[1,5],bin4_table[1,7])
#   barplot(bin4_high,col=color,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=y_lim,las=2)
#   if(i==1)title(main=top_title[7],line=3)
#   if(i==1)title(main=main_title[7],line=2)
#   bin4_tRNA <- c(bin4_table[1,14],bin4_table[1,15],bin4_table[1,18],bin4_table[1,16],bin4_table[1,17],bin4_table[1,19])
#   barplot(bin4_tRNA,col=color_tRNA,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,y_max_tRNA),las=2)
#   if(i==1)title(main=top_title[8],line=3)
#   if(i==1)title(main=main_title[8],line=2)
# }
#
# three_codon_aa <- 'ile'
# three_codon_codon <- c('AUA','AUU','AUC')
# color <- c('darkslategray3','darkslategray3','firebrick1')
# color_tRNA <- c('skyblue3','skyblue3','indianred3')
# y_lab <- 'Ile'
#
# codon_aa_table <- read.table('ile',header=FALSE)
#
# lacto_table <- codon_aa_table[c(18,20,21,23:27,29,30:34,35,46,48,49,50,53:64),]
# lacto_mean <- sapply(lacto_table,mean,na.rm=TRUE)
#
# oeoen_table <- codon_aa_table[c(22),]
# hon2_table <- codon_aa_table[c(51),]
# bin4_table <- codon_aa_table[c(52),]
#
# lacto_high <- c(lacto_mean[[2]],lacto_mean[[3]],lacto_mean[[4]])
# barplot(lacto_high,col=color,names.arg=three_codon_codon[[i]],cex.names=font_names,ylim=y_lim,las=2,ylab=y_lab,cex.lab=font_label,space=1)
# lacto_tRNA <- c(lacto_mean[[8]],lacto_mean[[9]],lacto_mean[[10]])
# barplot(lacto_tRNA,col=color_tRNA,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,y_max_tRNA),las=2,space=1)
#
# oeoen_high <- c(oeoen_table[1,2],oeoen_table[1,3],oeoen_table[1,4])
# barplot(oeoen_high,col=color,names.arg=three_codon_codon[[i]],cex.names=font_names,ylim=y_lim,las=2,space=1)
# oeoen_tRNA <- c(oeoen_table[1,8],oeoen_table[1,9],oeoen_table[1,10])
# barplot(oeoen_tRNA,col=color_tRNA,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,y_max_tRNA),las=2,space=1)
#
# hon2_high <- c(hon2_table[1,2],hon2_table[1,3],hon2_table[1,4])
# barplot(hon2_high,col=color,names.arg=three_codon_codon[[i]],cex.names=font_names,ylim=y_lim,las=2,space=1)
# hon2_tRNA <- c(hon2_table[1,8],hon2_table[1,9],hon2_table[1,10])
# barplot(hon2_tRNA,col=color_tRNA,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,y_max_tRNA),las=2,space=1)
#
# bin4_high <- c(bin4_table[1,2],bin4_table[1,3],bin4_table[1,4])
# barplot(bin4_high,col=color,names.arg=three_codon_codon[[i]],cex.names=font_names,ylim=y_lim,las=2,space=1)
# bin4_tRNA <- c(bin4_table[1,8],bin4_table[1,9],bin4_table[1,10])
# barplot(bin4_tRNA,col=color_tRNA,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,y_max_tRNA),las=2,space=1)
#
# dev.off()
# Main figure (Figure 7): plotted for the highly expressed genes only, without
# the "no translational selection" category. G. vaginalis (Gvag),
# L. delbrueckii (LbDel) and L. fermentum (LbFerI) each get their own panel.
#
# Layout: 4 rows x 5 columns of bar plots. Rows 1-3 are the amino acids listed
# in six_codon_aa (one input table each), row 4 is Ile. Within a row the panels
# are: Lactobacillus mean, G. vaginalis, L. delbrueckii, L. fermentum,
# Bifidobacterium mean (titles come from main_title, first row only).
#
# Requires six_codon_aa, six_codon_codon, color, y_lab, y_lim, font_names,
# font_label and main_title to exist in the workspace before this runs.

# Rows of the input tables that make up each group / single genome.
# NOTE(review): row 55 is plotted on its own as L. delbrueckii but is also
# inside 53:64 and therefore part of the Lactobacillus mean - confirm intended.
lacto_rows <- c(18, 20, 21, 23:27, 29:35, 46, 48:50, 53:64)
bifido_rows <- c(4:8, 10:17)

# Columns holding the values plotted here. For the six-codon tables the order
# 2,3,6,4,5,7 presumably lines the columns up with the labels in
# six_codon_codon - TODO confirm against the input file layout.
six_high_cols <- c(2, 3, 6, 4, 5, 7)
three_high_cols <- 2:4

pdf("Figure7 six_codon.pdf", width = 6.75, height = 5.5)
# Everything that draws is wrapped so the PDF device is closed even when a
# table cannot be read or a plot call fails; otherwise the device stays open
# and later plots silently go into this file.
invisible(tryCatch({
  par(mfrow = c(4, 5))

  for (i in 1:3) {
    codon_aa_table <- read.table(six_codon_aa[i], header = FALSE)
    stopifnot(ncol(codon_aa_table) >= max(six_high_cols))

    lbferi_table <- codon_aa_table[19, ]
    lbdel_table <- codon_aa_table[55, ]
    lacto_table <- codon_aa_table[lacto_rows, ]
    lacto_mean <- vapply(lacto_table, mean, numeric(1), na.rm = TRUE)
    bifido_table <- codon_aa_table[bifido_rows, ]
    bifido_mean <- vapply(bifido_table, mean, numeric(1), na.rm = TRUE)
    gvag_table <- codon_aa_table[9, ]

    # Only the first panel of a row carries the amino-acid axis label.
    lacto_high <- unname(lacto_mean[six_high_cols])
    barplot(lacto_high, col = color, names.arg = six_codon_codon[[i]],
            cex.names = font_names, ylim = y_lim, las = 2,
            ylab = y_lab[i], cex.lab = font_label)
    if (i == 1) title(main = main_title[1], line = 3)

    gvag_high <- unlist(gvag_table[1, six_high_cols], use.names = FALSE)
    barplot(gvag_high, col = color, names.arg = six_codon_codon[[i]],
            cex.names = font_names, ylim = y_lim, las = 2)
    if (i == 1) title(main = main_title[2], line = 3)

    lbdel_high <- unlist(lbdel_table[1, six_high_cols], use.names = FALSE)
    barplot(lbdel_high, col = color, names.arg = six_codon_codon[[i]],
            cex.names = font_names, ylim = y_lim, las = 2)
    if (i == 1) title(main = main_title[3], line = 3)

    lbferi_high <- unlist(lbferi_table[1, six_high_cols], use.names = FALSE)
    barplot(lbferi_high, col = color, names.arg = six_codon_codon[[i]],
            cex.names = font_names, ylim = y_lim, las = 2)
    if (i == 1) title(main = main_title[4], line = 3)

    bifido_high <- unname(bifido_mean[six_high_cols])
    barplot(bifido_high, col = color, names.arg = six_codon_codon[[i]],
            cex.names = font_names, ylim = y_lim, las = 2)
    if (i == 1) title(main = main_title[5], line = 3)
  }

  # Bottom row: Ile, which has three codons.
  # NOTE: color, color_tRNA and y_lab are overwritten with their three-codon
  # versions here; restore the six-codon settings before re-running the loop.
  three_codon_aa <- "ile"
  three_codon_codon <- c("AUA", "AUU", "AUC")
  color <- c("darkslategray3", "darkslategray3", "firebrick1")
  color_tRNA <- c("skyblue3", "skyblue3", "indianred3")
  y_lab <- "Ile"

  codon_aa_table <- read.table(three_codon_aa, header = FALSE)
  stopifnot(ncol(codon_aa_table) >= max(three_high_cols))

  lbferi_table <- codon_aa_table[19, ]
  lbdel_table <- codon_aa_table[55, ]
  lacto_table <- codon_aa_table[lacto_rows, ]
  lacto_mean <- vapply(lacto_table, mean, numeric(1), na.rm = TRUE)
  bifido_table <- codon_aa_table[bifido_rows, ]
  bifido_mean <- vapply(bifido_table, mean, numeric(1), na.rm = TRUE)
  gvag_table <- codon_aa_table[9, ]

  lacto_high <- unname(lacto_mean[three_high_cols])
  barplot(lacto_high, col = color, names.arg = three_codon_codon,
          cex.names = font_names, ylim = y_lim, las = 2,
          ylab = y_lab, cex.lab = font_label, space = 1)

  gvag_high <- unlist(gvag_table[1, three_high_cols], use.names = FALSE)
  barplot(gvag_high, col = color, names.arg = three_codon_codon,
          cex.names = font_names, ylim = y_lim, las = 2, space = 1)

  lbdel_high <- unlist(lbdel_table[1, three_high_cols], use.names = FALSE)
  barplot(lbdel_high, col = color, names.arg = three_codon_codon,
          cex.names = font_names, ylim = y_lim, las = 2, space = 1)

  lbferi_high <- unlist(lbferi_table[1, three_high_cols], use.names = FALSE)
  barplot(lbferi_high, col = color, names.arg = three_codon_codon,
          cex.names = font_names, ylim = y_lim, las = 2, space = 1)

  bifido_high <- unname(bifido_mean[three_high_cols])
  barplot(bifido_high, col = color, names.arg = three_codon_codon,
          cex.names = font_names, ylim = y_lim, las = 2, space = 1)
}, finally = dev.off()))
# Settings shared by the six-codon panels.

# Amino acids with six codons; each entry is also the name of the table file
# that read.table() loads for that amino acid.
six_codon_aa <- c("arg", "leu", "ser")

# Bar labels, one vector per entry of six_codon_aa (same order).
six_codon_codon <- list(
  c("CGA", "CGU", "AGA", "CGG", "CGC", "AGG"),
  c("CUA", "CUU", "UUA", "CUG", "CUC", "UUG"),
  c("UCA", "UCU", "AGU", "UCG", "UCC", "AGC")
)
y_lab <- c("Arg", "Leu", "Ser")

# Bar colours: first three bars in one colour, last three in the other.
color <- rep(c("darkslategray3", "firebrick1"), each = 3)
color_tRNA <- rep(c("skyblue3", "indianred3"), each = 3)

# Text sizes (bar labels, axis label) and axis limits.
font_names <- 1
font_label <- 1.5
y_lim <- c(0, 0.8)
y_max_tRNA <- 4

# Column headings of the figure, left to right.
main_title <- c(
  "Lactobacillus (GC3s~30%)",
  "G. vaginalis (GC3s~32%)",
  "L. delbrueckii (GC3s~62%)",
  "L. fermentum (GC3s~63%)",
  "Bifidobacterium (GC3s~75%)"
)
#main_title <- c('high','tRNA','high','tRNA','high','tRNA','high','tRNA')
#top_title <- c('Lactobacillus','Lactobacillus','O. oeni','O. oeni','L. mellis','L. mellis','L. mellifer','L. mellifer')
#main_title_bifido <- c('All','High','tRNA','All','High','tRNA')
#top_title_bifido <- c(NA,'Bifidobacteria (GC3~75%)',NA,NA,'G. vaginalis (GC3~32%)',NA)
#main_title_lacto <- c('All','High','tRNA','All','High','tRNA','All','High','tRNA')
#top_title_lacto <- c(NA,'Lactobacillus(GC3~30%)',NA,NA,'Mutation shift(GC3~62%)',NA,NA,'No selection(GC3~30%)',NA)
# analyses figure for lactobacillus no translational selection category 1) plotted for highly expressed genes and tRNA 2) Oeoen Lmellis Lmellifer plotted separately
# pdf("six_codon_no_selection.pdf",width=16,height=8)
# par(mfrow=c(4,8))
#
#
# for (i in 1:3){
#   codon_aa_table <- read.table(six_codon_aa[i],header=FALSE)
#
#   lacto_table <- codon_aa_table[c(18,20,21,23:27,29,30:34,35,46,48,49,50,53:64),]
#   lacto_mean <- sapply(lacto_table,mean,na.rm=TRUE)
#
#   oeoen_table <- codon_aa_table[c(22),]
#   hon2_table <- codon_aa_table[c(51),]
#   bin4_table <- codon_aa_table[c(52),]
#
#   lacto_high <- c(lacto_mean[[2]],lacto_mean[[3]],lacto_mean[[6]],lacto_mean[[4]],lacto_mean[[5]],lacto_mean[[7]])
#   barplot(lacto_high,col=color,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=y_lim,las=2,ylab=y_lab[i],cex.lab=font_label)
#   if(i==1)title(main=top_title[1],line=3)
#   if(i==1)title(main=main_title[1],line=2)
#   lacto_tRNA <- c(lacto_mean[[14]],lacto_mean[[15]],lacto_mean[[18]],lacto_mean[[16]],lacto_mean[[17]],lacto_mean[[19]])
#   barplot(lacto_tRNA,col=color_tRNA,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,y_max_tRNA),las=2)
#   if(i==1)title(main=top_title[2],line=3)
#   if(i==1)title(main=main_title[2],line=2)
#
#   oeoen_high <- c(oeoen_table[1,2],oeoen_table[1,3],oeoen_table[1,6],oeoen_table[1,4],oeoen_table[1,5],oeoen_table[1,7])
#   barplot(oeoen_high,col=color,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=y_lim,las=2)
#   if(i==1)title(main=top_title[3],line=3)
#   if(i==1)title(main=main_title[3],line=2)
#   oeoen_tRNA <- c(oeoen_table[1,14],oeoen_table[1,15],oeoen_table[1,18],oeoen_table[1,16],oeoen_table[1,17],oeoen_table[1,19])
#   barplot(oeoen_tRNA,col=color_tRNA,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,y_max_tRNA),las=2)
#   if(i==1)title(main=top_title[4],line=3)
#   if(i==1)title(main=main_title[4],line=2)
#
#   hon2_high <- c(hon2_table[1,2],hon2_table[1,3],hon2_table[1,6],hon2_table[1,4],hon2_table[1,5],hon2_table[1,7])
#   barplot(hon2_high,col=color,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=y_lim,las=2)
#   if(i==1)title(main=top_title[5],line=3)
#   if(i==1)title(main=main_title[5],line=2)
#   hon2_tRNA <- c(hon2_table[1,14],hon2_table[1,15],hon2_table[1,18],hon2_table[1,16],hon2_table[1,17],hon2_table[1,19])
#   barplot(hon2_tRNA,col=color_tRNA,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,y_max_tRNA),las=2)
#   if(i==1)title(main=top_title[6],line=3)
#   if(i==1)title(main=main_title[6],line=2)
#
#   bin4_high <- c(bin4_table[1,2],bin4_table[1,3],bin4_table[1,6],bin4_table[1,4],bin4_table[1,5],bin4_table[1,7])
#   barplot(bin4_high,col=color,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=y_lim,las=2)
#   if(i==1)title(main=top_title[7],line=3)
#   if(i==1)title(main=main_title[7],line=2)
#   bin4_tRNA <- c(bin4_table[1,14],bin4_table[1,15],bin4_table[1,18],bin4_table[1,16],bin4_table[1,17],bin4_table[1,19])
#   barplot(bin4_tRNA,col=color_tRNA,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,y_max_tRNA),las=2)
#   if(i==1)title(main=top_title[8],line=3)
#   if(i==1)title(main=main_title[8],line=2)
# }
#
# three_codon_aa <- 'ile'
# three_codon_codon <- c('AUA','AUU','AUC')
# color <- c('darkslategray3','darkslategray3','firebrick1')
# color_tRNA <- c('skyblue3','skyblue3','indianred3')
# y_lab <- 'Ile'
#
# codon_aa_table <- read.table('ile',header=FALSE)
#
# lacto_table <- codon_aa_table[c(18,20,21,23:27,29,30:34,35,46,48,49,50,53:64),]
# lacto_mean <- sapply(lacto_table,mean,na.rm=TRUE)
#
# oeoen_table <- codon_aa_table[c(22),]
# hon2_table <- codon_aa_table[c(51),]
# bin4_table <- codon_aa_table[c(52),]
#
# lacto_high <- c(lacto_mean[[2]],lacto_mean[[3]],lacto_mean[[4]])
# barplot(lacto_high,col=color,names.arg=three_codon_codon[[i]],cex.names=font_names,ylim=y_lim,las=2,ylab=y_lab,cex.lab=font_label,space=1)
# lacto_tRNA <- c(lacto_mean[[8]],lacto_mean[[9]],lacto_mean[[10]])
# barplot(lacto_tRNA,col=color_tRNA,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,y_max_tRNA),las=2,space=1)
#
# oeoen_high <- c(oeoen_table[1,2],oeoen_table[1,3],oeoen_table[1,4])
# barplot(oeoen_high,col=color,names.arg=three_codon_codon[[i]],cex.names=font_names,ylim=y_lim,las=2,space=1)
# oeoen_tRNA <- c(oeoen_table[1,8],oeoen_table[1,9],oeoen_table[1,10])
# barplot(oeoen_tRNA,col=color_tRNA,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,y_max_tRNA),las=2,space=1)
#
# hon2_high <- c(hon2_table[1,2],hon2_table[1,3],hon2_table[1,4])
# barplot(hon2_high,col=color,names.arg=three_codon_codon[[i]],cex.names=font_names,ylim=y_lim,las=2,space=1)
# hon2_tRNA <- c(hon2_table[1,8],hon2_table[1,9],hon2_table[1,10])
# barplot(hon2_tRNA,col=color_tRNA,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,y_max_tRNA),las=2,space=1)
#
# bin4_high <- c(bin4_table[1,2],bin4_table[1,3],bin4_table[1,4])
# barplot(bin4_high,col=color,names.arg=three_codon_codon[[i]],cex.names=font_names,ylim=y_lim,las=2,space=1)
# bin4_tRNA <- c(bin4_table[1,8],bin4_table[1,9],bin4_table[1,10])
# barplot(bin4_tRNA,col=color_tRNA,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,y_max_tRNA),las=2,space=1)
#
# dev.off()
# Main figure (Figure 7): plotted for the highly expressed genes only, without
# the "no translational selection" category. G. vaginalis (Gvag),
# L. delbrueckii (LbDel) and L. fermentum (LbFerI) each get their own panel.
#
# Layout: 4 rows x 5 columns of bar plots. Rows 1-3 are the amino acids listed
# in six_codon_aa (one input table each), row 4 is Ile. Within a row the panels
# are: Lactobacillus mean, G. vaginalis, L. delbrueckii, L. fermentum,
# Bifidobacterium mean (titles come from main_title, first row only).
#
# Requires six_codon_aa, six_codon_codon, color, y_lab, y_lim, font_names,
# font_label and main_title to exist in the workspace before this runs.

# Rows of the input tables that make up each group / single genome.
# NOTE(review): row 55 is plotted on its own as L. delbrueckii but is also
# inside 53:64 and therefore part of the Lactobacillus mean - confirm intended.
lacto_rows <- c(18, 20, 21, 23:27, 29:35, 46, 48:50, 53:64)
bifido_rows <- c(4:8, 10:17)

# Columns holding the values plotted here. For the six-codon tables the order
# 2,3,6,4,5,7 presumably lines the columns up with the labels in
# six_codon_codon - TODO confirm against the input file layout.
six_high_cols <- c(2, 3, 6, 4, 5, 7)
three_high_cols <- 2:4

pdf("Figure7 six_codon.pdf", width = 6.75, height = 5.5)
# Everything that draws is wrapped so the PDF device is closed even when a
# table cannot be read or a plot call fails; otherwise the device stays open
# and later plots silently go into this file.
invisible(tryCatch({
  par(mfrow = c(4, 5))

  for (i in 1:3) {
    codon_aa_table <- read.table(six_codon_aa[i], header = FALSE)
    stopifnot(ncol(codon_aa_table) >= max(six_high_cols))

    lbferi_table <- codon_aa_table[19, ]
    lbdel_table <- codon_aa_table[55, ]
    lacto_table <- codon_aa_table[lacto_rows, ]
    lacto_mean <- vapply(lacto_table, mean, numeric(1), na.rm = TRUE)
    bifido_table <- codon_aa_table[bifido_rows, ]
    bifido_mean <- vapply(bifido_table, mean, numeric(1), na.rm = TRUE)
    gvag_table <- codon_aa_table[9, ]

    # Only the first panel of a row carries the amino-acid axis label.
    lacto_high <- unname(lacto_mean[six_high_cols])
    barplot(lacto_high, col = color, names.arg = six_codon_codon[[i]],
            cex.names = font_names, ylim = y_lim, las = 2,
            ylab = y_lab[i], cex.lab = font_label)
    if (i == 1) title(main = main_title[1], line = 3)

    gvag_high <- unlist(gvag_table[1, six_high_cols], use.names = FALSE)
    barplot(gvag_high, col = color, names.arg = six_codon_codon[[i]],
            cex.names = font_names, ylim = y_lim, las = 2)
    if (i == 1) title(main = main_title[2], line = 3)

    lbdel_high <- unlist(lbdel_table[1, six_high_cols], use.names = FALSE)
    barplot(lbdel_high, col = color, names.arg = six_codon_codon[[i]],
            cex.names = font_names, ylim = y_lim, las = 2)
    if (i == 1) title(main = main_title[3], line = 3)

    lbferi_high <- unlist(lbferi_table[1, six_high_cols], use.names = FALSE)
    barplot(lbferi_high, col = color, names.arg = six_codon_codon[[i]],
            cex.names = font_names, ylim = y_lim, las = 2)
    if (i == 1) title(main = main_title[4], line = 3)

    bifido_high <- unname(bifido_mean[six_high_cols])
    barplot(bifido_high, col = color, names.arg = six_codon_codon[[i]],
            cex.names = font_names, ylim = y_lim, las = 2)
    if (i == 1) title(main = main_title[5], line = 3)
  }

  # Bottom row: Ile, which has three codons.
  # NOTE: color, color_tRNA and y_lab are overwritten with their three-codon
  # versions here; restore the six-codon settings before re-running the loop.
  three_codon_aa <- "ile"
  three_codon_codon <- c("AUA", "AUU", "AUC")
  color <- c("darkslategray3", "darkslategray3", "firebrick1")
  color_tRNA <- c("skyblue3", "skyblue3", "indianred3")
  y_lab <- "Ile"

  codon_aa_table <- read.table(three_codon_aa, header = FALSE)
  stopifnot(ncol(codon_aa_table) >= max(three_high_cols))

  lbferi_table <- codon_aa_table[19, ]
  lbdel_table <- codon_aa_table[55, ]
  lacto_table <- codon_aa_table[lacto_rows, ]
  lacto_mean <- vapply(lacto_table, mean, numeric(1), na.rm = TRUE)
  bifido_table <- codon_aa_table[bifido_rows, ]
  bifido_mean <- vapply(bifido_table, mean, numeric(1), na.rm = TRUE)
  gvag_table <- codon_aa_table[9, ]

  lacto_high <- unname(lacto_mean[three_high_cols])
  barplot(lacto_high, col = color, names.arg = three_codon_codon,
          cex.names = font_names, ylim = y_lim, las = 2,
          ylab = y_lab, cex.lab = font_label, space = 1)

  gvag_high <- unlist(gvag_table[1, three_high_cols], use.names = FALSE)
  barplot(gvag_high, col = color, names.arg = three_codon_codon,
          cex.names = font_names, ylim = y_lim, las = 2, space = 1)

  lbdel_high <- unlist(lbdel_table[1, three_high_cols], use.names = FALSE)
  barplot(lbdel_high, col = color, names.arg = three_codon_codon,
          cex.names = font_names, ylim = y_lim, las = 2, space = 1)

  lbferi_high <- unlist(lbferi_table[1, three_high_cols], use.names = FALSE)
  barplot(lbferi_high, col = color, names.arg = three_codon_codon,
          cex.names = font_names, ylim = y_lim, las = 2, space = 1)

  bifido_high <- unname(bifido_mean[three_high_cols])
  barplot(bifido_high, col = color, names.arg = three_codon_codon,
          cex.names = font_names, ylim = y_lim, las = 2, space = 1)
}, finally = dev.off()))
# pdf("six_codon_bifido.pdf",width=12,height=8)
# par(mfrow=c(4,6))
#
# for (i in 1:3){
#   codon_aa_table <- read.table(six_codon_aa[i],header=FALSE)
#
#   high_gc_aa_table <- codon_aa_table[c(4:8,10:17),]
#   high_gc_mean <- sapply(high_gc_aa_table,mean,na.rm=TRUE)
#
#   low_gc_aa_table <- codon_aa_table[c(9),]
#   low_gc_mean <- sapply(low_gc_aa_table,mean,na.rm=TRUE)
#
#   high_all_mean <- c(high_gc_mean[[8]],high_gc_mean[[9]],high_gc_mean[[12]],high_gc_mean[[10]],high_gc_mean[[11]],high_gc_mean[[13]])
#   barplot(high_all_mean,col=color,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,1),las=2,ylab=y_lab[i],cex.lab=font_label)
#   if(i==1)title(main=top_title_bifido[1],line=3)
#   if(i==1)title(main=main_title_bifido[1],line=2)
#   high_high_mean <- c(high_gc_mean[[2]],high_gc_mean[[3]],high_gc_mean[[6]],high_gc_mean[[4]],high_gc_mean[[5]],high_gc_mean[[7]])
#   barplot(high_high_mean,col=color,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,1),las=2)
#   if(i==1)title(main=top_title_bifido[2],line=3)
#   if(i==1)title(main=main_title_bifido[2],line=2)
#   high_tRNA_mean <- c(high_gc_mean[[14]],high_gc_mean[[15]],high_gc_mean[[18]],high_gc_mean[[16]],high_gc_mean[[17]],high_gc_mean[[19]])
#   barplot(high_tRNA_mean,col=color_tRNA,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,y_max_tRNA),las=2)
#   if(i==1)title(main=top_title_bifido[3],line=3)
#   if(i==1)title(main=main_title_bifido[3],line=2)
#
#   low_all_mean <- c(low_gc_mean[[8]],low_gc_mean[[9]],low_gc_mean[[12]],low_gc_mean[[10]],low_gc_mean[[11]],low_gc_mean[[13]])
#   barplot(low_all_mean,col=color,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,1),las=2)
#   if(i==1)title(main=top_title_bifido[4],line=3)
#   if(i==1)title(main=main_title_bifido[4],line=2)
#   low_high_mean <- c(low_gc_mean[[2]],low_gc_mean[[3]],low_gc_mean[[6]],low_gc_mean[[4]],low_gc_mean[[5]],low_gc_mean[[7]])
#   barplot(low_high_mean,col=color,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,1),las=2)
#   if(i==1)title(main=top_title_bifido[5],line=3)
#   if(i==1)title(main=main_title_bifido[5],line=2)
#   low_tRNA_mean <- c(low_gc_mean[[14]],low_gc_mean[[15]],low_gc_mean[[18]],low_gc_mean[[16]],low_gc_mean[[17]],low_gc_mean[[19]])
#   barplot(low_tRNA_mean,col=color_tRNA,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,y_max_tRNA),las=2)
#   if(i==1)title(main=top_title_bifido[6],line=3)
#   if(i==1)title(main=main_title_bifido[6],line=2)
#
# }
#
# three_codon_aa <- 'ile'
# three_codon_codon <- c('AUA','AUU','AUC')
# three_color <- c('darkslategray3','darkslategray3','firebrick1')
# three_color_tRNA <- c('skyblue3','skyblue3','indianred3')
# y_lab <- 'Ile'
#
# codon_aa_table <- read.table('ile',header=FALSE)
#
# high_gc_aa_table <- codon_aa_table[c(4:8,10:17),]
# high_gc_mean <- sapply(high_gc_aa_table,mean,na.rm=TRUE)
#
# low_gc_aa_table <- codon_aa_table[c(9),]
# low_gc_mean <- sapply(low_gc_aa_table,mean,na.rm=TRUE)
#
# high_all_mean <- c(high_gc_mean[[5]],high_gc_mean[[6]],high_gc_mean[[7]])
# barplot(high_all_mean,col=three_color,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,1),las=2,space=1,ylab=y_lab,cex.lab=font_label)
# high_high_mean <- c(high_gc_mean[[2]],high_gc_mean[[3]],high_gc_mean[[4]])
# barplot(high_high_mean,col=three_color,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,1),las=2,space=1)
# high_tRNA_mean <- c(high_gc_mean[[8]],high_gc_mean[[9]],high_gc_mean[[10]])
# barplot(high_tRNA_mean,col=three_color_tRNA,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,y_max_tRNA),las=2,space=1)
#
# low_all_mean <- c(low_gc_mean[[5]],low_gc_mean[[6]],low_gc_mean[[7]])
# barplot(low_all_mean,col=three_color,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,1),las=2,space=1)
# low_high_mean <- c(low_gc_mean[[2]],low_gc_mean[[3]],low_gc_mean[[4]])
# barplot(low_high_mean,col=three_color,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,1),las=2,space=1)
# low_tRNA_mean <- c(low_gc_mean[[8]],low_gc_mean[[9]],low_gc_mean[[10]])
# barplot(low_tRNA_mean,col=three_color_tRNA,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,y_max_tRNA),las=2,space=1)
#
# dev.off()
#
# pdf("six_codon_lacto.pdf",width=15,height=8)
# par(mfrow=c(4,9))
#
# for (i in 1:3){
#   codon_aa_table <- read.table(six_codon_aa[i],header=FALSE)
#
#   high_gc_aa_table <- codon_aa_table[c(19,55),]
#   high_gc_mean <- sapply(high_gc_aa_table,mean,na.rm=TRUE)
#
#   relaxed_aa_table <- codon_aa_table[c(22,51,52),]
#   relaxed_mean <- sapply(relaxed_aa_table,mean,na.rm=TRUE)
#
#   low_gc_aa_table <- codon_aa_table[c(18,20,21,23:27,29,30:34,35,46,48,49,50,53:64),]
#   low_gc_mean <- sapply(low_gc_aa_table,mean,na.rm=TRUE)
#
#   low_all_mean <- c(low_gc_mean[[8]],low_gc_mean[[9]],low_gc_mean[[12]],low_gc_mean[[10]],low_gc_mean[[11]],low_gc_mean[[13]])
#   barplot(low_all_mean,col=color,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,1),ylab=y_lab[i],cex.lab=font_label,las=2)
#   if(i==1)title(main=top_title_lacto[1],line=3)
#   if(i==1)title(main=main_title_lacto[1],line=2)
#   low_high_mean <- c(low_gc_mean[[2]],low_gc_mean[[3]],low_gc_mean[[6]],low_gc_mean[[4]],low_gc_mean[[5]],low_gc_mean[[7]])
#   barplot(low_high_mean,col=color,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,1),las=2)
#   if(i==1)title(main=top_title_lacto[2],line=3)
#   if(i==1)title(main=main_title_lacto[2],line=2)
#   low_tRNA_mean <- c(low_gc_mean[[14]],low_gc_mean[[15]],low_gc_mean[[18]],low_gc_mean[[16]],low_gc_mean[[17]],low_gc_mean[[19]])
#   barplot(low_tRNA_mean,col=color_tRNA,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,y_max_tRNA),las=2)
#   if(i==1)title(main=top_title_lacto[3],line=3)
#   if(i==1)title(main=main_title_lacto[3],line=2)
#
#   high_all_mean <- c(high_gc_mean[[8]],high_gc_mean[[9]],high_gc_mean[[12]],high_gc_mean[[10]],high_gc_mean[[11]],high_gc_mean[[13]])
#   barplot(high_all_mean,col=color,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,1),las=2)
#   if(i==1)title(main=top_title_lacto[4],line=3)
#   if(i==1)title(main=main_title_lacto[4],line=2)
#   high_high_mean <- c(high_gc_mean[[2]],high_gc_mean[[3]],high_gc_mean[[6]],high_gc_mean[[4]],high_gc_mean[[5]],high_gc_mean[[7]])
#   barplot(high_high_mean,col=color,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,1),las=2)
#   if(i==1)title(main=top_title_lacto[5],line=3)
#   if(i==1)title(main=main_title_lacto[5],line=2)
#   high_tRNA_mean <- c(high_gc_mean[[14]],high_gc_mean[[15]],high_gc_mean[[18]],high_gc_mean[[16]],high_gc_mean[[17]],high_gc_mean[[19]])
#   barplot(high_tRNA_mean,col=color_tRNA,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,y_max_tRNA),las=2)
#   if(i==1)title(main=top_title_lacto[6],line=3)
#   if(i==1)title(main=main_title_lacto[6],line=2)
#
#   relaxed_all_mean <- c(relaxed_mean[[8]],relaxed_mean[[9]],relaxed_mean[[12]],relaxed_mean[[10]],relaxed_mean[[11]],relaxed_mean[[13]])
#   barplot(relaxed_all_mean,col=color,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,1),las=2)
#   if(i==1)title(main=top_title_lacto[7],line=3)
#   if(i==1)title(main=main_title_lacto[7],line=2)
#   relaxed_high_mean <- c(relaxed_mean[[2]],relaxed_mean[[3]],relaxed_mean[[6]],relaxed_mean[[4]],relaxed_mean[[5]],relaxed_mean[[7]])
#   barplot(relaxed_high_mean,col=color,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,1),las=2)
#   if(i==1)title(main=top_title_lacto[8],line=3)
#   if(i==1)title(main=main_title_lacto[8],line=2)
#   relaxed_tRNA_mean <- c(relaxed_mean[[14]],relaxed_mean[[15]],relaxed_mean[[18]],relaxed_mean[[16]],relaxed_mean[[17]],relaxed_mean[[19]])
#   barplot(relaxed_tRNA_mean,col=color_tRNA,names.arg=six_codon_codon[[i]],cex.names=font_names,ylim=c(0,y_max_tRNA),las=2)
#   if(i==1)title(main=top_title_lacto[9],line=3)
#   if(i==1)title(main=main_title_lacto[9],line=2)
# }
#
# three_codon_aa <- 'ile'
# three_codon_codon <- c('AUA','AUU','AUC')
# three_color <- c('darkslategray3','darkslategray3','firebrick1')
# three_color_tRNA <- c('skyblue3','skyblue3','indianred3')
# y_lab <- 'Ile'
#
# codon_aa_table <- read.table('ile',header=FALSE)
#
# high_gc_aa_table <- codon_aa_table[c(19,55),]
# high_gc_mean <- sapply(high_gc_aa_table,mean,na.rm=TRUE)
#
# relaxed_aa_table <- codon_aa_table[c(22,51,52),]
# relaxed_mean <- sapply(relaxed_aa_table,mean,na.rm=TRUE)
#
# low_gc_aa_table <- codon_aa_table[c(18,20,21,23:27,29,30:34,35,46,48,49,50,53:64),]
# low_gc_mean <- sapply(low_gc_aa_table,mean,na.rm=TRUE)
#
# low_all_mean <- c(low_gc_mean[[5]],low_gc_mean[[6]],low_gc_mean[[7]])
# barplot(low_all_mean,col=three_color,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,1),ylab=y_lab,cex.lab=font_label,las=2,space=1)
# low_high_mean <- c(low_gc_mean[[2]],low_gc_mean[[3]],low_gc_mean[[4]])
# barplot(low_high_mean,col=three_color,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,1),las=2,space=1)
# low_tRNA_mean <- c(low_gc_mean[[8]],low_gc_mean[[9]],low_gc_mean[[10]])
# barplot(low_tRNA_mean,col=three_color_tRNA,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,y_max_tRNA),las=2,space=1)
#
# high_all_mean <- c(high_gc_mean[[5]],high_gc_mean[[6]],high_gc_mean[[7]])
# barplot(high_all_mean,col=three_color,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,1),las=2,space=1)
# high_high_mean <- c(high_gc_mean[[2]],high_gc_mean[[3]],high_gc_mean[[4]])
# barplot(high_high_mean,col=three_color,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,1),las=2,space=1)
# high_tRNA_mean <- c(high_gc_mean[[8]],high_gc_mean[[9]],high_gc_mean[[10]])
# barplot(high_tRNA_mean,col=three_color_tRNA,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,y_max_tRNA),las=2,space=1)
#
# relaxed_all_mean <- c(relaxed_mean[[5]],relaxed_mean[[6]],relaxed_mean[[7]])
# barplot(relaxed_all_mean,col=three_color,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,1),las=2,space=1)
# relaxed_high_mean <- c(relaxed_mean[[2]],relaxed_mean[[3]],relaxed_mean[[4]])
# barplot(relaxed_high_mean,col=three_color,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,1),las=2,space=1)
# relaxed_tRNA_mean <- c(relaxed_mean[[8]],relaxed_mean[[9]],relaxed_mean[[10]])
# barplot(relaxed_tRNA_mean,col=three_color_tRNA,names.arg=three_codon_codon,cex.names=font_names,ylim=c(0,y_max_tRNA),las=2,space=1)
#
# dev.off()
# Re-read one per-amino-acid table. `i` is not set here: it is whatever value
# is left in the workspace (3 once the plotting loop above has finished).
# NOTE(review): looks like an interactive re-check - confirm which table is meant.
codon_aa_table <- read.table(six_codon_aa[i],header=FALSE)
# Load sscu from the per-user library, unload it again, then load it a second
# time - presumably to pick up a freshly rebuilt/installed copy of the package;
# verify before relying on this sequence.
library("sscu", lib.loc="~/R/x86_64-suse-linux-gnu-library/3.2")
detach("package:sscu", unload=TRUE)
library("sscu", lib.loc="~/R/x86_64-suse-linux-gnu-library/3.2")
---
title: "sscu user manual (0.99.0)"
author: "Yu Sun"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Vignette Title}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---
# Overview
The package sscu (Strength of Selected Codon Usage) calculates the selective profile of codon usage in bacterial species. Currently, the package has three functions: s_index, optimal_codons and optimal_mutation_index.
## s_index
The function calculates the S index based on [Paul Sharp's method](http://nar.oxfordjournals.org/content/33/4/1141.long). The method takes into account the background mutation rate (in the program, the two arguments genomic_cds_file and gc3 are used to calculate the mutation rate) and focuses only on codons with a universal translational advantage in all bacterial species (in the program, the argument high_cds_file is used to calculate these codons). Thus the S index can be used to quantify the strength of translational selection and is comparable among different species.
The argument high_cds_file must be specified with the input filepath for the highly expressed genes. The file should be a multifasta file containing 40 highly expressed genes, including elongation factor Tu, Ts, G, 50S ribosomal protein L1 to L6, L9 to L20, and 30S ribosomal protein S2 to S20. This file can be generated either by directly extracting these DNA sequences from a GenBank file or by a BLAST search. For four amino acids (Phe, Tyr, Ile and Asn), the C-ending codons are universally preferred over the U-ending codons. Thus, only the codons of these four amino acids are taken into account in the analyses.
The second argument, genomic_cds_file or gc3, is used to calculate the genomic mutation rate, and one of them must be specified. The genomic_cds_file should be a multifasta file containing all the coding sequences in the genome, and the function uses it to calculate the genomic GC3 and mutation rate. If the GC3 value for the genome is already known, you can specify it in the argument gc3. If both the genomic_cds_file and gc3 arguments are specified, the function will use the genomic_cds_file to calculate the mutation rate and ignore the gc3 argument.
Here is an example of how to run it:
```{r}
# S index for the L. kunkeei example sequences shipped inside the sscu package.
sscu::s_index(high_cds_file=system.file("sequences/L_kunkeei_highly.ffn",package="sscu"),
genomic_cds_file=system.file("sequences/L_kunkeei_genome_cds.ffn",package="sscu"))
```
In general, an S index > 0.3 indicates that translational selection is affecting codon usage in the genome. An S index of 1.54 suggests strong translational selection in L. kunkeei, which is even higher than in E. coli (1.49).
## optimal_codons
If you want to know the detailed codon usage information, i.e. which codons are preferred by selection (optimal codons), you can run the function optimal_codons in the package. The optimal codons are defined as codons significantly enriched in the highly expressed genes compared to the lowly expressed genes, or another set of reference genes. The function calculates the optimal codon list with p-values, so the user can get a general idea of which codons are preferred by selection in the genome.
The argument high_cds_file should specify the path to the highly expressed gene dataset. It is up to the user how to define the set of highly expressed genes. Some studies use expression data or the Nc value to divide genes into highly and lowly expressed sets. Other studies use a specific dataset, such as one that includes only the very highly expressed genes (ribosomal genes).
The argument ref_cds_file should specify the path to the lowly expressed gene dataset, or any other appropriate dataset. In Sharp PM's paper (Forces that influence the evolution of codon bias), the set of all genes was used as the neutral reference, and this also yielded a list of optimal codons.
The argument p_cutoff sets the cutoff for p-values in the chi-square test. Codons that are significantly enriched in the highly expressed genes are marked with a + symbol in the output tables. Codons that are significantly under-represented in the highly expressed genes are marked with a - symbol. Codons whose usage does not differ significantly from the reference dataset are marked with NA.
The function also outputs the RSCU values for the highly expressed dataset and the reference dataset.
```{r}
# Optimal codons for the bundled L. kunkeei data; all genomic CDS serve as the reference set.
sscu::optimal_codons(high_cds_file=system.file("sequences/L_kunkeei_highly.ffn",package="sscu"),
ref_cds_file=system.file("sequences/L_kunkeei_genome_cds.ffn",package="sscu"))
```
## optimal_mutation_index
The function optimal_mutation_index estimates the amount of GC-ending optimal codons in the highly expressed genes against an AT-biased mutation background. Although the function focuses on the GC-ending optimal codons in the four- and six-codon boxes, it uses a mathematical formula similar to that of the S index, so the index is comparable with the S index. Like the S index, it also takes the background mutation rate into account. However, since the set of GC-ending optimal codons is likely to differ among species, the index cannot be compared among different species.
The argument high_cds_file must be specified with the input filepath for the highly expressed genes. The file should be a multifasta file containing 40 highly expressed genes, including elongation factor Tu, Ts, G, 50S ribosomal protein L1 to L6, L9 to L20, and 30S ribosomal protein S2 to S20. This file can be generated either by directly extracting these DNA sequences from a GenBank file or by a BLAST search. For four amino acids (Phe, Tyr, Ile and Asn), the C-ending codons are always preferred over the U-ending codons. Thus, only the codons of these four amino acids are taken into account in the analyses.
The argument genomic_cds_file is used to calculate the genomic mutation rate (gc3). The genomic_cds_file should be a multifasta file containing all the coding sequences in the genome, and the function uses it to calculate the genomic GC3 and mutation rate.
Currently, the function only calculates GC-ending optimal codons in an AT-biased mutation background, so a genomic GC3 higher than 50% will produce the error message "Genomic GC3 must be lower than 50%!". In addition, most AT-biased genomes do not have any GC-ending optimal codons for the four- and six-codon boxes, in which case the function will report NA as output.
```{r}
# Optimal mutation index for the L. kunkeei example sequences shipped inside the sscu package.
sscu::optimal_mutation_index(high_cds_file=system.file("sequences/L_kunkeei_highly.ffn",package="sscu"),
genomic_cds_file=system.file("sequences/L_kunkeei_genome_cds.ffn",package="sscu"))
```
# Attach seqinr, Biostrings, knitr and rmarkdown (in that order) from the
# per-user library directory, then install rmarkdown.
invisible(lapply(
  c("seqinr", "Biostrings", "knitr", "rmarkdown"),
  library,
  character.only = TRUE,
  lib.loc = "~/R/x86_64-suse-linux-gnu-library/3.2"
))
install.packages("rmarkdown")
# Show the package library locations.
# R_LIBS_USER is an environment variable, not an R object, so it has to be
# read with Sys.getenv(); evaluating the bare symbol stops with
# "object 'R_LIBS_USER' not found".
Sys.getenv("R_LIBS_USER")
# .Library is a base R object holding the default library location.
.Library
