This is the R Markdown for Extended Dat Fig1, which consists of 3 parts.
cd mutation_signature
python make_trinuc_maf.py Somatic_mutation.filter.208.maf Somatic_mutation.filter.trinuc.208.maf
cd ../somatic_mutaion
python main.py Stratton_signatures29_1.txt Somatic_mutation.filter.trinuc.208.maf Somatic_mutation.filter.trinuc.decomposed.208.maf
library(foreach)
cl=as.data.frame(readxl::read_excel("Supplementary+Table+1+Clinical+information+20190710(1).xlsx"))
rownames(cl)=paste("T",cl$`Sequencing ID`,sep="")
cl=cl[!is.element(rownames(cl),c("T37","T274")),]
cl$Gs=paste(cl$G1,cl$G2,sep="+")
cl$Gs[cl$Gs=="3+5"]=">=8"
cl$Gs[cl$Gs=="4+4"]=">=8"
cl$Gs[cl$Gs=="4+5"]=">=8"
cl$Gs[cl$Gs=="5+3"]=">=8"
cl$Gs[cl$Gs=="5+4"]=">=8"
cl$Gs[cl$Gs=="5+5"]=">=8"
cl$Gs[cl$Gs=="NA+NA"]="Not available"
cl$GS[cl$GS==8]=">=8"
cl$GS[cl$GS==9]=">=8"
cl$GS[cl$GS==10]=">=8"
cl$GS[cl$GS=="NA"]="Not available"
cl$`Pathology T`[cl$`Pathology T`=="pT2"]="pT2a/pT2b"
cl$`Pathology T`[cl$`Pathology T`=="pT2a"]="pT2a/pT2b"
cl$`Pathology T`[cl$`Pathology T`=="pT2b"]="pT2a/pT2b"
cl$`Pathology N`[cl$`Pathology N`=="pn0"]="pN0"
cl$`Pathology N`[cl$`Pathology N`=="pNX"]="pNx"
cl$`Pathology N`[cl$`Pathology N`=="pN0"]="Negative"
cl$`Pathology N`[cl$`Pathology N`=="pNx"]="Not available"
cl$`Pathology N`[cl$`Pathology N`=="pN1"]="Positive"
cl$BCR[cl$BCR=="NA"|cl$BCR=="Dead"]="Not available"
cl$`Surgical Margin`[cl$`Surgical Margin`=="NA"]="Not available"
cl$Extraprostatic[cl$Extraprostatic=="NA"]="Not available"
cl$`Seminal Vesical`[cl$`Seminal Vesical`=="NA"]="Not available"
cl$PSA[cl$PSA==">100"]=101
cl[cl=="NO"]="Negative"
cl[cl=="YES"]="Positive"
lc=c(23,14,20,15,10,19)
lc_sum=foreach(i=1:length(lc)) %do% table(cl[,lc[i]])
names(lc_sum)=c("Gleason Score","Pathologic Stage","PSA Recurrence","LymphNodes Metastasis","Bone Metastasis","Margin Satus")
cl$PSA=as.numeric(cl$PSA)
cl$Age=as.numeric(cl$Age)
## Warning: NAs introduced by coercion
age=paste(paste(median(na.omit(cl$Age)),paste(min(na.omit(cl$Age)),max(na.omit(cl$Age)),sep="-"),sep="("),"",sep=")")
psa=paste(paste(round(mean(na.omit(cl$PSA)),1),paste(min(na.omit(cl$PSA)),max(na.omit(cl$PSA)),sep="-"),sep="("),"",sep=")")
item=foreach(i=1:length(lc_sum),.combine=c) %do% paste(names(lc_sum)[i],names(lc_sum[[i]]),sep=": ")
value=foreach(i=1:length(lc_sum),.combine=c) %do% as.numeric(lc_sum[[i]])
table1=data.frame(Item=c("Age","PSA",item),CPGEA=c(age,psa,value))
table1=table1[c(1:2,4:6,3,7,8:12,15,13:14,18,16:17,20,19,23,21:22),]
#write.table(table1,"supplement_table1.xls",sep="\t",row.names = F,quote=F)
table1
## Item CPGEA
## 1 Age 69(50-88)
## 2 PSA 38.8(1.25-1270.14)
## 4 Gleason Score: 3+3 17
## 5 Gleason Score: 3+4 56
## 6 Gleason Score: 4+3 40
## 3 Gleason Score: >=8 94
## 7 Gleason Score: Not available 1
## 8 Pathologic Stage: pT2a/pT2b 28
## 9 Pathologic Stage: pT2c 77
## 10 Pathologic Stage: pT3a 60
## 11 Pathologic Stage: pT3b 33
## 12 Pathologic Stage: pT4 10
## 15 PSA Recurrence: Positive 60
## 13 PSA Recurrence: Negative 127
## 14 PSA Recurrence: Not available 21
## 18 LymphNodes Metastasis: Positive 23
## 16 LymphNodes Metastasis: Negative 155
## 17 LymphNodes Metastasis: Not available 30
## 20 Bone Metastasis: Positive 15
## 19 Bone Metastasis: Negative 193
## 23 Margin Satus: Positive 82
## 21 Margin Satus: Negative 124
## 22 Margin Satus: Not available 2
#signature
library(data.table)
library(ggplot2)
maf = suppressWarnings(fread("Somatic_mutation.filter.trinuc.decomposed.208.maf"))
maf=as.data.frame(maf)
freq1358916=sum(maf$`Number of Mutations`*rowSums(maf[,c(1,3,5,8,9,16)+2]))/sum(maf$`Number of Mutations`)
freq2461224=sum(maf$`Number of Mutations`*rowSums(maf[,c(2,4,6,12)+2]))/sum(maf$`Number of Mutations`)
freq135891630=sum(maf$`Number of Mutations`*rowSums(maf[,c(1,3,5,8,9,16,29)+2]))/sum(maf$`Number of Mutations`)
sig.dt=as.matrix(maf[,-c(1:2)])
row.names(sig.dt)=sub("_WGS","",maf$`Sample Name`)
sig.dt[sig.dt<0.06]=0
sig.dt=sig.dt[,colSums(sig.dt)!=0]
sig.dt=sig.dt*maf$`Number of Mutations`
sig.dt=sig.dt[order(sig.dt[,1],sig.dt[,2],sig.dt[,3],sig.dt[,4],sig.dt[,5],sig.dt[,6],sig.dt[,7],sig.dt[,8],sig.dt[,9],sig.dt[,10],sig.dt[,11],sig.dt[,12],sig.dt[,13],sig.dt[,14],sig.dt[,15],sig.dt[,16],sig.dt[,17]),]
dt=melt(sig.dt)
dt[dt==0]=NA
dt=dt[!is.na(dt$value),]
dt$value=log10(dt$value)
theme_bwmin = function(base_size = 12, base_family = 'Helvetica')
{
theme_bw(base_size = base_size, base_family = base_family) %+replace%
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
strip.background = element_blank(),
axis.line = element_line(size = 0.1),
axis.ticks = element_line(size = 0.1),
legend.key = element_rect(color = 'white'),
legend.key.size = unit(.25, 'cm'),
axis.text.x = element_blank(),axis.ticks.x = element_blank(),axis.line.x = element_blank(),
axis.title.x = element_blank(),
axis.text.y = element_text(size = 12)
)
}
grey_pal = colorRampPalette(RColorBrewer::brewer.pal(9, 'Greys'))(16) # Greys except for hand-picked signatures
sign_cols = c("#e41a1c","cyan4","#4daf4a","deeppink4","#E69F00","#ffff33","#ff7f00","#56B4E9","#a65628","#f781bf",grey_pal[4],"#b2df8a","#377eb8",grey_pal[5],"#ff7f00","#009E73",grey_pal[7],"#CC79A7",grey_pal[9],"#ff7f00",grey_pal[10],"#cab2d6",grey_pal[11],"#D55E00",grey_pal[13],"#ff7f00",grey_pal[14:15],"#984ea3")
names(sign_cols)=colnames(maf)[-c(1:2)]
sp_tab=table(dt$Var2)
sp_freq=sp_tab/208
#pdf("Mutation_Signature_distribution_208.pdf",width = 14,height = 9)
par(mfrow=c(2,1),mar=c(0,5,4,4))
bar=barplot(sp_freq,ylim=c(0,1),names.arg = NA,axes = F,ylab="Percentage of Samples",xlim=c(0,21),col = sign_cols[colnames(sig.dt)])
axis(2,at=seq(0,1,len=6),labels =paste(seq(0,100,len=6),sep="%"),las=1)
par(mar=c(5,5,1,4))
plot(0,0,xlim=c(0,21),ylim=c(0,log10(100000)),pch=16,axes = F,
xlab="Mutation Signature",ylab = "# Mutations",type="n")
axis(2,at=0:5,labels = c(0,10^(1:5)),las=1)
for(i in 1:ncol(sig.dt)){
sub_dt=dt[dt$Var2==colnames(sig.dt)[i],]
if(i%%2==0){
rect(bar[i]-0.6,0,bar[i]+0.6,5,col="gray90",border = NA)
}
points(seq(bar[i]-min(0.45,nrow(sub_dt)*0.01),bar[i]+min(0.45,nrow(sub_dt)*0.01),len=nrow(sub_dt)),sub_dt$value[order(sub_dt$value)],pch=16,col=sign_cols[colnames(sig.dt)[i]])
lines(c(bar[i]-0.25,bar[i]+0.25),rep(median(sub_dt$value),2))
}
text(bar,-0.3,sub("Signature.","",colnames(sig.dt)),xpd=T)
#dev.off()
library("ggpubr")
mt_210=read.table("Mutation_summary_Arv.xls",sep="\t",header=T,stringsAsFactors=F,check.name=F,row.names = 1)
dt=melt(sig.dt)
dt[dt==0]=NA
dt=dt[!is.na(dt$value),]
#signature 8
sub_dt=dt[dt$Var2==colnames(sig.dt)[7],]
rownames(sub_dt)=sub_dt$Var1
sp=intersect(as.character(sub_dt$Var1),rownames(mt_210))
df=data.frame(df_x=mt_210[sp,"GS"],df_y=sub_dt[sp,"value"])
df=df[!is.na(df$df_x),]
p=kruskal.test(formula = df_y~df$df_x, data = df,subset = NULL, na.action = na.omit)$p.value
ggboxplot(df,x="df_x",y="df_y",col="df_x",xlab="GS",ylab="# Mutations",add="jitter",title=paste(colnames(sig.dt)[7],": ","p=",round(p,4),sep=""),)
#signature 16
sub_dt=dt[dt$Var2==colnames(sig.dt)[12],]
rownames(sub_dt)=sub_dt$Var1
sp=intersect(as.character(sub_dt$Var1),rownames(mt_210))
df=data.frame(df_x=mt_210[sp,"GS"],df_y=sub_dt[sp,"value"])
df=df[!is.na(df$df_x),]
p=kruskal.test(formula = df_y~df$df_x, data = df,subset = NULL, na.action = na.omit)$p.value
ggboxplot(df,x="df_x",y="df_y",col="df_x",xlab="GS",ylab="# Mutations",add="jitter",title=paste(colnames(sig.dt)[12],": ","p=",round(p,4),sep=""),)