This is the R Markdown for Extended Dat Fig1, which consists of 3 parts.

Prepare data:

cd mutation_signature
python make_trinuc_maf.py Somatic_mutation.filter.208.maf Somatic_mutation.filter.trinuc.208.maf
cd ../somatic_mutaion
python main.py Stratton_signatures29_1.txt Somatic_mutation.filter.trinuc.208.maf Somatic_mutation.filter.trinuc.decomposed.208.maf

Figure a:

library(foreach)
cl=as.data.frame(readxl::read_excel("Supplementary+Table+1+Clinical+information+20190710(1).xlsx"))
rownames(cl)=paste("T",cl$`Sequencing ID`,sep="")
cl=cl[!is.element(rownames(cl),c("T37","T274")),]

cl$Gs=paste(cl$G1,cl$G2,sep="+")
cl$Gs[cl$Gs=="3+5"]=">=8"
cl$Gs[cl$Gs=="4+4"]=">=8"
cl$Gs[cl$Gs=="4+5"]=">=8"
cl$Gs[cl$Gs=="5+3"]=">=8"
cl$Gs[cl$Gs=="5+4"]=">=8"
cl$Gs[cl$Gs=="5+5"]=">=8"
cl$Gs[cl$Gs=="NA+NA"]="Not available"

cl$GS[cl$GS==8]=">=8"
cl$GS[cl$GS==9]=">=8"
cl$GS[cl$GS==10]=">=8"
cl$GS[cl$GS=="NA"]="Not available"

cl$`Pathology T`[cl$`Pathology T`=="pT2"]="pT2a/pT2b"
cl$`Pathology T`[cl$`Pathology T`=="pT2a"]="pT2a/pT2b"
cl$`Pathology T`[cl$`Pathology T`=="pT2b"]="pT2a/pT2b"

cl$`Pathology N`[cl$`Pathology N`=="pn0"]="pN0"
cl$`Pathology N`[cl$`Pathology N`=="pNX"]="pNx"
cl$`Pathology N`[cl$`Pathology N`=="pN0"]="Negative"
cl$`Pathology N`[cl$`Pathology N`=="pNx"]="Not available"
cl$`Pathology N`[cl$`Pathology N`=="pN1"]="Positive"

cl$BCR[cl$BCR=="NA"|cl$BCR=="Dead"]="Not available"

cl$`Surgical Margin`[cl$`Surgical Margin`=="NA"]="Not available"

cl$Extraprostatic[cl$Extraprostatic=="NA"]="Not available"
cl$`Seminal Vesical`[cl$`Seminal Vesical`=="NA"]="Not available"
cl$PSA[cl$PSA==">100"]=101
cl[cl=="NO"]="Negative"
cl[cl=="YES"]="Positive"
lc=c(23,14,20,15,10,19)
lc_sum=foreach(i=1:length(lc)) %do% table(cl[,lc[i]])
names(lc_sum)=c("Gleason Score","Pathologic Stage","PSA Recurrence","LymphNodes Metastasis","Bone Metastasis","Margin Satus")

cl$PSA=as.numeric(cl$PSA)
cl$Age=as.numeric(cl$Age)
## Warning: NAs introduced by coercion
age=paste(paste(median(na.omit(cl$Age)),paste(min(na.omit(cl$Age)),max(na.omit(cl$Age)),sep="-"),sep="("),"",sep=")")
psa=paste(paste(round(mean(na.omit(cl$PSA)),1),paste(min(na.omit(cl$PSA)),max(na.omit(cl$PSA)),sep="-"),sep="("),"",sep=")")

item=foreach(i=1:length(lc_sum),.combine=c) %do% paste(names(lc_sum)[i],names(lc_sum[[i]]),sep=": ")
value=foreach(i=1:length(lc_sum),.combine=c) %do% as.numeric(lc_sum[[i]])

table1=data.frame(Item=c("Age","PSA",item),CPGEA=c(age,psa,value))
table1=table1[c(1:2,4:6,3,7,8:12,15,13:14,18,16:17,20,19,23,21:22),]
#write.table(table1,"supplement_table1.xls",sep="\t",row.names = F,quote=F)
table1
##                                    Item              CPGEA
## 1                                   Age          69(50-88)
## 2                                   PSA 38.8(1.25-1270.14)
## 4                    Gleason Score: 3+3                 17
## 5                    Gleason Score: 3+4                 56
## 6                    Gleason Score: 4+3                 40
## 3                    Gleason Score: >=8                 94
## 7          Gleason Score: Not available                  1
## 8           Pathologic Stage: pT2a/pT2b                 28
## 9                Pathologic Stage: pT2c                 77
## 10               Pathologic Stage: pT3a                 60
## 11               Pathologic Stage: pT3b                 33
## 12                Pathologic Stage: pT4                 10
## 15             PSA Recurrence: Positive                 60
## 13             PSA Recurrence: Negative                127
## 14        PSA Recurrence: Not available                 21
## 18      LymphNodes Metastasis: Positive                 23
## 16      LymphNodes Metastasis: Negative                155
## 17 LymphNodes Metastasis: Not available                 30
## 20            Bone Metastasis: Positive                 15
## 19            Bone Metastasis: Negative                193
## 23               Margin Satus: Positive                 82
## 21               Margin Satus: Negative                124
## 22          Margin Satus: Not available                  2

Figure g:

#signature
library(data.table)
library(ggplot2)
maf = suppressWarnings(fread("Somatic_mutation.filter.trinuc.decomposed.208.maf"))
maf=as.data.frame(maf)
freq1358916=sum(maf$`Number of Mutations`*rowSums(maf[,c(1,3,5,8,9,16)+2]))/sum(maf$`Number of Mutations`)
freq2461224=sum(maf$`Number of Mutations`*rowSums(maf[,c(2,4,6,12)+2]))/sum(maf$`Number of Mutations`)
freq135891630=sum(maf$`Number of Mutations`*rowSums(maf[,c(1,3,5,8,9,16,29)+2]))/sum(maf$`Number of Mutations`)

sig.dt=as.matrix(maf[,-c(1:2)])
row.names(sig.dt)=sub("_WGS","",maf$`Sample Name`)
sig.dt[sig.dt<0.06]=0
sig.dt=sig.dt[,colSums(sig.dt)!=0]
sig.dt=sig.dt*maf$`Number of Mutations`
sig.dt=sig.dt[order(sig.dt[,1],sig.dt[,2],sig.dt[,3],sig.dt[,4],sig.dt[,5],sig.dt[,6],sig.dt[,7],sig.dt[,8],sig.dt[,9],sig.dt[,10],sig.dt[,11],sig.dt[,12],sig.dt[,13],sig.dt[,14],sig.dt[,15],sig.dt[,16],sig.dt[,17]),]

dt=melt(sig.dt)
dt[dt==0]=NA
dt=dt[!is.na(dt$value),]
dt$value=log10(dt$value)
theme_bwmin = function(base_size = 12, base_family = 'Helvetica')
{
  theme_bw(base_size = base_size, base_family = base_family) %+replace%
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_blank(),
      strip.background = element_blank(),
      axis.line = element_line(size = 0.1),
      axis.ticks = element_line(size = 0.1),
      legend.key = element_rect(color = 'white'),
      legend.key.size = unit(.25, 'cm'),
      axis.text.x = element_blank(),axis.ticks.x = element_blank(),axis.line.x = element_blank(),
      axis.title.x = element_blank(),
      axis.text.y = element_text(size = 12)
    )
}

grey_pal = colorRampPalette(RColorBrewer::brewer.pal(9, 'Greys'))(16) # Greys except for hand-picked signatures
sign_cols = c("#e41a1c","cyan4","#4daf4a","deeppink4","#E69F00","#ffff33","#ff7f00","#56B4E9","#a65628","#f781bf",grey_pal[4],"#b2df8a","#377eb8",grey_pal[5],"#ff7f00","#009E73",grey_pal[7],"#CC79A7",grey_pal[9],"#ff7f00",grey_pal[10],"#cab2d6",grey_pal[11],"#D55E00",grey_pal[13],"#ff7f00",grey_pal[14:15],"#984ea3")
names(sign_cols)=colnames(maf)[-c(1:2)]
sp_tab=table(dt$Var2)
sp_freq=sp_tab/208

#pdf("Mutation_Signature_distribution_208.pdf",width = 14,height = 9)
par(mfrow=c(2,1),mar=c(0,5,4,4))
bar=barplot(sp_freq,ylim=c(0,1),names.arg = NA,axes = F,ylab="Percentage of Samples",xlim=c(0,21),col = sign_cols[colnames(sig.dt)])
axis(2,at=seq(0,1,len=6),labels =paste(seq(0,100,len=6),sep="%"),las=1)

par(mar=c(5,5,1,4))
plot(0,0,xlim=c(0,21),ylim=c(0,log10(100000)),pch=16,axes = F,
     xlab="Mutation Signature",ylab = "# Mutations",type="n")
axis(2,at=0:5,labels = c(0,10^(1:5)),las=1)
for(i in 1:ncol(sig.dt)){
  sub_dt=dt[dt$Var2==colnames(sig.dt)[i],]
  if(i%%2==0){
    rect(bar[i]-0.6,0,bar[i]+0.6,5,col="gray90",border = NA)
  }
  points(seq(bar[i]-min(0.45,nrow(sub_dt)*0.01),bar[i]+min(0.45,nrow(sub_dt)*0.01),len=nrow(sub_dt)),sub_dt$value[order(sub_dt$value)],pch=16,col=sign_cols[colnames(sig.dt)[i]])
  lines(c(bar[i]-0.25,bar[i]+0.25),rep(median(sub_dt$value),2))
}
text(bar,-0.3,sub("Signature.","",colnames(sig.dt)),xpd=T)

#dev.off()

Figure h:

library("ggpubr")
mt_210=read.table("Mutation_summary_Arv.xls",sep="\t",header=T,stringsAsFactors=F,check.name=F,row.names = 1)

dt=melt(sig.dt)
dt[dt==0]=NA
dt=dt[!is.na(dt$value),]

#signature 8
sub_dt=dt[dt$Var2==colnames(sig.dt)[7],]
rownames(sub_dt)=sub_dt$Var1
sp=intersect(as.character(sub_dt$Var1),rownames(mt_210))
df=data.frame(df_x=mt_210[sp,"GS"],df_y=sub_dt[sp,"value"])
df=df[!is.na(df$df_x),]
p=kruskal.test(formula = df_y~df$df_x, data = df,subset = NULL, na.action = na.omit)$p.value
ggboxplot(df,x="df_x",y="df_y",col="df_x",xlab="GS",ylab="# Mutations",add="jitter",title=paste(colnames(sig.dt)[7],": ","p=",round(p,4),sep=""),)

#signature 16
sub_dt=dt[dt$Var2==colnames(sig.dt)[12],]
rownames(sub_dt)=sub_dt$Var1
sp=intersect(as.character(sub_dt$Var1),rownames(mt_210))
df=data.frame(df_x=mt_210[sp,"GS"],df_y=sub_dt[sp,"value"])
df=df[!is.na(df$df_x),]
p=kruskal.test(formula = df_y~df$df_x, data = df,subset = NULL, na.action = na.omit)$p.value
ggboxplot(df,x="df_x",y="df_y",col="df_x",xlab="GS",ylab="# Mutations",add="jitter",title=paste(colnames(sig.dt)[12],": ","p=",round(p,4),sep=""),)