This is the R Markdown for Extended Dat Fig9, which consists of 7 parts.
library(foreach)
library(maftools)
options(warn=-1)
nct=as.data.frame(read.table("Abroad_meta_data.xls",sep="\t",header = T,stringsAsFactors = F))
rownames(nct)=nct$SAMPLE_ID
#TCGA
tab=as.data.frame(readxl::read_excel("S2-TCGA_Cell2015.xls"))
cl=data.table::dcast(tab,SAMPLE_ID~Subtype,value.var="Subtype")
rownames(cl)=cl$SAMPLE_ID
cl=cl[,-c(1,9)]
cl[!is.na(cl)]=1
cl[is.na(cl)]=0
colnames(cl)=c("ERG","ETV1","ETV4","FLI1","SPOP","FOXA1","IDH1")
cl$sample_type="Primary"
fs=read.table("11.prad_tcga_pub/data_fusions.txt",sep="\t",stringsAsFactors=F,header=T)
fs_tab=table(fs[,c(1,4)])
maf=read.maf("11.prad_tcga_pub/data_mutations_extended.txt", removeDuplicatedVariants = F)
mf=subsetMaf(maf, includeSyn = F, tsb = NULL, genes = NULL,fields = NULL, query = NULL, mafObj = FALSE, isTCGA = FALSE)
mf$Variant_Classification=as.character(mf$Variant_Classification)
mf=mf[mf$Variant_Classification!="Translation_Start_Site"]
mf_tab=table(mf[,c(1,17)])
mt_tab=mf_tab[is.element(rownames(mf_tab),c("FOXA1","SPOP","IDH1")),]
mt_tab[mt_tab==2]=1
mt_freq=rowMeans(mt_tab)
colnames(mt_tab)=gsub("\\.","-",colnames(mt_tab))
cl[colnames(mt_tab)[as.numeric(mt_tab["FOXA1",])!=0],"FOXA1"]=1
cl[colnames(fs_tab)[as.numeric(fs_tab["ETV4",])!=0],"ETV4"]=1
cl[colnames(fs_tab)[as.numeric(fs_tab["ETV1",])!=0],"ETV1"]=1
write.table(cl,"TCGA_2015_cluster.xls",sep="\t",quote=F)
#SU2C
fs=read.table("2.prad_su2c_2015/2.prad_su2c_2015/data_fusions.txt",sep="\t",stringsAsFactors=F,header=T)
maf=read.maf("2.prad_su2c_2015/2.prad_su2c_2015/data_mutations_extended.txt", removeDuplicatedVariants = F)
mf=subsetMaf(maf, includeSyn = F, tsb = NULL, genes = NULL,fields = NULL, query = NULL, mafObj = FALSE, isTCGA = FALSE)
mf$Variant_Classification=as.character(mf$Variant_Classification)
mf_tab=table(mf[,c(1,17)])
mf_tab[mf_tab!=0]=1
mf_freq=rowMeans(mf_tab)
mt_tab=mf_tab[is.element(rownames(mf_tab),c("SPOP","FOXA1","IDH1")),]
fs=table(fs[,c(1,4)])
fs_tab=fs[c("ERG","ETV1","ETV4","FLI1"),]
cl=matrix(0,nrow=150,ncol=7)
rownames(cl)=colnames(mt_tab)
colnames(cl)=c("ERG","ETV1","ETV4","FLI1","SPOP","FOXA1","IDH1")
cl[colnames(fs_tab),rownames(fs_tab)]=t(fs_tab)
cl[,rownames(mt_tab)]=t(mt_tab)
cl=as.data.frame(cl)
cl$sample_type=nct[rownames(cl),"SAMPLE_TYPE"]
write.table(cl,"Su2c_2015_cluster.xls",sep="\t",quote=F)
#MSK
fs=read.table("3.prad_mskcc_2017/3.prad_mskcc_2017/data_fusions.txt",sep="\t",stringsAsFactors=F,header=T)
fs=table(fs[,c(1,4)])
fs_tab=fs[is.element(rownames(fs),c("ERG","ETV1","ETV4","FLI1")),]
maf=read.maf("3.prad_mskcc_2017/3.prad_mskcc_2017/data_mutations_extended.txt", removeDuplicatedVariants = F)
mf=subsetMaf(maf, includeSyn = F, tsb = NULL, genes = NULL,fields = NULL, query = NULL, mafObj = FALSE, isTCGA = FALSE)
mf$Variant_Classification=as.character(mf$Variant_Classification)
mf_tab=table(mf[,c(1,17)])
mf_tab[mf_tab!=0]=1
mf_freq=rowMeans(mf_tab)
mt_tab=mf_tab[is.element(rownames(mf_tab),c("SPOP","FOXA1","IDH1")),]
colnames(mt_tab)=gsub("\\.","-",colnames(mt_tab))
sp=union(colnames(mt_tab),colnames(fs_tab))
cl=matrix(0,nrow=length(sp),ncol=7)
rownames(cl)=sp
colnames(cl)=c("ERG","ETV1","ETV4","FLI1","SPOP","FOXA1","IDH1")
cl[colnames(fs_tab),rownames(fs_tab)]=t(fs_tab)
cl[colnames(mt_tab),rownames(mt_tab)]=t(mt_tab)
cl=as.data.frame(cl)
cl$sample_type=nct[rownames(cl),"SAMPLE_TYPE"]
write.table(cl,"MSKCC2017_cluster.xls",sep="\t",quote=F)
#B/C-Cell 2013
fs=read.table("5.prad_broad_2013/5.prad_broad_2013/data_fusions.txt",sep="\t",stringsAsFactors=F,header=T)
fs=table(fs[,c(1,4)])
fs_tab=fs[is.element(rownames(fs),c("ERG","ETV1","ETV4","FLI1")),]
maf=read.maf("5.prad_broad_2013/5.prad_broad_2013/data_mutations_extended.txt", removeDuplicatedVariants = F)
mf=subsetMaf(maf, includeSyn = F, tsb = NULL, genes = NULL,fields = NULL, query = NULL, mafObj = FALSE, isTCGA = FALSE)
mf$Variant_Classification=as.character(mf$Variant_Classification)
mf_tab=table(mf[,c(1,16)])
mf_tab[mf_tab!=0]=1
mf_freq=rowMeans(mf_tab)
mt_tab=mf_tab[is.element(rownames(mf_tab),c("SPOP","FOXA1","IDH1")),]
sp=union(colnames(mt_tab),colnames(fs_tab))
cl=matrix(0,nrow=length(sp),ncol=7)
rownames(cl)=sp
colnames(cl)=c("ERG","ETV1","ETV4","FLI1","SPOP","FOXA1","IDH1")
cl[colnames(fs_tab),rownames(fs_tab)]=t(fs_tab)
cl[colnames(mt_tab),rownames(mt_tab)]=t(mt_tab)
cl=as.data.frame(cl)
rownames(cl)=sub("-Normal","",rownames(cl))
cl$sample_type=nct[rownames(cl),"SAMPLE_TYPE"]
write.table(cl,"broad2013_cluster.xls",sep="\t",quote=F)
#EU
fs=read.table("7.prad_eururol_2017/data_fusions.txt",sep="\t",stringsAsFactors=F,header=T)
fs=table(fs[,c(1,4)])
fs_tab=fs[is.element(rownames(fs),c("ERG","ETV1","ETV4","FLI1")),]
fs_tab[fs_tab!=0]=1
maf=read.maf("7.prad_eururol_2017/data_mutations_extended.txt", removeDuplicatedVariants = F)
mf=subsetMaf(maf, includeSyn = F, tsb = NULL, genes = NULL,fields = NULL, query = NULL, mafObj = FALSE, isTCGA = FALSE)
mf$Variant_Classification=as.character(mf$Variant_Classification)
mf_tab=table(mf[,c(1,17)])
mf_tab[mf_tab!=0]=1
mf_freq=rowMeans(mf_tab)
mf_freq[order(mf_freq,decreasing=T)][1:10]
mt_tab=as.data.frame(mf_tab[is.element(rownames(mf_tab),c("SPOP","FOXA1","IDH1")),])
colnames(mt_tab)="SPOP"
mt_tab=as.data.frame(t(mt_tab))
sp=union(colnames(mt_tab),colnames(fs_tab))
cl=matrix(0,nrow=length(sp),ncol=7)
rownames(cl)=sp
colnames(cl)=c("ERG","ETV1","ETV4","FLI1","SPOP","FOXA1","IDH1")
cl[colnames(fs_tab),rownames(fs_tab)]=t(fs_tab)
cl[colnames(mt_tab),rownames(mt_tab)]=t(mt_tab)
cl=as.data.frame(cl)
cl$sample_type=nct[rownames(cl),"SAMPLE_TYPE"]
write.table(cl,"EU_cluster.xls",sep="\t",quote=F)
#M/DFCI
fs=read.table("10.prad_MSKCC_DFCI_NG_2018/data_fusions.txt",sep="\t",stringsAsFactors=F,header=T)
fs=table(fs[,c(1,4)])
fs_tab=fs[is.element(rownames(fs),c("ERG","ETV1","ETV4","FLI1")),]
fs_tab[fs_tab!=0]=1
maf=read.maf("10.prad_MSKCC_DFCI_NG_2018/data_mutations_extended.txt", removeDuplicatedVariants = F)
mf=subsetMaf(maf, includeSyn = F, tsb = NULL, genes = NULL,fields = NULL, query = NULL, mafObj = FALSE, isTCGA = FALSE)
mf$Variant_Classification=as.character(mf$Variant_Classification)
mf$Tumor_Sample_Barcode=gsub("\\.","-",mf$Tumor_Sample_Barcode)
mf_tab=table(mf[,c(1,16)])
mf_tab[mf_tab!=0]=1
mf_freq=rowMeans(mf_tab)
mf_freq[order(mf_freq,decreasing=T)][1:10]
mt_tab=mf_tab[is.element(rownames(mf_tab),c("SPOP","FOXA1","IDH1")),]
sp=union(colnames(mt_tab),colnames(fs_tab))
cl=matrix(0,nrow=length(sp),ncol=7)
rownames(cl)=sp
colnames(cl)=c("ERG","ETV1","ETV4","FLI1","SPOP","FOXA1","IDH1")
cl[colnames(fs_tab),rownames(fs_tab)]=t(fs_tab)
cl[colnames(mt_tab),rownames(mt_tab)]=t(mt_tab)
cl=as.data.frame(cl)
nct1=nct
rownames(nct1)=gsub("\\.","-",rownames(nct1))
cl$sample_type=nct1[rownames(cl),"SAMPLE_TYPE"]
write.table(cl,"MSKCC_DFCI2018_cluster.xls",sep="\t",quote=F)
#Michigan
fs=read.table("14.prad_mich_Nature_2012/14.prad_mich_Nature_2012/data_fusions.txt",sep="\t",stringsAsFactors=F,header=T)
fs=table(fs[,c(1,4)])
fs_tab=fs[is.element(rownames(fs),c("ERG","ETV1","ETV4","FLI1")),]
fs_tab[fs_tab!=0]=1
maf=read.maf("14.prad_mich_Nature_2012/14.prad_mich_Nature_2012/data_mutations_extended.txt", removeDuplicatedVariants = F)
mf=subsetMaf(maf, includeSyn = F, tsb = NULL, genes = NULL,fields = NULL, query = NULL, mafObj = FALSE, isTCGA = FALSE)
mf$Variant_Classification=as.character(mf$Variant_Classification)
mf_tab=table(mf[,c(1,16)])
mf_tab[mf_tab!=0]=1
mf_freq=rowMeans(mf_tab)
mf_freq[order(mf_freq,decreasing=T)][1:10]
mt_tab=mf_tab[is.element(rownames(mf_tab),c("SPOP","FOXA1","IDH1")),]
sp=union(colnames(mt_tab),colnames(fs_tab))
cl=matrix(0,nrow=length(sp),ncol=7)
rownames(cl)=sp
colnames(cl)=c("ERG","ETV1","ETV4","FLI1","SPOP","FOXA1","IDH1")
cl[colnames(fs_tab),rownames(fs_tab)]=t(fs_tab)
cl[colnames(mt_tab),rownames(mt_tab)]=t(mt_tab)
cl=as.data.frame(cl)
cl$sample_type=nct[rownames(cl),"SAMPLE_TYPE"]
write.table(cl,"mich_nature2012_cluster.xls",sep="\t",quote=F)
#CPGEA
fs=read.table("Fusion_sampleID_genepair_validate_134_Arv.xls",sep="\t",header=T,stringsAsFactors = F)
gene=unlist(strsplit(fs$Fusion,split="--"))
fs$up_gene=gene[seq(1,length(gene),by=2)]
fs$down_gene=gene[seq(2,length(gene),by=2)]
colnames(fs)[2]="sampleID"
maf=read.maf("Somatic_mutation.filter.208.maf", removeDuplicatedVariants = F)
mf=subsetMaf(maf, includeSyn = F, tsb = NULL, genes = NULL,fields = NULL, query = NULL, mafObj = FALSE, isTCGA = FALSE)
mf$Variant_Classification=as.character(mf$Variant_Classification)
mf$Tumor_Sample_Barcode=as.character(mf$Tumor_Sample_Barcode)
mf=mf[!is.element(mf$Tumor_Sample_Barcode,c("T502_WGS","T13_WGS")),]
mf_tab=table(mf[,c(1,16)])
mf_tab[mf_tab!=0]=1
mt_tab=mf_tab[is.element(rownames(mf_tab),c("SPOP","FOXA1","IDH1")),]
colnames(mt_tab)=sub("_WGS","",colnames(mt_tab))
sp=colnames(mt_tab)
cl=matrix(0,nrow=length(sp),ncol=7)
rownames(cl)=sp
colnames(cl)=c("ERG","ETV1","ETV4","FLI1","SPOP","FOXA1","IDH1")
cl[colnames(mt_tab),rownames(mt_tab)]=t(mt_tab)
cl=as.data.frame(cl)
cl$sample_type="Primary"
sv_dt=read.delim("Gene_pair_sv_annotation_TAD_summary_208.xls",sep="\t",header=T,stringsAsFactors=F)
sv_sp=list()
sv_sp[[1]]=unique(unlist(strsplit(sv_dt$Sample[sv_dt$Pair1_Gene=="ERG"|sv_dt$Pair2_Gene=="ERG"],split=",")))
sv_sp[[2]]=unique(unlist(strsplit(sv_dt$Sample[sv_dt$Pair1_Gene=="ETV1"|sv_dt$Pair2_Gene=="ETV1"],split=",")))
sv_sp[[3]]=unique(unlist(strsplit(sv_dt$Sample[c(grep("ETV4",sv_dt$Pair1_Gene),grep("ETV4",sv_dt$Pair2_Gene))],split=",")))
#sv_sp[[4]]=unique(unlist(strsplit(sv_dt$Sample[sv_dt$Pair1_Gene=="FLI1"|sv_dt$Pair2_Gene=="FLI1"],split=",")))
cl[sv_sp[[1]],"ERG"]=1
cl[sv_sp[[2]],"ETV1"]=1
cl[sv_sp[[3]],"ETV4"]=1
#cl[sv_sp[[4]],"FLI1"]=1
#high_express
hi_ERG=c("T47")
hi_ETV1=c("T729","T84","T34")
hi_ETV4=c("T273")
cl[hi_ERG,"ERG"]=1
cl[hi_ETV1,"ETV1"]=1
cl[hi_ETV4,"ETV4"]=1
cl[fs$sampleID[fs$up_gene=="ERG"|fs$down_gene=="ERG"],"ERG"]=1
cl[fs$sampleID[fs$up_gene=="ETV1"|fs$down_gene=="ETV1"],"ETV1"]=1
cl[fs$sampleID[fs$up_gene=="ETV4"|fs$down_gene=="ETV4"],"ETV4"]=1
cl[fs$sampleID[fs$up_gene=="FLI1"|fs$down_gene=="FLI1"],"FLI1"]=1
write.table(cl,"CPGEA_208_cluster.xls",sep="\t",quote=F)
#TCGA_CPEGA_pipeline
path="TCGA_CPGEA_pipeline_Fusion"
file=dir(path)
sub_fs=foreach(i=1:length(file)) %do% read.table(paste(path,file[i],sep="/"),sep="\t",stringsAsFactors = F,header=T)
for(i in 1:length(file)){
sub_fs[[i]]$Sample=sub(".final.Fusion.specific.for.genes","",file[i])
}
fs=do.call(rbind,sub_fs)
sp_fs=data.frame(Gene=c(fs$up_gene,fs$dw_gene),Sample=c(fs$Sample,fs$Sample))
fs_tab=table(sp_fs)
sub_fs_tab=fs_tab[intersect(rownames(fs_tab),c("ERG","ETV1","ETV4","FLI1")),]
sub_fs_tab[sub_fs_tab!=0]=1
maf=read.maf("TCGA_PRAD_333_WXS_20190724.maf",removeDuplicatedVariants = F)
mf=subsetMaf(maf, includeSyn = F, tsb = NULL, genes = NULL,fields = NULL, query = NULL, mafObj = FALSE, isTCGA = FALSE)
mf$Variant_Classification=as.character(mf$Variant_Classification)
mf_tab=table(mf[,c(1,16)])
mt_tab=mf_tab[is.element(rownames(mf_tab),c("FOXA1","SPOP","IDH1")),]
mt_tab[mt_tab!=0]=1
mt_freq=rowSums(mt_tab)/333
mt_freq
sp=sub("-01","",nct$SAMPLE_ID[grep("tcga_pub",nct$source)])
cl=matrix(0,nrow=333,ncol=7)
rownames(cl)=sp
colnames(cl)=c("ERG","ETV1","ETV4","FLI1","SPOP","FOXA1","IDH1")
cl[sub("-01B","",sub("-01A","",colnames(sub_fs_tab))),rownames(sub_fs_tab)]=t(sub_fs_tab)
cl[sub("-01A","",sub("-01B","",colnames(mt_tab))),rownames(mt_tab)]=t(mt_tab)
cl=as.data.frame(cl)
cl$sample_type=nct[sub("-01","",rownames(cl)),"SAMPLE_TYPE"]
write.table(cl,"TCGA_CPGEA_pipeline_333_cluster.xls",sep="\t",quote=F)
library(foreach)
file=dir()
file=file[grep("cluster.xls",file)][c(1,3,8,4:6,2,9,10)]
sub_file=c("B/C-Cell 2013","EU","SU2C","Michigan","M/DFCI","MSK","CPGEA","TCGA_pub","TCGA_CPGEA_pipeline")
dt=foreach(i=1:length(file)) %do% read.table(file[i],sep="\t",header=T,stringsAsFactors = F)
type=foreach(i=1:length(dt)) %do% dt[[i]]$sample_type
type=unique(unlist(type))
type_col=c("#B85C23","#D79744","gold")
names(type_col)=c(type)
#pdf("Supp_Figure17.pdf",width=23,height = 10)
par(mfrow=c(1,10),mar=c(4,0,4,0))
for(t in 1:9){
smt_cl=dt[[t]]
smt_cl=smt_cl[order(smt_cl[,1],smt_cl[,2],smt_cl[,3],smt_cl[,4],smt_cl[,5],smt_cl[,6],smt_cl[,7],decreasing =rep(T,7)),]
smt_cl=as.data.frame(smt_cl)
smt_cl$x=ifelse(smt_cl[,1]==1,1,ifelse(smt_cl[,2]==1,2,ifelse(smt_cl[,3]==1,3,ifelse(smt_cl[,4]==1,4,ifelse(smt_cl[,5]==1,5,ifelse(smt_cl[,6]==1,6,ifelse(smt_cl[,7]==1,7,8)))))))
cl_space=(as.numeric(as.factor(smt_cl$x[order(smt_cl$x)]))-1)*0
data.fig=smt_cl
data=smt_cl[,1:7]
clst=1:8
cl_mut=list()
for(i in 1:length(clst)){
if(length(which(smt_cl$x==clst[i]))==1){
mut_freq=as.numeric(data[length(rownames(smt_cl)[smt_cl$x==clst[i]]),])
}else{
mut_freq=colSums(data[rownames(smt_cl)[smt_cl$x==clst[i]],])/length(rownames(smt_cl)[smt_cl$x==clst[i]])}
cl_mut[[i]]=mut_freq}
cl_mut=do.call(cbind,cl_mut)
colnames(cl_mut)=paste("cluster",clst,sep="")
mut_class=data.frame(cl=colnames(cl_mut),the_most_recurrent=c(colnames(smt_cl)[1:7],"Others"),n_sp=foreach(i=1:8,.combine=c) %do% length(which(smt_cl$x==i)))
col.fig=data.fig[,1:7]
col.fig[,1:4][col.fig[,1:4]!=0]="#009E73"
col.fig[,5:7][col.fig[,5:7]!=0]="#0571b0"
col.fig[col.fig==0]="white"
lab.col=type_col[dt[[t]]$sample_type]
y.lim=(-nrow(col.fig)-cl_space[nrow(col.fig)])/nrow(col.fig)
if(t>1){
par(mar=c(4,0,4,0))
}
plot(0,0,type="n",xlim=c(-3,4.5),ylim=c(y.lim-0.01,0.02),axes=F,xlab="",main=sub_file[t],ylab="")
for(j in 1:ncol(col.fig)){
for(k in 1:nrow(col.fig)){
rect((j-1)*0.5,(1-k-cl_space[k])/nrow(col.fig),j*0.5,(-k-cl_space[k])/nrow(col.fig),col=col.fig[k,j],border = NA)
rect(ncol(col.fig)*0.5+0.1,(1-k-cl_space[k])/nrow(col.fig),(ncol(col.fig)+1)*0.5+0.1,(-k-cl_space[k])/nrow(col.fig),col=lab.col[k],border = NA)
}}
rect(0,y.lim,(ncol(col.fig))*0.5,0)
seg=mut_class$n_sp
segments(0,-cumsum(seg)/nrow(col.fig),3.5,-cumsum(seg)/nrow(col.fig))
lab_dt=mut_class[mut_class$n_sp!=0,]
text.loc=foreach(i=2:length(clst),.combine = c) %do% (-sum(table(smt_cl$x)[1:i])-(i-1)*0.9-sum(table(smt_cl$x)[1:(i-1)])-(i-1)*0.9)/2
row_lab=paste(paste(lab_dt[,2],paste(round(lab_dt$n_sp/sum(lab_dt$n_sp),4)*100,"",sep="%"),sep=" ("),")",sep="")
text(-3,c(-table(smt_cl$x)[1]/2,text.loc)/nrow(col.fig),row_lab,pos=4,cex=1,xpd=T)
text(seq(0.5,(length(clst)-1)*0.5+0.1,by=0.5),0.02,c(colnames(smt_cl)[1:7]),pos=3,srt=90,cex=1,xpd=T)
}
plot(0,0,type="n",xlim=c(-3,5),ylim=c(y.lim-0.02,0.05),axes=F,xlab="",ylab="")
legend("center",legend=c("Fusion","Somatic Mutation",names(type_col)[1:3],"Neuroendocrine"),pch=15,col=c("#009E73","#0571b0",type_col),ncol = 1,bty="n")