This is the R Markdown for Supplementary Extended Dat Fig.2, which consists of 5 parts.
library(foreach)
file="CNV.segment.208new.txt"
seg=foreach(i=1:length(file)) %do% read.table(file[i],header = F,stringsAsFactors = F,sep="\t")
seg1=seg[[1]][seg[[1]]$V6!=0,]
write.table(seg1,"CNV.segment.208new.seg",sep="\t",row.names = F,col.names = F,quote = F)
bed_dir=c("CPGEA_segment_bed")
dir.create(bed_dir)
j=1
seg=seg1
id=unique(seg$V1)
for(i in 1:length(id)){
sub_seg=seg[seg$V1==id[i],]
write.table(sub_seg[order(sub_seg$V2,sub_seg$V3),2:4],paste(bed_dir[j],"/",id[i],".bed",sep=""),sep="\t",row.names = F,col.names = F,quote = F)
}
dir.create("CPGEA_seg")
seg=seg1
id=unique(seg$V1)
for(i in 1:length(id)){
sub_seg=seg[seg$V1==id[i],]
sub_seg$V7=ifelse(sub_seg$V6<0,"loss",ifelse(sub_seg$V6>0,"gain","normal"))
write.table(sub_seg[order(sub_seg$V2,sub_seg$V3),],paste("CPGEA_seg/",id[i],".txt",sep=""),sep="\t",row.names = F,col.names = F,quote = F)
}
#shell
multiIntersectBed -i CPGEA_segment_bed/*.bed >Bigloc_CPGEA_hg38.bed
awk '{print $1,$2,$3}' Bigloc_CPGEA_hg38.bed|sed 's/ /\t/g' >BigLoc_CPGEA_hg38.bed
awk '{print $2"\t"$3"\t"$4"\t"$1"\t"$6}' CNV.segment.208new.seg|sort -n -k 1 -k 2 >CPGEA_208.seg
awk '{print $1"\t"$2"\t"$3}' Bigloc_CPGEA_hg38.bed >Bigloc_CPGEA_loc.bed
bedtools intersect -a Bigloc_CPGEA_loc.bed -b CPGEA_208.seg -wa -wb >CPGEA_208_merge.seg
awk '{print $1":"$2"-"$3"\t"$7"\t"$8}' CPGEA_208_merge.seg >CPGEA_208_merge.segment
library(data.table)
seg1=read.table("CPGEA_208_merge.segment",header = F,stringsAsFactors = F,sep="\t")
cnv=dcast(data=seg1,V1~V2,fun=mean)
rownames(cnv)=cnv$V1
loc_chr=foreach(i=1:nrow(cnv),.combine = c) %do% strsplit(rownames(cnv)[i],split=":")[[1]][1]
loc_start=foreach(i=1:nrow(cnv),.combine = c) %do% strsplit(strsplit(rownames(cnv)[i],split=":")[[1]][2],split="-")[[1]][1]
cnv.fig=t(cnv[order(as.numeric(loc_chr),as.numeric(loc_start)),-1])
cnv.fig[is.na(cnv.fig)]=0
save(cnv.fig,file="CPGEA_208_cnv.fig.Rdata")
#Figure d
CNV=read.table("PC_WGS_208pairs_filt_js100.all_thresholded.by_genes.txt",sep="\t",header = T,stringsAsFactors = F,row.names = 1)
CNV=CNV[,-c(1:2)]
colnames(CNV)=sub("_WGS","",colnames(CNV))
my_cnv=CNV["CHD1",]
my_cnv[my_cnv!=(-2)]=0
my_cnv[my_cnv==-2]=1
#fusion
fs_ct=read.table("Fusion_sampleID_genepair_validate_134_Arv.xls",sep="\t",stringsAsFactors=F,header=T)
fs=fs_ct
fs_gene=unlist(strsplit(fs$Fusion,split="--"))
fs$Gene1=fs_gene[seq(1,length(fs_gene),by=2)]
fs$Gene2=fs_gene[seq(2,length(fs_gene),by=2)]
fs_sp=data.frame(Gene=c(fs$Gene1,fs$Gene2),Sample=c(fs$Sample,fs$Sample))
fs_sp$Sample=as.character(fs_sp$Sample)
fs_tab=table(fs_sp)
fs_oth=matrix(0,ncol=length(setdiff(names(my_cnv),colnames(fs_tab))),nrow=nrow(fs_tab))
colnames(fs_oth)=setdiff(names(my_cnv),colnames(fs_tab))
fs_Tab=as.data.frame(cbind(fs_tab,fs_oth))
fs_Tab=fs_Tab[,names(my_cnv)]
mut_fs=rbind(my_cnv,fs_Tab[intersect(c("ERG","ETV1","ETV4","FLI1"),rownames(fs_Tab)),names(my_cnv)])
mut_fs["ERG",]=colSums(mut_fs[2:4,])
mut_fs=mut_fs[1:2,]
rownames(mut_fs)[2]="ETS"
mut_fs=as.data.frame(t(mut_fs))
mut_fs[mut_fs!=0]=1
#
chromoplexy=read.table("samples.chromoplexy_208.xls",header = F,stringsAsFactors = F,sep="\t")
chromothripsis=read.table("shatterseek-chromotrhiprisis_208.xls",header = F,stringsAsFactors = F,sep="\t")
chrpxy=chromoplexy[,1]
chrth=chromothripsis[,1]
mut_fs$chromoplexy=0
mut_fs$chromothripsis=0
mut_fs[chrpxy,"chromoplexy"]=1
mut_fs[chrth,"chromothripsis"]=1
mut_fs$Sample=rownames(mut_fs)
mut_fs[is.na(mut_fs)]=0
colnames(mut_fs)[1]="CHD1_DEL"
sv_dt=read.delim("Gene_pair_sv_annotation_TAD_summary_208.xls",sep="\t",header=T,stringsAsFactors=F)
fs_dt=read.delim("Fusion_fusionhub_sv_validated_Arv.xls",sep="\t",header=T,stringsAsFactors=F)
gene=c("ERG","ETV1","ETV4","FLI1")
#sv ,fusion samples
sv_sp=list()
fs_sp=list()
sv_sp[[1]]=unique(unlist(strsplit(sv_dt$Sample[sv_dt$Pair1_Gene=="ERG"|sv_dt$Pair2_Gene=="ERG"],split=",")))
fs_sp[[1]]=unique(unlist(strsplit(fs_dt$Sample[fs_dt$Fusion=="TMPRSS2--ERG"],split=";")))
sv_sp[[2]]=unique(unlist(strsplit(sv_dt$Sample[sv_dt$Pair1_Gene=="ETV1"|sv_dt$Pair2_Gene=="ETV1"],split=",")))
fs_sp[[2]]=unique(unlist(strsplit(fs_dt$Sample[grep("ETV1",fs_dt$Fusion)],split=";")))
sv_sp[[3]]=unique(unlist(strsplit(sv_dt$Sample[c(grep("ETV4",sv_dt$Pair1_Gene),grep("ETV4",sv_dt$Pair2_Gene))],split=",")))
fs_sp[[3]]=unique(unlist(strsplit(fs_dt$Sample[grep("ETV4",fs_dt$Fusion)],split=";")))
sv_sp[[4]]=unique(unlist(strsplit(sv_dt$Sample[sv_dt$Pair1_Gene=="FLI1"|sv_dt$Pair2_Gene=="FLI1"],split=",")))
fs_sp[[4]]=unique(unlist(strsplit(fs_dt$Sample[grep("FLI1",fs_dt$Fusion)],split=";")))
SV_sp=unique(unlist(sv_sp))
FS_sp=unique(unlist(fs_sp))
SV_FS=intersect(SV_sp,FS_sp)
high_express=c("T729","T34","T273","T47","T84")
#mut_fs=read.table("CHD1_ETS_data.xls",sep='\t',header = T,stringsAsFactors = F)
rownames(mut_fs)=mut_fs$Sample
mut_fs$ETS[mut_fs$ETS==1]=2
mut_fs[intersect(SV_sp,FS_sp),"ETS"]=3
mut_fs[setdiff(SV_sp,FS_sp),"ETS"]=4
mut_fs[high_express,"ETS"]=5
write.table(mut_fs,"CHD1_ETS_fs_sv_highexp_data.xls",sep="\t",row.names = F,quote = F)
library(foreach)
library(ComplexHeatmap)
library(circlize)
load("CPGEA_208_cnv.fig.Rdata")
rownames(cnv.fig)=sub("_WGS","",rownames(cnv.fig))
ht1=Heatmap(cnv.fig,cluster_columns =F,cluster_rows =T,col = colorRamp2(c(-1,0,1),c("blue", "white", "red")),show_column_names = F,show_row_names = F,
name="CNV",width=unit(30, "cm"),row_dend_width = unit(40, "mm"),column_title = "Chromosome",clustering_distance_rows = "euclidean",clustering_method_rows = "ward.D")
hc=row_dend(ht1)
cnv_type=dendextend::cutree(hc,k=3)
cnv_type=ifelse(cnv_type==1,"Quiet",ifelse(cnv_type==2,"More_cnv","Some_cnv"))
write.table(cnv_type,"CNV_cluster_208.xls",sep="\t",quote = F)
mt=read.table("Mutation_summary_Arv.xls",sep="\t",header=T,stringsAsFactors=F,check.name=F,row.names = 1)
cl_dt=mt[,c("Pretreatment drug","PSA","GS","Pathology T","Pathology N","Clinical M","Age")]
cl_dt$Age=ifelse(cl_dt$Age<=65,"50~65",ifelse(cl_dt$Age>65&cl_dt$Age<=70,"66~70",ifelse(cl_dt$Age>70&cl_dt$Age<=75,"71~75","76~")))
cl_dt$PSA=as.numeric(cl_dt$PSA)
cl_dt$PSA=ifelse(cl_dt$PSA<4,"<4",ifelse(cl_dt$PSA>=4&cl_dt$PSA<=20,"4~20",">20"))
cl_dt$PSA=paste("PSA",cl_dt$PSA,sep=":")
colnames(cl_dt)[c(1,4:6)]=c("Pre-Treat","pT","pN","cM")
cl_dt$GS=paste("GS",cl_dt$GS,sep=":")
cl_dt$GS[cl_dt$GS=="GS:NA"]=NA
cl_dt$Age=paste("Age",cl_dt$Age,sep=":")
cl_dt$Age[cl_dt$Age=="Age:NA"]=NA
cl_dt$`Pre-Treat`=paste("Pre-Treat",cl_dt$`Pre-Treat`,sep=":")
cl_dt$`Pre-Treat`[cl_dt$`Pre-Treat`=="Pre-Treat:NA"]=NA
cl_dt=cl_dt[rownames(cnv.fig)[row_order(ht1)],]
cl_dt$cnv_type=cnv_type[rownames(cl_dt)]
cl_dt=cl_dt[rownames(cnv.fig),]
#set color
cl_col =list(Age=c("Age:50~65"="#4EB3D3","Age:66~70"="#2B8CBE","Age:71~75"="#0868AC","Age:76~"="#084081"),
cM=c("M0"="#f1eef6","M1a"="#d7b5d8","M1b"="#df65b0","M1c"="#ce1256",
"Mx"="#9B808F50"),
pN=c("pN0"="#bcbddc","pN1"="#756bb1","pNx"="#9B808F50"),
pT=c("pT2a"="#f1eef6","pT2b"="#d0d1e6","pT2c"="#a6bddb","pT3a"="#74a9cf",
"pT3b"="#2b8cbe","pT4"="#045a8d"),
GS=c("GS:6"="#fcae91","GS:7"="#fd6a4a","GS:8"="#de2d26","GS:9"="#a50f15","GS:10"="#fee5d9"),
PSA=c("PSA:<4"="#edf8b1","PSA:4~20"="#7fcdbb","PSA:>20"="#2c7fb8"),
cnv_type=c("More_cnv"="#009E73","Quiet"="#D55E00","Some_cnv"="#56B4E9"),
`Pre-Treat`=c("Pre-Treat:bicalutamide"="#377eb8","Pre-Treat:bicalutamide+Finasteride"="#4daf4a","Pre-Treat:Finasteride"="#984ea3","Pre-Treat:Flutamide"="#ff7f00","Pre-Treat:Leuprorelin+bicalutamide"="#ffff33","Pre-Treat:NO"="wheat2","Pre-Treat:Zoladex+bicalutamide"="#a65628"))
#label location
chr=unlist(strsplit(colnames(cnv.fig),"\\:"))
chr=chr[seq(1,length(chr),by=2)]
chr=gsub("chr","",chr)
chr=as.numeric(chr)
chr.n=table(chr)
loc=cumsum(chr.n)/length(chr)
text.loc=foreach(i=1:21,.combine = c) %do% mean(loc[i:(i+1)])
text.loc=c(chr.n[1]/2/length(chr),text.loc)
#annotation
ha=HeatmapAnnotation(df=cl_dt,col=cl_col,which="row",width=unit(3,"cm"))
#pdf("CPGEA_CNV.segment.208new_euclidean_ward.D.pdf",width=23,height=10)
draw(ha+ht1,row_dend_side="left")
decorate_heatmap_body("CNV", {
for(i in 1:length(loc)){
grid.lines(rep(loc[i],2), c(0, 1), gp = gpar(lty = 1, lwd = 1,col="grey90"))}
grid.lines(c(0,1), c(1, 1), gp = gpar(lty = 1, lwd = 1,col="grey90"))
grid.text(unique(chr), text.loc, 1, default.units = "npc",just="bottom")
})