This is the R Markdown for Supplementary Table 6, which consists of 2 parts.

Mut_freq_compare_summary

##Step1:
##Calculate somatic frequency of every study, respectively.
library(readr)
library(foreach)
library(maftools)
#frequency of every study
raw_mt=list()
raw_mt[[1]]<- as.data.frame(read_delim("1.prad_cpcg_2017/1.prad_cpcg_2017/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE))
raw_mt[[2]] <- as.data.frame(read_delim("2.prad_su2c_2015/2.prad_su2c_2015/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE,col_types=cols(
  .default = col_character())))
raw_mt[[3]] <- as.data.frame(read_delim("3.prad_mskcc_2017/3.prad_mskcc_2017/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE))
raw_mt[[4]] <- as.data.frame(read_delim("4.nepc_wcm_2016/4.nepc_wcm_2016/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE))
raw_mt[[5]] <- as.data.frame(read_delim("5.prad_broad_2013/5.prad_broad_2013/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE))
raw_mt[[6]] <- as.data.frame(read_delim("6.prad_broad_2012/6.prad_broad_2012/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE))
raw_mt[[7]] <- as.data.frame(read_delim("7.prad_eururol_2017/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE))
raw_mt[[8]] <- as.data.frame(read_delim("8.prad_fhcrc_NM_2016/8.prad_fhcrc_NM_2016/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE))
raw_mt[[9]] <- as.data.frame(read_delim("9.prad_mskcc/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE))
raw_mt[[10]] <- as.data.frame(read_delim("10.prad_MSKCC_DFCI_NG_2018/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE,skip=1))
raw_mt[[11]] <- as.data.frame(read_delim("11.prad_tcga_pub/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE))
raw_mt[[12]] <- as.data.frame(read_delim("13.prad_mskcc_cheny1_organoids_2014/13.prad_mskcc_cheny1_organoids_2014/data_mutations_extended.txt", "\t", skip=1,col_names = TRUE, progress = FALSE))
raw_mt[[13]] <- as.data.frame(read_delim("14.prad_mich_Nature_2012/14.prad_mich_Nature_2012/data_mutations_extended.txt", "\t", col_names = TRUE,skip=1, progress = FALSE))
#check sample ID
raw_mt[[1]]$Tumor_Sample_Barcode[grep("^TCGA",raw_mt[[1]]$Tumor_Sample_Barcode)]=sub("-01","",raw_mt[[1]]$Tumor_Sample_Barcode[grep("^TCGA",raw_mt[[1]]$Tumor_Sample_Barcode)])
raw_mt[[1]]$Tumor_Sample_Barcode[grep("-Tumor",raw_mt[[1]]$Tumor_Sample_Barcode)]=sub("-Tumor","",raw_mt[[1]]$Tumor_Sample_Barcode[grep("-Tumor",raw_mt[[1]]$Tumor_Sample_Barcode)])
raw_mt[[11]]$Tumor_Sample_Barcode[grep("^TCGA",raw_mt[[11]]$Tumor_Sample_Barcode)]=sub("-01","",raw_mt[[11]]$Tumor_Sample_Barcode[grep("^TCGA",raw_mt[[11]]$Tumor_Sample_Barcode)])

#integral mutation data
#check the column of HGVSp_Short
colnames(raw_mt[[1]])[35]="HGVSp_Short"
gl=which(is.na(raw_mt[[5]][,"HGVSp_Short"]))
raw_mt[[5]][gl,"HGVSp_Short"]=raw_mt[[5]][gl,"Protein_Change"]
col_name=foreach(i=1:13,.combine=c) %do% colnames(raw_mt[[i]])
com_name=names(table(col_name)[table(col_name)==13])
feature=intersect(colnames(raw_mt[[2]]),com_name)

library(foreach)
study=tolower(c("cpcg_2017","su2c_2015","mskcc_2017","wcm_2016","broad_2013","broad_2012",
                        "eururol_2017","fhcrc_NM_2016","mskcc_2010","MSKCC_DFCI_NG_2018",
                        "tcga_pub","mskcc_cheny_2014","mich_2012"))
names(raw_mt)=study

allct=read.table("Abroad_meta_data.xls",sep="\t",header=T,stringsAsFactors = F)
rownames(allct)=allct$SAMPLE_ID

#caculate somatic mutation frequency

nonsys=c("Missense_Mutation","Nonsense_Mutation","Nonstop_Mutation","Splice_Site","Frame_Shift_Del","Frame_Shift_Ins","In_Frame_Ins","In_Frame_Del")

if(!dir.exists("mut_freq")){
dir.create("mut_freq")
}
Mut=list()
for(i in 1:length(study)){
  nct=allct[grep(study[i],allct$source),]
  mut=raw_mt[study[i]][[1]]
  mut$Tumor_Sample_Barcode=factor(mut$Tumor_Sample_Barcode,levels=nct$SAMPLE_ID)
  
  mt_cl=table(mut[is.element(mut$Variant_Classification,nonsys),c("Hugo_Symbol","Tumor_Sample_Barcode")])
  mt_cl[mt_cl!=0]=1
  
  sp_pri=nct$SAMPLE_ID[nct$SAMPLE_TYPE=="Primary"]
  sp_met=nct$SAMPLE_ID[nct$SAMPLE_TYPE=="Metastatic"]
  sp_neu=nct$SAMPLE_ID[nct$SAMPLE_TYPE=="Prostate Neuroendocrine Carcinoma"]
  
    if(length(sp_pri)>1){
      scnv=mt_cl[,sp_pri]
      pri_mt=rowSums(scnv)
      Pri_mt=data.frame(Pri_n=pri_mt,Pri_freq=pri_mt/length(sp_pri))
      rownames(Pri_mt)=rownames(scnv)
      colnames(Pri_mt)=sub("Pri",paste("Pri(",length(sp_pri),")",sep=""),colnames(Pri_mt))
    }else{
      Pri_mt=NULL
    }
    if(length(sp_met)>1){
      scnv=mt_cl[,sp_met]
      meta_mt=rowSums(scnv)
      Meta_mt=data.frame(Meta_n=meta_mt,Meta_freq=meta_mt/length(sp_met))
      rownames(Meta_mt)=rownames(scnv)
      colnames(Meta_mt)=sub("Meta",paste("Meta(",length(sp_met),")",sep=""),colnames(Meta_mt))
    }else{
      Meta_mt=NULL
    }
    if(length(sp_neu)>1){
      scnv=mt_cl[,sp_neu]
      neu_mt=rowSums(scnv)
      Neu_mt=data.frame(Neu_n=neu_mt,Neu_freq=neu_mt/length(sp_neu))
      rownames(Neu_mt)=rownames(scnv)
      colnames(Neu_mt)=sub("Neu",paste("Neu(",length(sp_neu),")",sep=""),colnames(Neu_mt))
   }else{
      Neu_mt=NULL
    }
    
    MUT=list(dt1=Pri_mt,dt2=Meta_mt,dt3=Neu_mt)
    na=foreach(k=1:length(MUT),.combine=c) %do% length(nrow(MUT[[k]]))
    MUT=MUT[na!=0]
   
    Mut[[i]]=do.call(cbind,MUT)
  write.table(Mut[[i]],paste("mut_freq",paste(study[i],"Mut_freq.xls",sep="_"),sep="/"),sep="\t",quote=F)
}

#CPGEA
mt=read.maf("Somatic_mutation.filter.208.maf", removeDuplicatedVariants = F)
our_mt=subsetMaf(mt, includeSyn = F, tsb = NULL, genes = NULL,fields = NULL, query = NULL, mafObj = FALSE, isTCGA = FALSE)
nonsys=c("Missense_Mutation","Nonsense_Mutation","Nonstop_Mutation","Splice_Site","Frame_Shift_Del","Frame_Shift_Ins","In_Frame_Ins","In_Frame_Del")

our_mt=our_mt[is.element(our_mt$Variant_Classification,nonsys),]
our_mt=our_mt[!is.element(as.character(our_mt$Tumor_Sample_Barcode),c("T13_WGS","T502_WGS")),]
our_tab=table(our_mt[,c("Hugo_Symbol","Tumor_Sample_Barcode")])
our_tab[our_tab!=0]=1
our_tab=our_tab[rowSums(our_tab)!=0,]
our_count=rowSums(our_tab)
our_freq=our_count/206
Our_study=data.frame(Gene=rownames(our_tab),CPGEA_n=our_count,CPGEA_freq=our_freq)
write.table(Our_study,"CPGEA_somatic_mutation_freq_206.xls",sep="\t",quote=F)

#TCGA_CPGEA_pipeline
mt=read.maf("TCGA_PRAD_333_WXS_20190724.maf", removeDuplicatedVariants = F)
our_mt=subsetMaf(mt, includeSyn = F, tsb = NULL, genes = NULL,fields = NULL, query = NULL, mafObj = FALSE, isTCGA = FALSE)
nonsys=c("Missense_Mutation","Nonsense_Mutation","Nonstop_Mutation","Splice_Site","Frame_Shift_Del","Frame_Shift_Ins","In_Frame_Ins","In_Frame_Del")

our_mt=our_mt[is.element(our_mt$Variant_Classification,nonsys),]
our_tab=table(our_mt[,c("Hugo_Symbol","Tumor_Sample_Barcode")])
our_tab[our_tab!=0]=1
our_count=rowSums(our_tab)
our_freq=our_count/333
Our_study=data.frame(Gene=rownames(our_tab),TCGA_CP_n=our_count,TCGA_CP_freq=our_freq)
write.table(Our_study,"TCGA_CPGEA_somatic_mutation_freq_333.xls",sep="\t",quote=F)

#merge all study 
Mut=list()
Mut=foreach(i=1:length(study)) %do% read.table(paste("mut_freq",paste(study[i],"Mut_freq.xls",sep="_"),sep="/"),sep="\t",header=T,stringsAsFactor=F,check.name=F)
#Our_study
CPGEA=read.table("CPGEA_somatic_mutation_freq_206.xls",sep = "\t",header=T,stringsAsFactors=F)[,-1]
Mut[[14]]=CPGEA
colnames(Mut[[14]])=sub("CPGEA","CPGEA(206)",colnames(Mut[[14]]))
TCGA_CP=read.table("TCGA_CPGEA_somatic_mutation_freq_333.xls",sep = "\t",header=T,stringsAsFactors=F)[,-1]
Mut[[15]]=TCGA_CP
colnames(Mut[[15]])=sub("TCGA_CP","TCGA_CP(333)",colnames(Mut[[15]]))

names(Mut)=c(study,"CPGEA","TCGA_CP")

for(i in 1:length(Mut)){
  colnames(Mut[[i]])=sub("dt3",study[i],sub("dt2",study[i],sub("dt1",study[i],colnames(Mut[[i]]))))
  Mut[[i]]$Gene=toupper(rownames(Mut[[i]]))
}

dt=Mut[[1]]
for(i in 2:length(Mut)){
  dt=merge(dt,Mut[[i]],by="Gene",all=T)
}
dt[is.na(dt)]=0
write.table(dt,"Mut_study_freq_compare_12Abroad_EU_CPGEA_TCGACP.xls",sep="\t",quote=F)

##Step2:
#Calculate somatic mutation frequency of Primary, Metastatic and Neuroendocrine,respecively.

raw_mt=list()
raw_mt[[1]]<- as.data.frame(read_delim("1.prad_cpcg_2017/1.prad_cpcg_2017/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE))
raw_mt[[2]] <- as.data.frame(read_delim("2.prad_su2c_2015/2.prad_su2c_2015/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE,col_types=cols(
  .default = col_character())))
raw_mt[[3]] <- as.data.frame(read_delim("3.prad_mskcc_2017/3.prad_mskcc_2017/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE))
raw_mt[[4]] <- as.data.frame(read_delim("4.nepc_wcm_2016/4.nepc_wcm_2016/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE))
raw_mt[[5]] <- as.data.frame(read_delim("5.prad_broad_2013/5.prad_broad_2013/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE))
raw_mt[[6]] <- as.data.frame(read_delim("6.prad_broad_2012/6.prad_broad_2012/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE))
raw_mt[[7]] <- as.data.frame(read_delim("7.prad_eururol_2017/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE))
raw_mt[[8]] <- as.data.frame(read_delim("8.prad_fhcrc_NM_2016/8.prad_fhcrc_NM_2016/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE))
raw_mt[[9]] <- as.data.frame(read_delim("9.prad_mskcc/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE))
raw_mt[[10]] <- as.data.frame(read_delim("10.prad_MSKCC_DFCI_NG_2018/data_mutations_extended.txt", "\t", col_names = TRUE, progress = FALSE,skip=1))
raw_mt[[11]] <- as.data.frame(read_delim("TCGA_PRAD_333_WXS_20190724.maf", "\t", col_names = TRUE, progress = FALSE))
raw_mt[[12]] <- as.data.frame(read_delim("13.prad_mskcc_cheny1_organoids_2014/13.prad_mskcc_cheny1_organoids_2014/data_mutations_extended.txt", "\t", skip=1,col_names = TRUE, progress = FALSE))
raw_mt[[13]] <- as.data.frame(read_delim("14.prad_mich_Nature_2012/14.prad_mich_Nature_2012/data_mutations_extended.txt", "\t", col_names = TRUE,skip=1, progress = FALSE))

#check sample ID
raw_mt[[1]]$Tumor_Sample_Barcode[grep("^TCGA",raw_mt[[1]]$Tumor_Sample_Barcode)]=sub("-01","",raw_mt[[1]]$Tumor_Sample_Barcode[grep("^TCGA",raw_mt[[1]]$Tumor_Sample_Barcode)])
raw_mt[[1]]$Tumor_Sample_Barcode[grep("-Tumor",raw_mt[[1]]$Tumor_Sample_Barcode)]=sub("-Tumor","",raw_mt[[1]]$Tumor_Sample_Barcode[grep("-Tumor",raw_mt[[1]]$Tumor_Sample_Barcode)])
raw_mt[[11]]$Tumor_Sample_Barcode[grep("^TCGA",raw_mt[[11]]$Tumor_Sample_Barcode)]=sub("-01A|-01B","",raw_mt[[11]]$Tumor_Sample_Barcode[grep("^TCGA",raw_mt[[11]]$Tumor_Sample_Barcode)])

#integral mutation data
colnames(raw_mt[[1]])[35]="HGVSp_Short"
gl=which(is.na(raw_mt[[5]][,"HGVSp_Short"]))
raw_mt[[5]][gl,"HGVSp_Short"]=raw_mt[[5]][gl,"Protein_Change"]
col_name=foreach(i=1:13,.combine=c) %do% colnames
prot_chage=foreach(j=1:nrow(raw_mt[[11]])) %do% unlist(strsplit(raw_mt[[11]]$AAchange[j],split=":"))
prot_chage=foreach(j=1:nrow(raw_mt[[11]]),.combine=c) %do% prot_chage[[j]][length(prot_chage[[j]])]
raw_mt[[11]]$HGVSp_Short=prot_chage
raw_mt[[11]]$HGVSp_Short[raw_mt[[11]]$HGVSp_Short=="."]=NA
col_name=foreach(i=1:13,.combine=c) %do% colnames(raw_mt[[i]])
com_name=names(table(col_name)[table(col_name)==13])
feature=intersect(colnames(raw_mt[[2]]),com_name)

#exclude EU
sub_mt=foreach(i=c(1:6,8:13)) %do% raw_mt[[i]][,feature]
all_mt=do.call(rbind,sub_mt)
all_mt$Hugo_Symbol=toupper(all_mt$Hugo_Symbol)
write.table(all_mt,"Abroad_mutations_TCGAPC.xls",row.names = F,sep="\t",quote=F)

#calculate all primary,meta,neu
allct=read.table("Abroad_meta_data.xls",sep="\t",header=T,stringsAsFactors = F)
rownames(allct)=allct$SAMPLE_ID
#sample file remove eururol_2017 and mskcc_2014
subct=allct[allct$source!="eururol_2017"&allct$source!="mskcc_2014",]
all_mt <- as.data.frame(read_delim("Abroad_mutations_TCGAPC.xls", "\t", col_names = TRUE, progress = FALSE))
all_mt$Tumor_Sample_Barcode=factor(all_mt$Tumor_Sample_Barcode,levels=subct$SAMPLE_ID)
cnv_cl=table(all_mt[is.element(all_mt$Variant_Classification,nonsys),c("Hugo_Symbol","Tumor_Sample_Barcode")])
cnv_cl[cnv_cl!=0]=1
 
sp_pri=subct$SAMPLE_ID[subct$SAMPLE_TYPE=="Primary"]
sp_met=subct$SAMPLE_ID[subct$SAMPLE_TYPE=="Metastatic"]
sp_neu=subct$SAMPLE_ID[subct$SAMPLE_TYPE=="Prostate Neuroendocrine Carcinoma"]

scnv=cnv_cl[,sp_pri]
pri_n=rowSums(scnv)
pri_freq=rowMeans(scnv)
Pri_mut=data.frame(Pri_n=pri_n,Pri_freq=pri_freq)
rownames(Pri_mut)=rownames(scnv)
colnames(Pri_mut)=sub("Pri",paste("Pri(",length(sp_pri),")",sep=""),colnames(Pri_mut))
 
scnv=cnv_cl[,sp_met]
meta_n=rowSums(scnv)
meta_freq=rowMeans(scnv)
Meta_mut=data.frame(Meta_n=meta_n,Meta_freq=meta_freq)
rownames(Meta_mut)=rownames(scnv)
colnames(Meta_mut)=sub("Meta",paste("Meta(",length(sp_met),")",sep=""),colnames(Meta_mut))

scnv=cnv_cl[,sp_neu]
neu_n=rowSums(scnv)
neu_freq=rowMeans(scnv)
Neu_mut=data.frame(Neu_n=neu_n,Neu_freq=neu_freq)
rownames(Neu_mut)=rownames(scnv)
colnames(Neu_mut)=sub("Neu",paste("Neu(",length(sp_neu),")",sep=""),colnames(Neu_mut))

Mut=cbind(cbind(Pri_mut,Meta_mut),Neu_mut)
 
write.table(Mut,"Mut_Pri_Met_Neu_freq.xls",sep = "\t",quote=F)

##Step3:
##Merge frequency and gene function annotation.
mut_apart=read.table("Mut_study_freq_compare_12Abroad_EU_CPGEA_TCGACP.xls",sep="\t",header=T,stringsAsFactors = F,check.names = F)
mut_pri_meta=read.table("Mut_Pri_Met_Neu_freq.xls",sep="\t",header=T,stringsAsFactors = F,check.names = F)
mut_pri_meta$Gene=rownames(mut_pri_meta)

mut_all=merge(mut_apart,mut_pri_meta,by="Gene",all=T)
mut_all[is.na(mut_all)]=0

mut_all$p_CPGEA_vs_Primary=""
mut_all$p_CPGEA_vs_Metasta=""
mut_all$p_CPGEA_vs_Neuo=""
mut_all$p_CPGEA_vs_TCGACP=""
mut_all$q_CPGEA_vs_Primary=""
mut_all$q_CPGEA_vs_Metasta=""
mut_all$q_CPGEA_vs_Neuo=""
mut_all$q_CPGEA_vs_TCGACP=""

gl=which(mut_all$`CPGEA(206)_freq`!=0)
mut_all$p_CPGEA_vs_Primary[gl]=foreach(j=gl,.combine = c) %do% prop.test(as.numeric(mut_all$`Pri(1400)_n`[j]),1400,mut_all$`CPGEA(206)_freq`[j])$p.value
mut_all$p_CPGEA_vs_Metasta[gl]=foreach(j=gl,.combine = c) %do% prop.test(as.numeric(mut_all$`Meta(880)_n`[j]),880,mut_all$`CPGEA(206)_freq`[j])$p.value
mut_all$p_CPGEA_vs_Neuo[gl]=foreach(j=gl,.combine = c) %do% prop.test(as.numeric(mut_all$`Neu(54)_n`[j]),54,mut_all$`CPGEA(206)_freq`[j])$p.value
mut_all$p_CPGEA_vs_TCGACP[gl]=foreach(j=gl,.combine = c) %do% prop.test(as.numeric(mut_all$`TCGA_CP(333)_n`[j]),333,mut_all$`CPGEA(206)_freq`[j])$p.value

mut_all$q_CPGEA_vs_Primary[gl]=p.adjust(as.numeric(mut_all$p_CPGEA_vs_Primary[gl]),method = "BH")
mut_all$q_CPGEA_vs_Metasta[gl]=p.adjust(as.numeric(mut_all$p_CPGEA_vs_Metasta[gl]),method = "BH")
mut_all$q_CPGEA_vs_Neuo[gl]=p.adjust(as.numeric(mut_all$p_CPGEA_vs_Neuo[gl]),method = "BH")
mut_all$q_CPGEA_vs_TCGACP[gl]=p.adjust(as.numeric(mut_all$p_CPGEA_vs_TCGACP[gl]),method = "BH")

colnames(mut_all)[grep("^p_CPGEA_vs",colnames(mut_all))]=paste(colnames(mut_all)[grep("^p_CPGEA_vs",colnames(mut_all))],"(prop.test)",sep = "")
colnames(mut_all)[grep("^q_CPGEA_vs",colnames(mut_all))]=paste(colnames(mut_all)[grep("^q_CPGEA_vs",colnames(mut_all))],"(BH)",sep = "")

#add function anotation
func=read.table("CancerGenesList.txt",sep="\t",header=T,stringsAsFactors=F,row.names=1)
func$OncoKB.OG[func$OncoKB.OG==""]=NA
func$OncoKB.TSG[func$OncoKB.TSG==""]=NA
func$OG_TSG=foreach(i=1:nrow(func),.combine = c) %do% paste(na.omit(c(func$OncoKB.OG[i],func$OncoKB.TSG[i])),collapse=",")
ccg <- as.data.frame(readr::read_csv("Census_allTue Jan  2 12_17_09 2018.csv"))
cgc_gene=ccg$`Gene Symbol`
rownames(ccg)=ccg$`Gene Symbol`
cgc_smg=read.table("CGC_SMG_gene.xls",sep="\t",header=T,row.names=1,stringsAsFactors=F)

pathw=as.data.frame(readxl::read_excel("Pathway+gene+oncogene+TSG+curated2.xlsx")[,1:3])
colnames(pathw)[1]="Gene"
rownames(pathw)=pathw$Gene

mut_all$OncoKB.Annotated=""
mut_all[is.element(mut_all$Gene,rownames(func)),"OncoKB.Annotated"]=func[mut_all$Gene[is.element(mut_all$Gene,rownames(func))],2]
mut_all$OncoKB_OG_TSG=""
mut_all[is.element(mut_all$Gene,rownames(func)),"OncoKB_OG_TSG"]=func[mut_all$Gene[is.element(mut_all$Gene,rownames(func))],"OG_TSG"]

#CGC
mut_all$SMG_source=""
mut_all$SMG=""
mut_all[is.element(mut_all$Gene,rownames(cgc_smg)),c("SMG_source","SMG")]=cgc_smg[mut_all$Gene[is.element(mut_all$Gene,rownames(cgc_smg))],]
mut_all$CGC=mut_all$SMG
mut_all$SMG=sub("cgc","",sub("cgc;","",mut_all$SMG))
mut_all$CGC=sub("smg","",sub(";smg","",mut_all$CGC))
mut_all$CGC_OG_TSG=""
mut_all[is.element(mut_all$Gene,rownames(ccg)),"CGC_OG_TSG"]=ccg[mut_all$Gene[is.element(mut_all$Gene,rownames(ccg))],"Role in Cancer"]

mut_all$pathwaylist=""
mut_all$OG_TSG=""
pthl=which(is.element(mut_all$Gene,unique(pathw$Gene)))
mut_all[pthl,c("OG_TSG","pathwaylist")]=pathw[mut_all$Gene[pthl],2:3]
mut_all[is.na(mut_all)]=""
write.table(mut_all,"Mut_freq_compare_summary.xls",sep = "\t",quote=F,row.names = F)

Amp_freq_compare_summary & Del_freq_compare_summary

##Step1:
##Calculate Amplification and deletion frequency of every study, respectively.

library(readxl)
library(readr)
library(foreach)
cnv_list=list()
cnv_list[[2]] <- unique(as.data.frame(read_delim("2.prad_su2c_2015/2.prad_su2c_2015/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[3]] <- unique(as.data.frame(read_delim("3.prad_mskcc_2017/3.prad_mskcc_2017/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[4]] <- unique(as.data.frame(read_delim("4.nepc_wcm_2016/4.nepc_wcm_2016/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[5]] <- unique(as.data.frame(read_delim("5.prad_broad_2013/5.prad_broad_2013/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[6]] <- unique(as.data.frame(read_delim("6.prad_broad_2012/6.prad_broad_2012/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[7]] <- unique(as.data.frame(read_delim("7.prad_eururol_2017/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[8]] <- unique(as.data.frame(read_delim("8.prad_fhcrc_NM_2016/8.prad_fhcrc_NM_2016/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[9]] <- unique(as.data.frame(read_delim("9.prad_mskcc/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[10]] <- unique(as.data.frame(read_delim("10.prad_MSKCC_DFCI_NG_2018/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[11]] <- unique(as.data.frame(read_delim("11.prad_tcga_pub/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[12]] <- unique(as.data.frame(read_delim("12.prad_mskcc_2014/12.prad_mskcc_2014/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[13]] <- unique(as.data.frame(read_delim("14.prad_mich_Nature_2012/14.prad_mich_Nature_2012/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))

#deal with duplication gene
cnv_list[[4]]=cnv_list[[4]][-c(4704,15435),]
cnv_list[[6]]$Hugo_Symbol[c(12823,10931)]=c("PLPPR3","INPP5K")
cnv_list[[6]]=cnv_list[[6]][-c(19724,27126),]
cnv_list[[8]]=cnv_list[[8]][-c(2108,2109),]
gl=names(table(cnv_list[[9]]$Hugo_Symbol))[table(cnv_list[[9]]$Hugo_Symbol)!=1]
gln=which(is.element(cnv_list[[9]]$Hugo_Symbol,gl))
cnv_list[[9]]=cnv_list[[9]][-c(1738,1739,8347),]
cnv_list[[9]]=cnv_list[[9]][-which(is.na(cnv_list[[9]]$Hugo_Symbol)),]
cnv_list[[10]]=cnv_list[[10]][-5976,]
cnv_list[[11]]=cnv_list[[11]][-c(2123:2124),]

gl=names(table(cnv_list[[12]]$Hugo_Symbol))[table(cnv_list[[12]]$Hugo_Symbol)!=1]
gln=which(is.element(cnv_list[[12]]$Hugo_Symbol,gl))
gl_cnv=cnv_list[[12]][gln,-1]
gl_cnv[gl_cnv!=0]=1
gl_dt=data.frame(nline=gln,gene=cnv_list[[12]]$Hugo_Symbol[gln],rsum=rowSums(gl_cnv))
delt=foreach(j=gl,.combine = c) %do% gl_dt$nline[gl_dt$gene==j][-which.max(gl_dt$rsum[gl_dt$gene==j])]
cnv_list[[12]]=cnv_list[[12]][-delt,]

cnv_list[[13]][is.na(cnv_list[[13]])]=0
gl=names(table(cnv_list[[13]]$Hugo_Symbol))[table(cnv_list[[13]]$Hugo_Symbol)!=1]
gln=which(is.element(cnv_list[[13]]$Hugo_Symbol,gl))
gl_cnv=cnv_list[[13]][gln,-1]
gl_cnv[gl_cnv!=0]=1
rsum=foreach(j=1:nrow(gl_cnv),.combine=c) %do% sum(as.numeric(gl_cnv[j,]))
gl_dt=data.frame(nline=gln,gene=cnv_list[[13]]$Hugo_Symbol[gln],rsum=rsum)
delt=foreach(j=gl,.combine = c) %do% gl_dt$nline[gl_dt$gene==j][-which.max(gl_dt$rsum[gl_dt$gene==j])]
cnv_list[[13]]=cnv_list[[13]][-delt,]

colnames(cnv_list[[11]])=sub("-01","",colnames(cnv_list[[11]]))
study=tolower(c("cpcg_2017","su2c_2015","mskcc_2017","wcm_2016","broad_2013","broad_2012","eururol_2017","fhcrc_NM_2016","mskcc_2010","MSKCC_DFCI_NG_2018","tcga_pub","mskcc_2014","mich_2012"))
names(cnv_list)=study

allct=read.table("Abroad_meta_data.xls",sep="\t",header=T,stringsAsFactors = F)
rownames(allct)=allct$SAMPLE_ID

#caculate frequency
if(!dir.exists("cnv_freq")){
  dir.create("cnv_freq")
}
Amp=list()
Del=list()
for(i in c(2:13)){
  nct=allct[grep(study[i],allct$source),]
  cnv_cl=cnv_list[study[i]][[1]]
  
  Sp_pri=nct$SAMPLE_ID[nct$SAMPLE_TYPE=="Primary"]
  Sp_met=nct$SAMPLE_ID[nct$SAMPLE_TYPE=="Metastatic"]
  Sp_neu=nct$SAMPLE_ID[nct$SAMPLE_TYPE=="Prostate Neuroendocrine Carcinoma"]
  
  sp_pri=intersect(nct$SAMPLE_ID[nct$SAMPLE_TYPE=="Primary"],colnames(cnv_cl))
  sp_met=intersect(nct$SAMPLE_ID[nct$SAMPLE_TYPE=="Metastatic"],colnames(cnv_cl))
  sp_neu=intersect(nct$SAMPLE_ID[nct$SAMPLE_TYPE=="Prostate Neuroendocrine Carcinoma"],colnames(cnv_cl))
  
  #all gene
  cnv_amp=cnv_cl
  rownames(cnv_amp)=cnv_amp$Hugo_Symbol
  
  cnv_del=cnv_cl
  rownames(cnv_del)=cnv_del$Hugo_Symbol

    if(length(sp_pri)>1){
      scnv=cnv_amp[,sp_pri]
      pri_amp=foreach(j=1:nrow(scnv)) %do% length(which(scnv[j,]>1))
      Pri_Amp=data.frame(Pri_n=unlist(pri_amp),Pri_freq=unlist(pri_amp)/length(Sp_pri))
      rownames(Pri_Amp)=rownames(scnv)
      colnames(Pri_Amp)=sub("Pri",paste("Pri(",length(Sp_pri),")",sep=""),colnames(Pri_Amp))
      
      scnv=cnv_del[,sp_pri]
      pri_del=foreach(j=1:nrow(scnv)) %do% length(which(scnv[j,]<(-1)))
      Pri_Del=data.frame(Pri_n=unlist(pri_del),Pri_freq=unlist(pri_del)/length(sp_pri))
      rownames(Pri_Del)=rownames(scnv)
      colnames(Pri_Del)=sub("Pri",paste("Pri(",length(Sp_pri),")",sep=""),colnames(Pri_Del))
      
    }else{
      Pri_Amp=NULL
      Pri_Del=NULL
    }
    
    if(length(sp_met)>1){
      scnv=cnv_amp[,sp_met]
      meta_amp=foreach(j=1:nrow(scnv)) %do% length(which(scnv[j,]>1))
      Meta_Amp=data.frame(Meta_n=unlist(meta_amp),Meta_freq=unlist(meta_amp)/length(Sp_met))
      rownames(Meta_Amp)=rownames(scnv)
      colnames(Meta_Amp)=sub("Meta",paste("Meta(",length(Sp_met),")",sep=""),colnames(Meta_Amp))
      
      scnv=cnv_del[,sp_met]
      meta_del=foreach(j=1:nrow(scnv)) %do% length(which(scnv[j,]<(-1)))
      Meta_Del=data.frame(Meta_n=unlist(meta_del),Meta_freq=unlist(meta_del)/length(Sp_met))
      rownames(Meta_Del)=rownames(scnv)
      colnames(Meta_Del)=sub("Meta",paste("Meta(",length(Sp_met),")",sep=""),colnames(Meta_Del))
    }else{
      Meta_Amp=NULL
      Meta_Del=NULL
    }
    
    if(length(sp_neu)>1){
      scnv=cnv_amp[,sp_neu]
      neu_amp=foreach(j=1:nrow(scnv)) %do% length(which(scnv[j,]>1))
      Neu_Amp=data.frame(Neu_n=unlist(neu_amp),Neu_freq=unlist(neu_amp)/length(Sp_neu))
      rownames(Neu_Amp)=rownames(scnv)
      colnames(Neu_Amp)=sub("Neu",paste("Neu(",length(Sp_neu),")",sep=""),colnames(Neu_Amp))
      
      scnv=cnv_del[,sp_neu]
      neu_del=foreach(j=1:nrow(scnv)) %do% length(which(scnv[j,]<(-1)))
      Neu_Del=data.frame(Neu_n=unlist(neu_del),Neu_freq=unlist(neu_del)/length(Sp_neu))
      rownames(Neu_Del)=rownames(scnv)
      colnames(Neu_Del)=sub("Neu",paste("Neu(",length(Sp_neu),")",sep=""),colnames(Neu_Del))
    }else{
      Neu_Amp=NULL
      Neu_Del=NULL
    }
    
    AMP=list(dt1=Pri_Amp,dt2=Meta_Amp,dt3=Neu_Amp)
    na=foreach(k=1:length(AMP),.combine=c) %do% length(nrow(AMP[[k]]))
    AMP=AMP[na!=0]
    
    DEL=list(dt1=Pri_Del,dt2=Meta_Del,dt3=Neu_Del)
    na=foreach(k=1:length(DEL),.combine=c) %do% length(nrow(DEL[[k]]))
    DEL=DEL[na!=0]
    
    Amp[[i]]=do.call(cbind,AMP)
    Del[[i]]=do.call(cbind,DEL)
  
  write.table(Amp[[i]],paste("cnv_freq/",study[i],"_Amp_freq.xls",sep=""),sep="\t",quote=F)
  write.table(Del[[i]],paste("cnv_freq/",study[i],"_Del_freq.xls",sep=""),sep="\t",quote=F)
}

#cpgea
our_cnv <- as.data.frame(read.table("PC_WGS_208pairs_filt_js100.all_thresholded.by_genes.txt", sep="\t",stringsAsFactors = F,header=T))

Our_amp=foreach(i=1:nrow(our_cnv),.combine=c) %do% length(which(our_cnv[i,-c(1:3)]==2))
Our_del=foreach(i=1:nrow(our_cnv),.combine=c) %do% length(which(our_cnv[i,-c(1:3)]==-2))

Our_Amp=data.frame(CPGEA_n=Our_amp,CPGEA_freq=Our_amp/208)
rownames(Our_Amp)=our_cnv$Gene.Symbol
colnames(Our_Amp)=sub("CPGEA","CPGEA(208)",colnames(Our_Amp))
Our_Del=data.frame(CPGEA_n=Our_del,CPGEA_freq=Our_del/208)
rownames(Our_Del)=our_cnv$Gene.Symbol
colnames(Our_Del)=sub("CPGEA","CPGEA(208)",colnames(Our_Del))

write.table(Our_Amp,paste("cnv_freq/","CPGEA","_Amp_freq.xls",sep=""),sep="\t",quote=F)
write.table(Our_Del,paste("cnv_freq/","CPGEA","_Del_freq.xls",sep=""),sep="\t",quote=F)

#TCGA_cpgea
our_cnv <- as.data.frame(read.table("TCGA_PRAD_WGS_114pairs_filtered_freqcnv_blacklist_js100.all_thresholded.by_genes.txt", sep="\t",stringsAsFactors = F,header=T))

Our_amp=foreach(i=1:nrow(our_cnv),.combine=c) %do% length(which(our_cnv[i,-c(1:3)]==2))
Our_del=foreach(i=1:nrow(our_cnv),.combine=c) %do% length(which(our_cnv[i,-c(1:3)]==-2))

Our_Amp=data.frame(TCGA_CP_n=Our_amp,TCGA_CP_freq=Our_amp/114)
rownames(Our_Amp)=our_cnv$Gene.Symbol
colnames(Our_Amp)=sub("TCGA_CP","TCGA_CP(114)",colnames(Our_Amp))

Our_Del=data.frame(TCGA_CP_n=Our_del,TCGA_CP_freq=Our_del/114)
rownames(Our_Del)=our_cnv$Gene.Symbol
colnames(Our_Del)=sub("TCGA_CP","TCGA_CP(114)",colnames(Our_Del))

write.table(Our_Amp,paste("cnv_freq/","TCGA_CP","_Amp_freq.xls",sep=""),sep="\t",quote=F)
write.table(Our_Del,paste("cnv_freq/","TCGA_CP","_Del_freq.xls",sep=""),sep="\t",quote=F)

#11 abroad+EU+cpgea+tcga_cp
study=tolower(c("su2c_2015","mskcc_2017","wcm_2016","broad_2013","broad_2012",
                "eururol_2017","fhcrc_NM_2016","mskcc_2010","MSKCC_DFCI_NG_2018",
                "tcga_pub","mskcc_2014","mich_2012"))
study=c(study,"CPGEA","TCGA_CP")
Amp=foreach(i=1:length(study)) %do% read.table(paste("cnv_freq/",study[i],"_Amp_freq.xls",sep=""),sep="\t",header=T,stringsAsFactor=F,check.name=F)
Del=foreach(i=1:length(study)) %do% read.table(paste("cnv_freq/",study[i],"_Del_freq.xls",sep=""),sep="\t",header=T,stringsAsFactor=F,check.name=F)

names(Amp)=study
names(Del)=study

for(i in 1:length(Amp)){
  colnames(Amp[[i]])=sub("dt3",study[i],sub("dt2",study[i],sub("dt1",study[i],colnames(Amp[[i]]))))
  colnames(Del[[i]])=sub("dt3",study[i],sub("dt2",study[i],sub("dt1",study[i],colnames(Del[[i]]))))
  Amp[[i]]$Gene=rownames(Amp[[i]])
  Del[[i]]$Gene=rownames(Del[[i]])
}

dt1=Amp[[1]]
for(i in 2:(length(Amp)-2)){
  dt1=merge(dt1,Amp[[i]],by="Gene",all=T)
}
dt1=merge(dt1,Amp[[13]],by="Gene")
dt1=merge(dt1,Amp[[14]],by="Gene")
dt1[is.na(dt1)]=0
write.table(dt1,"Amp_study_freq_compare_11Abroad_EU_CPGEA_TCGACP.xls",sep="\t",quote=F)

dt2=Del[[1]]
for(i in 2:(length(Del)-2)){
  dt2=merge(dt2,Del[[i]],by="Gene",all=T)
}
dt2=merge(dt2,Del[[13]],by="Gene")
dt2=merge(dt2,Del[[14]],by="Gene")
dt2[is.na(dt2)]=0
write.table(dt2,"Del_study_freq_compare_11Abroad_EU_CPGEA_TCGACP.xls",sep="\t",quote=F)
##Step2:
##Calculate Amplification and deletion frequency of Primary, Metastatic and Neuroendocrine,respecively.

#Primary+meta+neu
cnv_list=list()
cnv_list[[2]] <- unique(as.data.frame(read_delim("2.prad_su2c_2015/2.prad_su2c_2015/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[3]] <- unique(as.data.frame(read_delim("3.prad_mskcc_2017/3.prad_mskcc_2017/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[4]] <- unique(as.data.frame(read_delim("4.nepc_wcm_2016/4.nepc_wcm_2016/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[5]] <- unique(as.data.frame(read_delim("5.prad_broad_2013/5.prad_broad_2013/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[6]] <- unique(as.data.frame(read_delim("6.prad_broad_2012/6.prad_broad_2012/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[7]] <- unique(as.data.frame(read_delim("7.prad_eururol_2017/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[8]] <- unique(as.data.frame(read_delim("8.prad_fhcrc_NM_2016/8.prad_fhcrc_NM_2016/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[9]] <- unique(as.data.frame(read_delim("9.prad_mskcc/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[10]] <- unique(as.data.frame(read_delim("10.prad_MSKCC_DFCI_NG_2018/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[11]] <- unique(as.data.frame(read_delim("TCGA_PRAD_WGS_114pairs_filtered_freqcnv_blacklist_js100.all_thresholded.by_genes.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[12]] <- unique(as.data.frame(read_delim("12.prad_mskcc_2014/12.prad_mskcc_2014/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
cnv_list[[13]] <- unique(as.data.frame(read_delim("14.prad_mich_Nature_2012/14.prad_mich_Nature_2012/data_CNA.txt", "\t", col_names = TRUE, progress = FALSE)))
colnames(cnv_list[[11]])[1]="Hugo_Symbol"
for(i in c(2:13)){
  cnv_list[[i]]$Hugo_Symbol=toupper(cnv_list[[i]]$Hugo_Symbol)
}

#deal with duplication gene
cnv_list[[4]]=cnv_list[[4]][-c(4704,15435),]
cnv_list[[6]]$Hugo_Symbol[c(12823,10931,18848,17653)]=c("PLPPR3","INPP5K","FAM172BP","RGL4")
cnv_list[[6]]=cnv_list[[6]][-c(19724,27126),]
cnv_list[[8]]=cnv_list[[8]][-c(2108,2109),]
gl=names(table(cnv_list[[9]]$Hugo_Symbol))[table(cnv_list[[9]]$Hugo_Symbol)!=1]
gln=which(is.element(cnv_list[[9]]$Hugo_Symbol,gl))
cnv_list[[9]]=cnv_list[[9]][-c(1738,1739,8347),]
cnv_list[[9]]=cnv_list[[9]][-which(is.na(cnv_list[[9]]$Hugo_Symbol)),]
cnv_list[[10]]=cnv_list[[10]][-5976,]

gl=names(table(cnv_list[[12]]$Hugo_Symbol))[table(cnv_list[[12]]$Hugo_Symbol)!=1]
gln=which(is.element(cnv_list[[12]]$Hugo_Symbol,gl))
gl_cnv=cnv_list[[12]][gln,-1]
gl_cnv[gl_cnv!=0]=1
gl_dt=data.frame(nline=gln,gene=cnv_list[[12]]$Hugo_Symbol[gln],rsum=rowSums(gl_cnv))
delt=foreach(j=gl,.combine = c) %do% gl_dt$nline[gl_dt$gene==j][-which.max(gl_dt$rsum[gl_dt$gene==j])]
cnv_list[[12]]=cnv_list[[12]][-delt,]

cnv_list[[13]][is.na(cnv_list[[13]])]=0
gl=names(table(cnv_list[[13]]$Hugo_Symbol))[table(cnv_list[[13]]$Hugo_Symbol)!=1]
gln=which(is.element(cnv_list[[13]]$Hugo_Symbol,gl))
gl_cnv=cnv_list[[13]][gln,-1]
gl_cnv[gl_cnv!=0]=1
rsum=foreach(j=1:nrow(gl_cnv),.combine=c) %do% sum(as.numeric(gl_cnv[j,]))
gl_dt=data.frame(nline=gln,gene=cnv_list[[13]]$Hugo_Symbol[gln],rsum=rsum)
delt=foreach(j=gl,.combine = c) %do% gl_dt$nline[gl_dt$gene==j][-which.max(gl_dt$rsum[gl_dt$gene==j])]
cnv_list[[13]]=cnv_list[[13]][-delt,]

colnames(cnv_list[[11]])=sub("-01A","",colnames(cnv_list[[11]]))

study=tolower(c("cpcg_2017","su2c_2015","mskcc_2017","wcm_2016","broad_2013","broad_2012",
                "eururol_2017","fhcrc_NM_2016","mskcc_2010","MSKCC_DFCI_NG_2018",
                "tcga_cp","mskcc_2014","mich_2012"))
names(cnv_list)=study

#ensure study
allct=read.table("Abroad_meta_data.xls",sep="\t",header=T,stringsAsFactors = F)
rownames(allct)=allct$SAMPLE_ID
allct$source2=allct$source
allct[colnames(cnv_list[[11]])[-c(1:3)],"source2"]="tcga_cp"
#17,12,65,237
subct=allct[allct$source2!="tcga_pub"&allct$source!="mskcc_cheny_2014"&allct$source!="eururol_2017"&allct$source!="cpcg_2017",]
subct$source2[grep("mskcc_dfci_ng_2018",subct$source2)]="mskcc_dfci_ng_2018"
subct=subct[subct$source2!="cpcg_2017;tcga_pub",]
subct$source2[grep("broad_2013",subct$source2)]="broad_2013"
subct$source2[grep("broad_2012",subct$source2)]="broad_2012"
write.table(subct,"Aborad_CNAs_meta.xls",sep="\t",row.names = F,quote = F)

for(i in c(2:6,8:13)){
  cnv_list[study[i]][[1]]=cnv_list[study[i]][[1]][,c("Hugo_Symbol",intersect(colnames(cnv_list[study[i]][[1]]),subct$SAMPLE_ID[subct$source2==study[i]]))] 
  cnv_list[study[i]][[1]]$Hugo_Symbol=toupper(cnv_list[study[i]][[1]]$Hugo_Symbol)
}

all_cnv=cnv_list[[2]]
for(i in c(3:6,8:13)){
  all_cnv=merge(all_cnv,cnv_list[[i]],by="Hugo_Symbol",all=T)
}
all_cnv[is.na(all_cnv)]=0
write.table(all_cnv[-1,],"Abroad_CNAs_TCGAPC.xls",sep="\t",quote=F,row.names = F)

#calculate freq
cnv <- read.table("Abroad_CNAs_TCGAPC.xls", sep="\t", header = T,row.names = 1,check.names = F)
nct=read.table("Aborad_CNAs_meta.xls",sep="\t",header=T,stringsAsFactors = F)
cnv_cl=t(cnv)
cnv_cl[is.na(cnv_cl)]=0

Sp_pri=nct$SAMPLE_ID[nct$SAMPLE_TYPE=="Primary"]
Sp_met=nct$SAMPLE_ID[nct$SAMPLE_TYPE=="Metastatic"]
Sp_neu=nct$SAMPLE_ID[nct$SAMPLE_TYPE=="Prostate Neuroendocrine Carcinoma"]

sp_pri=intersect(nct$SAMPLE_ID[nct$SAMPLE_TYPE=="Primary"],rownames(cnv_cl))
sp_met=intersect(nct$SAMPLE_ID[nct$SAMPLE_TYPE=="Metastatic"],rownames(cnv_cl))
sp_neu=intersect(nct$SAMPLE_ID[nct$SAMPLE_TYPE=="Prostate Neuroendocrine Carcinoma"],rownames(cnv_cl))

scnv=cnv_cl[sp_pri,]
pri_amp=foreach(j=1:ncol(scnv)) %do% length(which(na.omit(as.numeric(scnv[,j]))>=2))
Pri_Amp=data.frame(Pri_n=unlist(pri_amp),Pri_freq=unlist(pri_amp)/length(Sp_pri))
rownames(Pri_Amp)=colnames(scnv)
colnames(Pri_Amp)=sub("Pri",paste("Pri(",length(Sp_pri),")",sep=""),colnames(Pri_Amp))

scnv=cnv_cl[sp_met,]
meta_amp=foreach(j=1:ncol(scnv)) %do% length(which(as.numeric(scnv[,j])>=2))
Meta_Amp=data.frame(Meta_n=unlist(meta_amp),Meta_freq=unlist(meta_amp)/length(Sp_met))
rownames(Meta_Amp)=colnames(scnv)
colnames(Meta_Amp)=sub("Meta",paste("Meta(",length(Sp_met),")",sep=""),colnames(Meta_Amp))

scnv=cnv_cl[sp_neu,]
neu_amp=foreach(j=1:ncol(scnv)) %do% length(which(na.omit(as.numeric(scnv[,j]))>=2))
Neu_Amp=data.frame(Neu_n=unlist(neu_amp),Neu_freq=unlist(neu_amp)/length(Sp_neu))
rownames(Neu_Amp)=colnames(scnv)
colnames(Neu_Amp)=sub("Neu",paste("Neu(",length(Sp_neu),")",sep=""),colnames(Neu_Amp))

Amp=cbind(cbind(Pri_Amp,Meta_Amp),Neu_Amp)

scnv=cnv_cl[sp_pri,]
pri_del=foreach(j=1:ncol(scnv)) %do% length(which(as.numeric(scnv[,j])<=(-2)))
Pri_Del=data.frame(Pri_n=unlist(pri_del),Pri_freq=unlist(pri_del)/length(Sp_pri))
rownames(Pri_Del)=colnames(scnv)
colnames(Pri_Del)=sub("Pri",paste("Pri(",length(Sp_pri),")",sep=""),colnames(Pri_Del))

scnv=cnv_cl[sp_met,]
meta_del=foreach(j=1:ncol(scnv)) %do% length(which(as.numeric(scnv[,j])<=(-2)))
Meta_Del=data.frame(Meta_n=unlist(meta_del),Meta_freq=unlist(meta_del)/length(Sp_met))
rownames(Meta_Del)=colnames(scnv)
colnames(Meta_Del)=sub("Meta",paste("Meta(",length(Sp_met),")",sep=""),colnames(Meta_Del))

scnv=cnv_cl[sp_neu,]
neu_del=foreach(j=1:ncol(scnv)) %do% length(which(as.numeric(scnv[,j])<=(-2)))
Neu_Del=data.frame(Neu_n=unlist(neu_del),Neu_freq=unlist(neu_del)/length(Sp_neu))
rownames(Neu_Del)=colnames(scnv)
colnames(Neu_Del)=sub("Neu",paste("Neu(",length(Sp_neu),")",sep=""),colnames(Neu_Del))

Del=cbind(cbind(Pri_Del,Meta_Del),Neu_Del)
write.table(Amp,"Amp_Pri_Met_Neu_freq.xls",sep = "\t",quote=F)
write.table(Del,"Del_Pri_Met_Neu_freq.xls",sep = "\t",quote=F)

##Step3:
##Summarize genes called by Gistic.
#gistic table
library(readxl)
cancer_cell_2010<- as.data.frame(read_excel("CNV.xlsx"))
cell_gistic_2015<- as.data.frame(read_excel("CNV.xlsx", 
                                            sheet = "2015 Cell GISTIC"))
nature_2017<- as.data.frame(read_excel("CNV.xlsx", 
                                       sheet = "2017 Nature Paper CNV"))
jco_2017<- as.data.frame(read_excel("CNV.xlsx", 
                                    sheet = "2017 JCO CNV"))
eu_cnv<- as.data.frame(read_excel("CNV.xlsx", 
                                  sheet = "EU CNV", skip = 1))
ccg <- as.data.frame(readr::read_csv("Census_allTue Jan  2 12_17_09 2018.csv"))
cgc_gene=ccg$`Gene Symbol`

#amp
gene=list()
gene[[1]]=na.omit(cancer_cell_2010$`Genetic element of interest`[2:31])
gene[[1]]=gene[[1]][-grep("-",gene[[1]])]
gene[[1]]=foreach(i=1:length(gene[[1]]),.combine=c) %do% strsplit(gene[[1]][i],split=",")[[1]]
gene[[1]]=unique(gene[[1]])

gene_2015=foreach(i=2:21,.combine = c) %do% na.omit(cell_gistic_2015[5:nrow(cell_gistic_2015),i])
gene[[2]]=unique(gene_2015)
gene[[2]]=sub("\\]","",sub("\\[","",gene[[2]]))
gene[[2]]=unique(gene[[2]])

eu_cnv1=eu_cnv[eu_cnv$`Type of CNV`=="Amplification",]#Deletion
gene_eu=foreach(i=1:nrow(eu_cnv1),.combine = c) %do% strsplit(eu_cnv1$`Genes in Region`[i],split=",")[[1]]
gene[[3]]=unique(gene_eu)

nature1_2017=nature_2017[nature_2017$Type=="Gain",]#Loss
gene[[4]]=foreach(i=1:nrow(nature1_2017),.combine = c) %do% strsplit(nature1_2017$`Gene Name`[i],split=";")[[1]]
gene[[4]]=sub("\\]","",sub("\\[","",gene[[4]]))
gene[[4]]=unique(gene[[4]])

#CPGEA
our_study1=read.table("PC_WGS_208pairs_filt_js100.amp_genes.conf_95.txt",sep="\t",stringsAsFactors = F)
our_gene1=foreach(i=2:ncol(our_study1),.combine=c) %do% our_study1[5:nrow(our_study1),i]
gene[[5]]=unique(our_gene1)
gene[[5]]=sub("\\]","",sub("\\[","",gene[[5]]))
gene[[5]]=gene[[5]][-which(gene[[5]]=="")]
gene[[5]]=na.omit(gene[[5]])
gene[[5]]=unique(gene[[5]])

#TCGA_CP
our_study1=read.table("TCGA_PRAD_WGS_114pairs_filtered_freqcnv_blacklist_js100.amp_genes.conf_95.txt",sep="\t",stringsAsFactors = F)
our_gene1=foreach(i=2:ncol(our_study1),.combine=c) %do% our_study1[5:nrow(our_study1),i]
gene[[6]]=unique(our_gene1)
gene[[6]]=sub("\\]","",sub("\\[","",gene[[6]]))
gene[[6]]=gene[[6]][-which(gene[[6]]=="")]
gene[[6]]=na.omit(gene[[6]])
gene[[6]]=unique(gene[[6]])

#merge
gene_dt=do.call(c,gene)
gene_dt=unique(gene_dt)

gene_df=matrix(0,nrow=length(gene_dt),ncol=6)
colnames(gene_df)=c("2010 MSKCC Cancer Cell","2015 TCGA Cell","2017 CH EU","2017 CRC Nature","CPGEA","TCGA_CP")
rownames(gene_df)=toupper(gene_dt)

for(i in 1:ncol(gene_df)){
  gene_df[toupper(gene[[i]]),i]=1
}
gene_df=as.data.frame(gene_df)

gene_df$cgc=""
gene_df[intersect(toupper(gene_dt),toupper(cgc_gene)),7]="*"
gene_df$class=""
gene_df$class=ifelse(gene_df$CPGEA==1&rowSums(gene_df[,c(1,2,3,4)])!=0,"recurrent1",ifelse(gene_df$CPGEA==0&rowSums(gene_df[,c(1,2,4)])>=2,"recurrent2",ifelse(gene_df$CPGEA==1&rowSums(gene_df[,c(1,2,3,4)])==0,"novel","")))

write.table(gene_df[,c(5,3,1:2,4,6,7,8)],"Amp_gistic_gene.xls",sep="\t",quote=F)

#del
gene=list()
gene[[1]]=na.omit(cancer_cell_2010$`Genetic element of interest`[33:81])
gene[[1]]=gene[[1]][-grep("-",gene[[1]])]
gene[[1]]=foreach(i=1:length(gene[[1]]),.combine=c) %do% strsplit(gene[[1]][i],split=",")[[1]]
gene[[1]][7]="RYBP"
gene[[1]]=gene[[1]][-8]
gene[[1]][34]="LCP1"
gene[[1]][8]="PDE4D"
gene[[1]]=unique(gene[[1]])

gene_2015=foreach(i=22:56,.combine = c) %do% cell_gistic_2015[5:nrow(cell_gistic_2015),i][-which(is.na(cell_gistic_2015[5:nrow(cell_gistic_2015),i]))]
gene[[2]]=unique(gene_2015)
gene[[2]]=sub("\\]","",sub("\\[","",gene[[2]]))
gene[[2]]=unique(gene[[2]])

eu_cnv1=eu_cnv[eu_cnv$`Type of CNV`=="Deletion",]#Deletion
gene_eu=foreach(i=1:nrow(eu_cnv1),.combine = c) %do% strsplit(eu_cnv1$`Genes in Region`[i],split=",")[[1]]
gene[[3]]=unique(gene_eu)

#
nature1_2017=nature_2017[nature_2017$Type=="Loss",]#Loss
gene[[4]]=foreach(i=1:nrow(nature1_2017),.combine = c) %do% strsplit(nature1_2017$`Gene Name`[i],split=";")[[1]]
gene[[4]]=sub("\\]","",sub("\\[","",gene[[4]]))
gene[[4]]=unique(gene[[4]])

#CPGEA
our_study2=read.table("PC_WGS_208pairs_filt_js100.del_genes.conf_95.txt",sep="\t",stringsAsFactors = F)
our_gene2=foreach(i=2:ncol(our_study2),.combine=c) %do% our_study2[5:nrow(our_study2),i]
gene[[5]]=unique(our_gene2)
gene[[5]]=sub("\\]","",sub("\\[","",gene[[5]]))
gene[[5]]=gene[[5]][-which(gene[[5]]=="")]
gene[[5]]=na.omit(gene[[5]])
gene[[5]]=unique(gene[[5]])

#TCGA_CP
our_study1=read.table("TCGA_PRAD_WGS_114pairs_filtered_freqcnv_blacklist_js100.del_genes.conf_95.txt",sep="\t",stringsAsFactors = F)
our_gene1=foreach(i=2:ncol(our_study1),.combine=c) %do% our_study1[5:nrow(our_study1),i]
gene[[6]]=unique(our_gene1)
gene[[6]]=sub("\\]","",sub("\\[","",gene[[6]]))
gene[[6]]=gene[[6]][-which(gene[[6]]=="")]
gene[[6]]=na.omit(gene[[6]])
gene[[6]]=unique(gene[[6]])

#merge
gene_dt=do.call(c,gene)
gene_dt=unique(gene_dt)

gene_df=matrix(0,nrow=length(gene_dt),ncol=6)
colnames(gene_df)=c("2010 MSKCC Cancer Cell","2015 TCGA Cell","2017 CH EU","2017 CRC Nature","CPGEA","TCGA_CP")
rownames(gene_df)=toupper(gene_dt)

for(i in 1:ncol(gene_df)){
  gene_df[toupper(gene[[i]]),i]=1
}
gene_df=as.data.frame(gene_df)

gene_df$cgc=""
gene_df[intersect(toupper(gene_dt),toupper(cgc_gene)),7]="*"
gene_df$class=""
gene_df$class=ifelse(gene_df$CPGEA==1&rowSums(gene_df[,c(1,2,3,4)])!=0,"recurrent1",ifelse(gene_df$CPGEA==0&rowSums(gene_df[,c(1,2,4)])>=2,"recurrent2",ifelse(gene_df$CPGEA==1&rowSums(gene_df[,c(1,2,3,4)])==0,"novel","")))
write.table(gene_df[,c(5,3,1:2,4,6,7,8)],"Del_gistic_gene.xls",sep="\t",quote=F)

##Step4:
##Gene function annotation qne merge table
amp1=read.table("Amp_study_freq_compare_11Abroad_EU_CPGEA_TCGACP.xls",sep="\t",header = T,stringsAsFactors = F,check.names = F)
amp2=read.table("Amp_Pri_Met_Neu_freq.xls",sep="\t",header = T,stringsAsFactors = F,check.names = F)
amp2$Gene=rownames(amp2)
amp3=read.table("Amp_gistic_gene.xls",sep="\t",header = T,stringsAsFactors = F,check.names = F)
amp3$Gene=rownames(amp3)
amp=merge(amp1,amp2,by="Gene")
amp=merge(amp3[,-7],amp,by="Gene",all.y = T)

del1=read.table("Del_study_freq_compare_11Abroad_EU_CPGEA_TCGACP.xls",sep="\t",header = T,stringsAsFactors = F,check.names = F)
del2=read.table("Del_Pri_Met_Neu_freq.xls",sep="\t",header = T,stringsAsFactors = F,check.names = F)
del2$Gene=rownames(del2)
del3=read.table("Del_gistic_gene.xls",sep="\t",header = T,stringsAsFactors = F,check.names = F)
del3$Gene=rownames(del3)
del=merge(del1,del2,by="Gene")
del=merge(del3[,-7],del,by="Gene",all.y = T)

#annotation
func=read.table("CancerGenesList.txt",sep="\t",header=T,stringsAsFactors=F,row.names=1)
func$OncoKB.OG[func$OncoKB.OG==""]=NA
func$OncoKB.TSG[func$OncoKB.TSG==""]=NA
func$OG_TSG=foreach(i=1:nrow(func),.combine = c) %do% paste(na.omit(c(func$OncoKB.OG[i],func$OncoKB.TSG[i])),collapse=",")
ccg <- as.data.frame(readr::read_csv("Census_allTue Jan  2 12_17_09 2018.csv"))
cgc_gene=ccg$`Gene Symbol`
rownames(ccg)=ccg$`Gene Symbol`
cgc_smg=read.table("CGC_SMG_gene.xls",sep="\t",header=T,row.names=1,stringsAsFactors=F)

pathw=as.data.frame(readxl::read_excel("Pathway+gene+oncogene+TSG+curated2.xlsx")[,1:3])
colnames(pathw)[1]="Gene"
rownames(pathw)=pathw$Gene

anot=function(dt){
  dt$p_CPGEA_vs_Primary=""
  dt$p_CPGEA_vs_Metasta=""
  dt$p_CPGEA_vs_Neuo=""
  dt$p_CPGEA_vs_TCGACP=""
  dt$q_CPGEA_vs_Primary=""
  dt$q_CPGEA_vs_Metasta=""
  dt$q_CPGEA_vs_Neuo=""
  dt$q_CPGEA_vs_TCGACP=""
  
  gl=which(dt$`CPGEA(208)_freq`!=0)
  dt$p_CPGEA_vs_Primary[gl]=foreach(j=gl,.combine = c) %do% prop.test(as.numeric(dt$`Pri(1326)_n`[j]),1326,dt$`CPGEA(208)_freq`[j])$p.value
  dt$p_CPGEA_vs_Metasta[gl]=foreach(j=gl,.combine = c) %do% prop.test(as.numeric(dt$`Meta(868)_n`[j]),868,dt$`CPGEA(208)_freq`[j])$p.value
  dt$p_CPGEA_vs_Neuo[gl]=foreach(j=gl,.combine = c) %do% prop.test(as.numeric(dt$`Neu(54)_n`[j]),54,dt$`CPGEA(208)_freq`[j])$p.value
  dt$p_CPGEA_vs_TCGACP[gl]=foreach(j=gl,.combine = c) %do% prop.test(as.numeric(dt$`TCGA_CP(114)_n`[j]),114,dt$`CPGEA(208)_freq`[j])$p.value
  
  dt$q_CPGEA_vs_Primary[gl]=p.adjust(as.numeric(dt$p_CPGEA_vs_Primary[gl]),method = "BH")
  dt$q_CPGEA_vs_Metasta[gl]=p.adjust(as.numeric(dt$p_CPGEA_vs_Metasta[gl]),method = "BH")
  dt$q_CPGEA_vs_Neuo[gl]=p.adjust(as.numeric(dt$p_CPGEA_vs_Neuo[gl]),method = "BH")
  dt$q_CPGEA_vs_TCGACP[gl]=p.adjust(as.numeric(dt$p_CPGEA_vs_TCGACP[gl]),method = "BH")
  
  colnames(dt)[grep("^p_CPGEA_vs",colnames(dt))]=paste(colnames(dt)[grep("^p_CPGEA_vs",colnames(dt))],"(prop.test)",sep = "")
  colnames(dt)[grep("^q_CPGEA_vs",colnames(dt))]=paste(colnames(dt)[grep("^q_CPGEA_vs",colnames(dt))],"(BH)",sep = "")
 
dt$OncoKB.Annotated=""
dt[is.element(dt$Gene,rownames(func)),"OncoKB.Annotated"]=func[dt$Gene[is.element(dt$Gene,rownames(func))],2]
dt$OncoKB_OG_TSG=""
dt[is.element(dt$Gene,rownames(func)),"OncoKB_OG_TSG"]=func[dt$Gene[is.element(dt$Gene,rownames(func))],"OG_TSG"]

#CGC
dt$SMG_source=""
dt$SMG=""
dt[is.element(dt$Gene,rownames(cgc_smg)),c("SMG_source","SMG")]=cgc_smg[dt$Gene[is.element(dt$Gene,rownames(cgc_smg))],]
dt$CGC=dt$SMG
dt$SMG=sub("cgc","",sub("cgc;","",dt$SMG))
dt$CGC=sub("smg","",sub(";smg","",dt$CGC))

dt$CGC_OG_TSG=""
dt[is.element(dt$Gene,rownames(ccg)),"CGC_OG_TSG"]=ccg[dt$Gene[is.element(dt$Gene,rownames(ccg))],"Role in Cancer"]

dt$pathwaylist=""
dt$OG_TSG=""
pthl=which(is.element(dt$Gene,unique(pathw$Gene)))
dt[pthl,c("OG_TSG","pathwaylist")]=pathw[dt$Gene[pthl],2:3]
dt[is.na(dt)]=""
return(dt)
}

amp_anot=anot(amp)
del_anot=anot(del)
write.table(amp_anot,"Amp_freq_compare_annot.xls",sep="\t",quote=F,row.names = F)
write.table(del_anot,"Del_freq_compare_annot.xls",sep="\t",quote=F,row.names = F)

##Step5:
##RNA(DESeq normalized count) mean annotation.
deseq=read.table("DESeq2_normalizedCount_134.xls",header=T,stringsAsFactors = F)
mRNA=deseq
t_sp=colnames(mRNA)[grep("T",colnames(mRNA))]
n_sp=sub("T","N",t_sp)

our_cnv <- as.data.frame(read.table("PC_WGS_208pairs_filt_js100.all_thresholded.by_genes.txt", sep="\t",stringsAsFactors = F,header=T,row.names = 1))
gene=intersect(rownames(mRNA),rownames(our_cnv))
sub_cnv=our_cnv[gene,-c(1:2)]
colnames(sub_cnv)=sub("_WGS","",colnames(sub_cnv))

sub_mRNA=mRNA[gene,]
sub_mRNA$T_mean=rowMeans(sub_mRNA[,t_sp])
sub_mRNA$N_mean=rowMeans(sub_mRNA[,n_sp])

sp=intersect(colnames(sub_mRNA),colnames(sub_cnv))
amp_p=c()
del_p=c()
for(i in 1:nrow(sub_mRNA)){
  cn=sub_cnv[rownames(sub_mRNA)[i],sp]
  dp=colnames(cn)[cn==0]
  asp=colnames(cn)[cn==2]
  dsp=colnames(cn)[cn==-2]
  if(length(asp)<=1){
    amp_p[i]=NA
  }else{
    t.ty=c(rep("Tumor_Amp",length(asp)),rep("Tumor_Diploid",length(dp)))
    names(t.ty)=c(asp,dp)
    n.ty=sub("Tumor","Normal",t.ty)
    names(n.ty)=sub("T","N",names(n.ty))
    ty=c(t.ty,n.ty)
    dt1=data.frame(type=as.character(ty),value=as.numeric(sub_mRNA[i,names(ty)]))
    dt1$type=factor(dt1$type,levels=c("Tumor_Amp","Tumor_Diploid","Normal_Amp","Normal_Diploid"))
    amp_p[i]=t.test(dt1$value[dt1$type=="Tumor_Amp"],dt1$value[dt1$type=="Tumor_Diploid"])$p.value
  }
  if(length(dsp)<=1){
    del_p[i]=NA
  }else{
    t.ty=c(rep("Tumor_Del",length(dsp)),rep("Tumor_Diploid",length(dp)))
    names(t.ty)=c(dsp,dp)
    n.ty=sub("Tumor","Normal",t.ty)
    names(n.ty)=sub("T","N",names(n.ty))
    ty=c(t.ty,n.ty)
    dt2=data.frame(type=as.character(ty),value=as.numeric(sub_mRNA[i,names(ty)]))
    dt2$type=factor(dt2$type,levels=c("Tumor_Del","Tumor_Diploid","Normal_Del","Normal_Diploid"))
    del_p[i]=t.test(dt2$value[dt2$type=="Tumor_Del"],dt2$value[dt2$type=="Tumor_Diploid"])$p.value
  }
}
dt=cbind(sub_mRNA[,c("T_mean","N_mean")],cbind(amp_p=amp_p,del_p=del_p))
dt[is.na(dt)]=""
dt$Gene=rownames(dt)
colnames(dt)[3]="p_Amp_vs_Diploid(t.test)"
colnames(dt)[4]="p_Del_vs_Diploid(t.test)"
dt$`p_Amp_vs_Diploid(t.test)`[dt$T_mean<1.5|dt$N_mean<1.5]=""
dt$`p_Del_vs_Diploid(t.test)`[dt$T_mean<1.5|dt$N_mean<1.5]=""

write.table(dt[,c(5,1:4)],"CNV_vs_Diploid_DESeq_ttest.xls",sep="\t",row.names = F,quote = F)

#final table:annot+expression test
amp=read.table("Amp_freq_compare_annot.xls",sep="\t",header = T,stringsAsFactors = F,check.names = F)
del=read.table("Del_freq_compare_annot.xls",sep="\t",header = T,stringsAsFactors = F,check.names = F)
exp=read.table("CNV_vs_Diploid_DESeq_ttest.xls",sep="\t",header = T,stringsAsFactors = F,check.names = F)

amp_dt=merge(amp,exp[,c(1:4)],by="Gene",all.x = T)
del_dt=merge(del,exp[,-4],by="Gene",all.x = T)
amp_dt[is.na(amp_dt)]=""
del_dt[is.na(del_dt)]=""
write.table(amp_dt,"Amp_freq_compare_summary.xls",sep="\t",row.names = F,quote = F)
write.table(del_dt,"Del_freq_compare_summary.xls",sep="\t",row.names = F,quote = F)