This is the R Markdown for Supplementary Table 11, which consists of 1 part.
#remove exon from candidates
library(foreach)
dt=read.table("filtersnv_rm_exon.txt",sep="\t",stringsAsFactors=F,header=T)
raw_dt=read.table("Candidates.Summary",sep="\t",stringsAsFactors = F)
raw_dt$sp=foreach(i=1:nrow(raw_dt),.combine=c) %do% sub("SAMPLE=","",strsplit(raw_dt$V8[i],split=";")[[1]][1])
raw_dt$region=paste(raw_dt$V1,raw_dt$V2,raw_dt$V4,raw_dt$V5,raw_dt$sp,sep="_")
new_dt=raw_dt[is.element(raw_dt$region,dt$V1),]
write.table(new_dt[,1:8],"Candidates.Summary_rm_exon",sep="\t",row.names = F,col.names = F,quote = F)#41109
#Command line:locsite-39208
cat Candidates.Summary_rm_exon|awk '{print $1,$2,$2}' |sed 's/ /\t/g'|sort|uniq >snp_locsite.bed
cat snp_locsite.bed |sort|uniq|sed 's/ /\t/g' >Output_locsite.bed
cat Candidates.Summary_rm_exon|sed 's/=/\t/g'|sed 's/;/\t/g'|awk '{print $1,$2,$2,$3,$4,$5,$6,$7,$9}'|sed 's/ /\t/g' >filter.snv_rm_exon.bed
#ensure region of interest
mkdir region.nsite
bedtools intersect -a hg38_annotation/promoter_2.2k_norep.bed -b Output_locsite.bed -c |awk '{print $1"_"$2"_"$3,$1,$2,$3,$4}'|sed 's/ /\t/g'>region.nsite/promoter_2.2k_norep.region.nsite
bedtools intersect -a hg38_annotation/promoterCore_norep.bed -b Output_locsite.bed -c |awk '{print $1"_"$2"_"$3,$1,$2,$3,$4}'|sed 's/ /\t/g'>region.nsite/promoterCore_norep.region.nsite
bedtools intersect -a hg38_annotation/3utr_norep.bed -b Output_locsite.bed -c |awk '{print $1"_"$2"_"$3,$1,$2,$3,$4}'|sed 's/ /\t/g'>region.nsite/3utr_norep.region.nsite
bedtools intersect -a hg38_annotation/5utr_norep.bed -b Output_locsite.bed -c |awk '{print $1"_"$2"_"$3,$1,$2,$3,$4}'|sed 's/ /\t/g'>region.nsite/5utr_norep.region.nsite
bedtools intersect -a hg38_annotation/exon_norep.bed -b Output_locsite.bed -c |awk '{print $1"_"$2"_"$3,$1,$2,$3,$4}'|sed 's/ /\t/g'>region.nsite/exon_norep.region.nsite
bedtools intersect -a hg38_annotation/intron_norep.bed -b Output_locsite.bed -c |awk '{print $1"_"$2"_"$3,$1,$2,$3,$4}'|sed 's/ /\t/g'>region.nsite/intron_norep.region.nsite
bedtools intersect -a hg38_annotation/enhancer_norep.bed -b Output_locsite.bed -c |awk '{print $1"_"$2"_"$3,$1,$2,$3,$4}'|sed 's/ /\t/g' >region.nsite/enhancer_norep.region.nsite
bedtools intersect -a hg38_annotation/IGR1_norep.bed -b Output_locsite.bed -c |awk '{print $1"_"$2"_"$3,$1,$2,$3,$4}'|sed 's/ /\t/g' >region.nsite/IGR1_norep.region.nsite
bedtools intersect -a hg38_annotation/IGR2_norep.bed -b Output_locsite.bed -c |awk '{print $1"_"$2"_"$3,$1,$2,$3,$4}'|sed 's/ /\t/g' >region.nsite/IGR2_norep.region.nsite
#calculate local region
library(readxl)
library(foreach)
path="region.nsite"
fun=dir(path)[grep(".region.nsite$",dir(path))]
fun=fun[-c(5,8)]
dt=foreach(i=1:length(fun)) %do% read.table(paste(path,fun[i],sep="/"),sep="\t",stringsAsFactors=F)
dt=foreach(i=1:length(dt)) %do% dt[[i]][dt[[i]]$V5!=0&(dt[[i]]$V4-dt[[i]]$V3)>=50,]
#local region: extend region of interest 10kb forward and backward
dir.create("extend_region")
genome=read.table("genome_remove_Exon_add_utr.bed",sep="\t",stringsAsFactors = F)
genome$len=genome$V3-genome$V2+1
dt_Ext=list()
for(i in 1:length(dt)){
dt_ext=list()
for(j in 1:nrow(dt[[i]])){
sgl=which(genome$V1==dt[[i]]$V2[j]&dt[[i]]$V3[j]-1>=genome$V2&dt[[i]]$V3[j]-1<=genome$V3)
egl=which(genome$V1==dt[[i]]$V2[j]&dt[[i]]$V4[j]+1>=genome$V2&dt[[i]]$V4[j]+1<=genome$V3)
if(length(sgl)!=0){
if(dt[[i]]$V3[j]-10000>=genome[sgl,2]){
s1=dt[[i]]$V3[j]-10000
e1=dt[[i]]$V3[j]-1
}else{
p=1
while(sum(genome$len[(sgl-p):(sgl-1)])<10000-dt[[i]]$V3[j]+genome[sgl,2]-1){
p=p+1
}
if(p==1){
s1=c(genome$V3[sgl-p]-(10000-dt[[i]]$V3[j]+genome[sgl,2]-1),genome$V2[sgl])
e1=c(genome$V3[(sgl-p)],dt[[i]]$V3[j]-1)
}else{
s1=c(genome$V3[sgl-p]-(10000-dt[[i]]$V3[j]+genome[sgl,2]-1)+sum(genome$len[(sgl-p+1):(sgl-1)]),genome$V2[(sgl-p+1):sgl])
e1=c(genome$V3[(sgl-p):(sgl-1)],dt[[i]]$V3[j]-1)
}
}}else{
gl=which(genome$V1==dt[[i]]$V2[j]&dt[[i]]$V3[j]>=genome$V3)
if(length(gl)!=0){
sgl=gl[which(genome$V3[gl]==max(genome$V3[gl]))]
if(genome$len[sgl]>=10000){
s1=genome$V3[sgl]-10000+1
e1=genome$V3[sgl]
}else{
p=1
while(sum(genome$len[(sgl-p):sgl])<10000){
p=p+1
}
s1=c(genome$V3[sgl-p]-10000+1+sum(genome$len[(sgl-p+1):sgl]),genome$V2[(sgl-p+1):sgl])
e1=c(genome$V3[(sgl-p):sgl])}
}else{
s2=NA
e2=NA}
}
if(length(egl)!=0){
if(dt[[i]]$V4[j]+10000<=genome[egl,3]){
s2=dt[[i]]$V4[j]+1
e2=dt[[i]]$V4[j]+10000
}else{
q=1
while(sum(genome$len[(egl+1):(egl+q)])<10000+dt[[i]]$V4[j]-genome[egl,3]){
q=q+1
}
if(q==1){
s2=c(dt[[i]]$V4[j]+1,genome$V2[(egl+1):(egl+q)])
e2=c(genome$V3[egl:(egl+q-1)],genome$V2[egl+q]+(10000+dt[[i]]$V4[j]-genome[egl,3])-1)
}else{
s2=c(dt[[i]]$V4[j]+1,genome$V2[(egl+1):(egl+q)])
e2=c(genome$V3[egl:(egl+q-1)],genome$V2[egl+q]+(10000+dt[[i]]$V4[j]-genome[egl,3])-sum(genome$len[(egl+1):(egl+q-1)])-1)
}
}}else{
gl=which(genome$V1==dt[[i]]$V2[j]&dt[[i]]$V4[j]<=genome$V2)
if(length(gl)!=0){
egl=gl[which(genome$V2[gl]==min(genome$V2[gl]))]
if(genome$len[egl]>=10000){
s2=genome$V2[egl]
e2=genome$V2[egl]+10000-1
}else{
q=1
while(sum(genome$len[egl:(egl+q)])<10000){
q=q+1
}
s2=c(genome$V2[egl:(egl+q)])
e2=c(genome$V3[egl:(egl+q-1)],genome$V2[egl+q]+10000-1-sum(genome$len[egl:(egl+q-1)]))}
}else{
s2=NA
e2=NA}
}
dt_ext[[j]]=data.frame(chr=rep(dt[[i]]$V2[j],length(s1)+length(s2)),start=c(s1,s2),end=c(e1,e2),region=rep(dt[[i]]$V1[j],length(s1)+length(s2)))
}
dt_Ext[[i]]=do.call(rbind,dt_ext)
write.table(dt_Ext[[i]],paste("extend_region","/",paste(fun[i],"up10000_down_10000_extent",sep = "_"),sep=""),row.names=F,col.names=F,sep = "\t",quote=F)
}
#mutation in local region
mkdir local.region.nsite
bedtools intersect -a extend_region/3utr_norep.region.nsite_up10000_down_10000_extent -b Output_locsite.bed -c |sed 's/ /\t/g'>local.region.nsite/3utr.local.region.nsite
bedtools intersect -a extend_region/5utr_norep.region.nsite_up10000_down_10000_extent -b Output_locsite.bed -c |sed 's/ /\t/g'>local.region.nsite/5utr.local.region.nsite
bedtools intersect -a extend_region/promoterCore_norep.region.nsite_up10000_down_10000_extent -b Output_locsite.bed -c |sed 's/ /\t/g'|sort|uniq >local.region.nsite/promoterCore.local.region.nsite
bedtools intersect -a extend_region/enhancer_norep.region.nsite_up10000_down_10000_extent -b Output_locsite.bed -c |sed 's/ /\t/g'>local.region.nsite/enhancer.local.region.nsite
bedtools intersect -a extend_region/intron_norep.region.nsite_up10000_down_10000_extent -b Output_locsite.bed -c |sed 's/ /\t/g'>local.region.nsite/intron.local.region.nsite
bedtools intersect -a extend_region/IGR2_norep.region.nsite_up10000_down_10000_extent -b Output_locsite.bed -c |sed 's/ /\t/g'>local.region.nsite/IGR2.local.region.nsite
bedtools intersect -a extend_region/exon_norep.region.nsite_up10000_down_10000_extent -b Output_locsite.bed -c |sed 's/ /\t/g'>local.region.nsite/exon.local.region.nsite
#global region
#calculate replitime in every region
#python script:
python region_replication_time.py
#calculate global region
python distance.py
#calculate the mutation samples in every region. Output_loc.bed:41109
cat Candidates.Summary_rm_exon|sed 's/=/\t/g'|sed 's/;/\t/g'|awk '{print $1,$2,$2,$4,$5,$9}' |sed 's/ /\t/g'|sort|uniq >snp_loc.bed
cat snp_loc.bed|sed 's/ /\t/g' >Output_loc.bed
mkdir mtregion
bedtools intersect -a hg38_annotation/promoterCore_norep.bed -b Output_loc.bed -wa -wb |awk '{print $1"_"$2"_"$3,$1,$2,$3,$4,$5,$6,$7,$8,$9}'|sed 's/ /\t/g'>mtregion/promoterCore.mtregion
bedtools intersect -a hg38_annotation/3utr_norep.bed -b Output_loc.bed -wa -wb |awk '{print $1"_"$2"_"$3,$1,$2,$3,$4,$5,$6,$7,$8,$9}'|sed 's/ /\t/g'>mtregion/3utr.mtregion
bedtools intersect -a hg38_annotation/5utr_norep.bed -b Output_loc.bed -wa -wb |awk '{print $1"_"$2"_"$3,$1,$2,$3,$4,$5,$6,$7,$8,$9}'|sed 's/ /\t/g'>mtregion/5utr.mtregion
bedtools intersect -a hg38_annotation/exon_norep.bed -b Output_loc.bed -wa -wb |awk '{print $1"_"$2"_"$3,$1,$2,$3,$4,$5,$6,$7,$8,$9}'|sed 's/ /\t/g'>mtregion/exon.mtregion
bedtools intersect -a hg38_annotation/intron_norep.bed -b Output_loc.bed -wa -wb |awk '{print $1"_"$2"_"$3,$1,$2,$3,$4,$5,$6,$7,$8,$9}'|sed 's/ /\t/g'>mtregion/intron.mtregion
bedtools intersect -a hg38_annotation/enhancer_norep.bed -b Output_loc.bed -wa -wb |awk '{print $1"_"$2"_"$3,$1,$2,$3,$4,$5,$6,$7,$8,$9}'|sed 's/ /\t/g' >mtregion/enhancer.mtregion
bedtools intersect -a hg38_annotation/IGR2_norep.bed -b Output_loc.bed -wa -wb |awk '{print $1"_"$2"_"$3,$1,$2,$3,$4,$5,$6,$7,$8,$9}'|sed 's/ /\t/g' >mtregion/IGR2.mtregion
#caculate the recurrent
path="mtregion"
mt=dir(path)[grep(".mtregion",dir(path))]
mt_dt=foreach(i=1:length(mt)) %do% read.table(paste(path,mt[i],sep="/"),sep="\t",stringsAsFactors=F)
loc_mt=list()
for(i in 1:length(mt_dt)){
region=unique(mt_dt[[i]]$V1)
n_mt=foreach(j=1:length(region)) %do% length(unique(mt_dt[[i]][mt_dt[[i]]$V1==region[j],ncol(mt_dt[[i]])]))
loc_mt[[i]]=data.frame(V1=region,n_sample=unlist(n_mt))
write.table(loc_mt[[i]],paste(mt[i],"nsample",sep="."),sep="\t",quote=F,row.names = F)
}
#calculate loc background
loc_rg=dir("local.region.nsite")[grep("local.region.nsite",dir("local.region.nsite"))]
loc_dt=foreach(i=1:length(loc_rg)) %do% read.table(paste("local.region.nsite",loc_rg[i],sep="/"),sep="\t",stringsAsFactors = F)
loc_mt=list()
for(i in 1:length(loc_dt)){
region=unique(loc_dt[[i]]$V4)
mt=foreach(j=1:length(region),.combine=c) %do% sum(loc_dt[[i]]$V5[loc_dt[[i]]$V4==region[j]])
loc_mt[[i]]=data.frame(V1=region,Loc_mt=mt)
}
fun=dir("region.nsite")[grep("\\.region.nsite$",dir("region.nsite"))]
fun=fun[-c(5,8)]
dt=foreach(i=1:length(fun)) %do% read.table(paste("region.nsite",fun[i],sep="/"),sep="\t",stringsAsFactors=F)
dt=foreach(i=1:length(dt)) %do% dt[[i]][dt[[i]]$V5!=0&(dt[[i]]$V4-dt[[i]]$V3)>=50,]
dt_loc=foreach(i=1:length(dt)) %do% merge(dt[[i]],loc_mt[[i]],by="V1")
for(i in 1:length(dt_loc)){
dt_loc[[i]]$loc_qi=dt_loc[[i]]$Loc_mt/20000
dt_loc[[i]]$len=dt_loc[[i]]$V4-dt_loc[[i]]$V3+1
dt_loc[[i]]$loc_pi=1-(1-dt_loc[[i]]$loc_qi)^dt_loc[[i]]$len
write.table(dt_loc[[i]],paste(fun[i],"localpi",sep="."),sep="\t",row.names = F,quote=F)
}
#merg global+local data
gl_rg=dir()[grep("replitime.nsite.global.0.05.csv$",dir())]
gl_dt=foreach(i=1:length(gl_rg)) %do% read.csv(gl_rg[i],stringsAsFactors = F,header=T,row.names=1)
for(i in 1:length(gl_dt)){
colnames(gl_dt[[i]])[1:6]=c("V1","CHROM","start","end","len","mutation_number")
}
loc_rg=dir()[grep("localpi",dir())]
loc_dt=foreach(i=1:length(loc_rg)) %do% read.table(loc_rg[i],sep="\t",stringsAsFactors = F,header=T)
loc_gl=foreach(i=1:length(loc_dt)) %do% merge(gl_dt[[i]][,-c(7:11)],loc_dt[[i]][,c(1,(ncol(loc_dt[[i]])-3):(ncol(loc_dt[[i]])-2),ncol(loc_dt[[i]]))],by="V1")
mt=dir()[grep("nsample$",dir())]
mt_dt=foreach(i=1:length(mt)) %do% read.table(mt[i],sep="\t",header=T,stringsAsFactors=F)
mt_dt=foreach(i=1:length(loc_gl)) %do% merge(loc_gl[[i]],mt_dt[[i]],by="V1")
for(i in 1:length(mt_dt)){
P1=foreach(j=1:nrow(mt_dt[[i]])) %do% (1-pbinom((mt_dt[[i]]$n_sample[j]-1),208,(mt_dt[[i]]$loc_pi[j])))
mt_dt[[i]]$loc_p_value=unlist(P1)
mt_dt[[i]]$loc_adj_p=p.adjust(mt_dt[[i]]$loc_p_value,method = "BH")
P2=foreach(j=1:nrow(mt_dt[[i]])) %do% (1-pbinom((mt_dt[[i]]$n_sample[j]-1),208,(mt_dt[[i]]$global_pi[j])))
mt_dt[[i]]$glob_p_value=unlist(P2)
mt_dt[[i]]$glob_adj_p=p.adjust(mt_dt[[i]]$glob_p_value,method = "BH")
write.table(mt_dt[[i]],sub("global.0.05.csv","loctest_globtest.0.05.txt",gl_rg[i]),sep="\t",row.names = F,quote=F)
}
#associate gene
id_file=read.table("gencode.v27.metadata.HGNC/data")
colnames(id_file)=c("TRANS","Gene")
mt=dir("mtregion")[grep(".mtregion$",dir("mtregion"))]
mt_dt=foreach(i=1:length(mt)) %do% read.table(paste("mtregion",mt[i],sep="/"),sep="\t",stringsAsFactors=F)
annot=c("gencode.v27.3utr.bed","gencode.v27.5utr.bed","genehancer.v4.7.bed","gencode.v27.exon.bed","IGR2.1","gencode.v27.intron.bed",
"gencode.v27.promoterCore.bed")
annot_dt=foreach(i=1:length(annot)) %do% read.table(paste("hg38_annotation",annot[i],"data",sep = "/"),sep="\t",stringsAsFactors=F)
annot_dt[[5]]=annot_dt[[5]][,-c(4:6)]
assoc=list()
assoc1=list()
assoc2=list()
assoc3=list()
final_dt=list()
for(i in c(1:2,4,6)){
p=0
ct=list()
colnames(mt_dt[[i]])[1:4]=c("bin","V1","V2","V3")
assoc1[[i]]=merge(mt_dt[[i]],annot_dt[[i]],by=c("V1","V2","V3"))
bin1=setdiff(mt_dt[[i]]$bin,assoc1[[i]]$bin)
assoc2[[i]]=merge(mt_dt[[i]][is.element(mt_dt[[i]]$bin,bin1),-4],annot_dt[[i]],by=c("V1","V2"))
assoc3[[i]]=merge(mt_dt[[i]][is.element(mt_dt[[i]]$bin,bin1),-3],annot_dt[[i]],by=c("V1","V3"))
assoc[[i]]=rbind(rbind(assoc1[[i]],assoc2[[i]]),assoc3[[i]])
colnames(assoc[[i]])[11]="TRANS"
dt=merge(assoc[[i]],id_file,by="TRANS")
dt_a=assoc[[i]][!is.element(assoc[[i]]$TRANS,dt$TRANS),]
dt_a$Gene=dt_a$TRANS
dt=as.data.frame(rbind(dt,dt_a))
region=unique(dt$bin)
for(j in 1:length(region)){
gene=unique(as.character(dt$Gene)[dt$bin==region[j]])
for(k in 1:length(gene)){
p=p+1
dat=dt[dt$bin==region[j]&dt$Gene==gene[k],]
ct[[p]]=data.frame(Region=unique(as.character(dat$bin)),CHROM=unique(as.character(dat$V1)),
Start=paste(as.character(dat$V2),collapse=";"),End=paste(as.character(dat$V3),collapse=";"),TRANS=paste(unique(as.character(dat$TRANS)),collapse=";"),
Gene=unique(as.character(dat$Gene)),MUT=paste(unique(paste(paste(as.character(dat$V10),paste(as.character(dat$V1),as.character(dat$V6.x),sep=":"),sep="("),")",sep="")),collapse=";"),
Sample=paste(unique(as.character(dat$V10)),collapse=";"))
}
}
final_dt[[i]]=do.call(rbind,ct)
write.table(unique(final_dt[[i]]),sub("mtregion","associate_gene_all.xls",mt[i]),sep="\t",row.names=F,quote=F)
}
for(i in 5){
p=0
ct=list()
colnames(mt_dt[[i]])[1:4]=c("bin","V1","V2","V3")
assoc1[[i]]=merge(mt_dt[[i]],annot_dt[[i]],by=c("V1","V2","V3"))
bin1=setdiff(mt_dt[[i]]$bin,assoc1[[i]]$bin)
assoc2[[i]]=merge(mt_dt[[i]][is.element(mt_dt[[i]]$bin,bin1),-4],annot_dt[[i]],by=c("V1","V2"))
assoc3[[i]]=merge(mt_dt[[i]][is.element(mt_dt[[i]]$bin,bin1),-3],annot_dt[[i]],by=c("V1","V3"))
assoc[[i]]=rbind(rbind(assoc1[[i]],assoc2[[i]]),assoc3[[i]])
colnames(assoc[[i]])[11]="TRANS"
dt=merge(assoc[[i]],id_file,by="TRANS")
dt_a=assoc[[i]][!is.element(assoc[[i]]$TRANS,dt$TRANS),]
dt_a$Gene=dt_a$TRANS
dt=as.data.frame(rbind(dt,dt_a))
region=unique(dt$bin)
for(j in 1:length(region)){
gene=unique(as.character(dt$Gene)[dt$bin==region[j]])
for(k in 1:length(gene)){
p=p+1
dat=dt[dt$bin==region[j]&dt$Gene==gene[k],]
ct[[p]]=data.frame(Region=unique(as.character(dat$bin)),CHROM=unique(as.character(dat$V1)),
Start=paste(as.character(dat$V2),collapse=";"),End=paste(as.character(dat$V3),collapse=";"),TRANS=paste(unique(as.character(dat$TRANS)),collapse=";"),
Gene=unique(as.character(dat$Gene)),MUT=paste(unique(paste(paste(as.character(dat$V10),paste(as.character(dat$V1),as.character(dat$V6),sep=":"),sep="("),")",sep="")),collapse=";"),
Sample=paste(unique(as.character(dat$V10)),collapse=";"))
}
}
final_dt[[i]]=do.call(rbind,ct)
write.table(unique(final_dt[[i]]),sub("mtregion","associate_gene_all.xls",mt[i]),sep="\t",row.names=F,quote=F)
}
for(i in 3){
p=0
ct=list()
colnames(mt_dt[[i]])[1:4]=c("bin","V1","V2","V3")
assoc[[i]]=merge(mt_dt[[i]],annot_dt[[i]],by=c("V1","V2","V3"))
dt=assoc[[i]]
region=unique(dt$bin)
for(j in 1:length(region)){
gene=unique(as.character(dt$V6.y)[dt$bin==region[j]])
for(k in 1:length(gene)){
p=p+1
dat=dt[dt$bin==region[j]&dt$V6.y==gene[k],]
genehancer=strsplit(gene,split=";")[[1]]
genehancer_id=sub("genehancer_id=","",genehancer[1])
c_gene=sub("connected_gene=","",genehancer[2])
gene_score=sub("score=","",genehancer[3])
ct[[p]]=data.frame(Region=rep(unique(as.character(dat$bin)),length(c_gene)),CHROM=rep(unique(as.character(dat$V1)),length(c_gene)),
Start=rep(unique(as.character(dat$V2)),length(c_gene)),End=rep(unique(as.character(dat$V3)),length(c_gene)),genehancer_id=rep(genehancer_id,length(c_gene)),Gene=c_gene,Score=gene_score,
MUT=rep(paste(paste(paste(as.character(dat$V10),paste(as.character(dat$V1),as.character(dat$V6.x),sep=":"),sep="("),")",sep=""),collapse=";"),length(c_gene)),
Sample=rep(paste(unique(as.character(dat$V10)),collapse=";"),length(c_gene)))
}
}
final_dt[[i]]=do.call(rbind,ct)
write.table(unique(final_dt[[i]]),sub("mtregion","associate_gene_all.xls",mt[i]),sep="\t",row.names=F,quote=F)
}
for(i in 7){
p=0
ct=list()
colnames(mt_dt[[i]])[1:4]=c("bin","V1","V2","V3")
assoc1[[i]]=merge(mt_dt[[i]],annot_dt[[i]],by=c("V1","V2","V3"))
bin1=setdiff(mt_dt[[i]]$bin,assoc1[[i]]$bin)
assoc2[[i]]=merge(mt_dt[[i]][is.element(mt_dt[[i]]$bin,bin1),-4],annot_dt[[i]],by=c("V1","V2"))
assoc3[[i]]=merge(mt_dt[[i]][is.element(mt_dt[[i]]$bin,bin1),-3],annot_dt[[i]],by=c("V1","V3"))
assoc[[i]]=rbind(rbind(assoc1[[i]],assoc2[[i]]),assoc3[[i]])
colnames(assoc[[i]])[12]="TRANS"
dt=assoc[[i]]
region=unique(dt$bin)
for(j in 1:length(region)){
gene=unique(as.character(dt$V4)[dt$bin==region[j]])
for(k in 1:length(gene)){
p=p+1
dat=dt[dt$bin==region[j]&dt$V4==gene[k],]
ct[[p]]=data.frame(Region=unique(as.character(dat$bin)),CHROM=unique(as.character(dat$V1)),
Start=paste(as.character(dat$V2),collapse=";"),End=paste(as.character(dat$V3),collapse=";"),TRANS=paste(unique(as.character(dat$TRANS)),collapse=";"),
Gene=unique(as.character(dat$V4)),MUT=paste(unique(paste(paste(as.character(dat$V10),paste(as.character(dat$V1),as.character(dat$V6),sep=":"),sep="("),")",sep="")),collapse=";"),
Sample=paste(unique(as.character(dat$V10)),collapse=";"))
}
}
final_dt[[i]]=do.call(rbind,ct)
write.table(unique(final_dt[[i]]),sub("mtregion","associate_gene_all.xls",mt[i]),sep="\t",row.names=F,quote=F)
}
#expression comparison
mRNA=read.table("DESeq2_normalizedCount_134.xls",header=T,stringsAsFactors = F)
mRNA=mRNA[,-which(is.element(colnames(mRNA),c("T502","N502")))]
t_sp=colnames(mRNA)[grep("T",colnames(mRNA))]
mRNA$T_Mean=rowMeans(mRNA[,grep("T",colnames(mRNA))])
mRNA$N_Mean=rowMeans(mRNA[,grep("N",colnames(mRNA))])
file=dir()[grep("associate_gene_all.xls",dir())]
crt.dir=sub(".associate_gene_all.xls","_RNA_compare.xls",file)
dt=foreach(i=1:length(file)) %do% read.table(file[i],sep="\t",header=T,stringsAsFactors=F)
for(i in 1:length(dt)){
sp=foreach(j=1:nrow(dt[[i]])) %do% intersect(strsplit(gsub("_WGS","",dt[[i]]$Sample[j]),split=";")[[1]],t_sp)
n_sample=foreach(j=1:nrow(dt[[i]]),.combine=c) %do% length(sp[[j]])
mut_rna=foreach(j=1:nrow(dt[[i]])) %do% as.numeric(mRNA[dt[[i]]$Gene[j],sp[[j]]])
wt_rna=foreach(j=1:nrow(dt[[i]])) %do% as.numeric(mRNA[dt[[i]]$Gene[j],setdiff(t_sp,sp[[j]])])
dt[[i]]$MUT_mRNA=unlist(foreach(j=1:nrow(dt[[i]])) %do% paste(mut_rna[[j]],collapse=";"))
dt[[i]]$WT_mRNA=unlist(foreach(j=1:nrow(dt[[i]])) %do% paste(wt_rna[[j]],collapse=";"))
dt[[i]]$MUT_mean=unlist(foreach(j=1:nrow(dt[[i]])) %do% mean(mut_rna[[j]]))
dt[[i]]$WT_mean=unlist(foreach(j=1:nrow(dt[[i]])) %do% mean(wt_rna[[j]]))
dt[[i]]$T_Mean=mRNA[dt[[i]]$Gene,"T_Mean"]
dt[[i]]$N_Mean=mRNA[dt[[i]]$Gene,"N_Mean"]
dt[[i]]$p.value=NA
gl=which(n_sample>1&(dt[[i]]$T_Mean>=1.5|dt[[i]]$N_Mean>=1.5))
dt[[i]]$p.value[gl]=foreach(j=gl,.combine=c) %do% t.test(mut_rna[[j]],wt_rna[[j]])$p.value
dt[[i]]$q.value=p.adjust(dt[[i]]$p.value,method="BH")
write.table(dt[[i]],crt.dir[i],sep="\t",row.names=F,quote=F)
}
#lncRNA
mRNA=read.table("Annotated_lncRNA_FPKM_134.xls",sep="\t",header = T,stringsAsFactors = F)
colnames(mRNA)=sub("_WTS","",colnames(mRNA))
mRNA=mRNA[,-which(is.element(colnames(mRNA),c("T502","N502","T13","N13")))]
t_sp=colnames(mRNA)[-c(1:4)][grep("T",colnames(mRNA)[-c(1:4)])]
mRNA$T_Mean=rowMeans(mRNA[,t_sp])
mRNA$N_Mean=rowMeans(mRNA[,sub("T","N",t_sp)])
muti=names(table(mRNA$Official_Symbol))[table(mRNA$Official_Symbol)!=1]
for(i in 1:length(muti)){
gl=which(mRNA$Official_Symbol==muti[i])
sgl=which.max(rowMeans(mRNA[gl,c("T_Mean","N_Mean")]))
mRNA=mRNA[-gl[-sgl],]
}
rownames(mRNA)=mRNA$Official_Symbol
mRNA=mRNA[,-c(1:4)]
file=crt.dir
dat=foreach(i=1:length(file)) %do% read.table(crt.dir[i],sep="\t",header=T,stringsAsFactors=F)
dt=list()
for(i in 1:length(dat)){
lnc=intersect(dat[[i]]$Gene,rownames(mRNA))
gl=which(is.element(dat[[i]]$Gene,lnc))
if(length(gl)!=0){
dt[[i]]=dat[[i]][gl,]
sp=foreach(j=1:nrow(dt[[i]])) %do% intersect(strsplit(gsub("_WGS","",dt[[i]]$Sample[j]),split=";")[[1]],t_sp)
n_sample=foreach(j=1:nrow(dt[[i]]),.combine=c) %do% length(sp[[j]])
mut_rna=foreach(j=1:nrow(dt[[i]])) %do% as.numeric(mRNA[dt[[i]]$Gene[j],sp[[j]]])
wt_rna=foreach(j=1:nrow(dt[[i]])) %do% as.numeric(mRNA[dt[[i]]$Gene[j],setdiff(t_sp,sp[[j]])])
dt[[i]]$MUT_mRNA=unlist(foreach(j=1:nrow(dt[[i]])) %do% paste(mut_rna[[j]],collapse=";"))
dt[[i]]$WT_mRNA=unlist(foreach(j=1:nrow(dt[[i]])) %do% paste(wt_rna[[j]],collapse=";"))
dt[[i]]$MUT_mean=unlist(foreach(j=1:nrow(dt[[i]])) %do% mean(mut_rna[[j]]))
dt[[i]]$WT_mean=unlist(foreach(j=1:nrow(dt[[i]])) %do% mean(wt_rna[[j]]))
dt[[i]]$T_Mean=mRNA[dt[[i]]$Gene,"T_Mean"]
dt[[i]]$N_Mean=mRNA[dt[[i]]$Gene,"N_Mean"]
dt[[i]]$p.value=NA
gpl=which(n_sample>1&(dt[[i]]$T_Mean>=1.5|dt[[i]]$N_Mean>=1.5))
dt[[i]]$p.value[gpl]=foreach(j=gpl,.combine=c) %do% t.test(mut_rna[[j]],wt_rna[[j]])$p.value
dt[[i]]$q.value=p.adjust(dt[[i]]$p.value,method="BH")
adt=rbind(dat[[i]][-gl,],dt[[i]])
}else{
adt=dat[[i]]}
write.table(adt,sub("RNA","RNA_lnc",crt.dir[i]),sep="\t",row.names=F,quote=F)
}
#loctest_globtest.0.05.txt
mkdir loctest_globtest associate_gene_RNA
mv *loctest_globtest.0.05.txt loctest_globtest
mv *RNA_lnc_compare.xls associate_gene_RNA
library(foreach)
path1="associate_gene_RNA"
path2="loctest_globtest"
file1=dir(path1)
file2=dir(path2)
dt1=foreach(i=1:length(file1)) %do% read.table(paste(path1,file1[i],sep="/"),sep="\t",stringsAsFactors=F,header=T)
dt2=foreach(i=1:length(file2)) %do% read.table(paste(path2,file2[i],sep="/"),sep="\t",stringsAsFactors=F,header=T)
#cgc
cgc_smg=read.table("CGC_SMG_gene.xls",sep="\t",stringsAsFactors=F,header=T)
rownames(cgc_smg)=cgc_smg$Gene
dat=list()
for(i in 1:7){
colnames(dt2[[i]])[1]="Region"
dat[[i]]=merge(dt2[[i]],dt1[[i]][,-c(2:4)],by="Region")
dat[[i]]=dat[[i]][dat[[i]]$Loc_mt!=0,]
dat[[i]]$smg_source=""
dat[[i]]$smg_source[is.element(dat[[i]]$Gene,rownames(cgc_smg))]=cgc_smg[dat[[i]]$Gene[is.element(dat[[i]]$Gene,rownames(cgc_smg))],"smg_source"]
dat[[i]]$cgc_smg=""
dat[[i]]$cgc_smg[is.element(dat[[i]]$Gene,rownames(cgc_smg))]=cgc_smg[dat[[i]]$Gene[is.element(dat[[i]]$Gene,rownames(cgc_smg))],"cgc_smg"]
write.table(dat[[i]],sub("RNA_lnc_compare.xls","local_global_associate_206.xls",file1[i]),sep="\t",row.names=F,quote=F)
}