#! /usr/bin/Rscript ########################################################################################### ######################## READ ARGUMENTS ########################## ########################################################################################### args=commandArgs(trailingOnly=TRUE) Prof.file=args[1] Probes.file=args[2] Reference.file=args[3] th.bkp=as.numeric(args[4]) ########################################################################################### #################### INTERMEDIATE FUNCTIONS ###################### ########################################################################################### # Input: A vector of integers # Action: Fragment v into a series of continuous intervals # Output: A data frame giving start and end of each interval intervals=function(v) { l=length(v) from=v[1];to=c() k=1 while(k < l) { while(v[k+1]==v[k]+1 & (k+1) < l){k=k+1} if(k+1==l) { if(v[k+1]==v[k]+1){to=c(to,v[l]);k=l} else{to=c(to,v[k],v[l]);from=c(from,v[l]);k=l} } else{to=c(to,v[k]);from=c(from,v[k+1]);k=k+1} } if(l==1){from=v;to=v} if(l==0){from=c();to=c()} return(data.frame(from,to)) } # Input: A nsample x nchr matrix Schr giving the copy number for each segment of a given chromosome in each samples # Action: Compute the nsample x (nchr+1) matrix Bchr giving the shift between each segment in each sample # Output: The breakpoints matrix Bchr for this chromosome S2B <- function(Schr) { if(class(Schr)=="numeric"){Schr=matrix(Schr,nrow=1)} nsample=nrow(Schr) nchr=ncol(Schr) Bchr=matrix(nrow=nsample,ncol=nchr+1) for(i in 1:nsample) { Bchr[i,1]=Schr[i,1] if(nchr > 1){ for(j in 2:nchr) { Bchr[i,j]=Schr[i,j]-Schr[i,j-1] } } Bchr[i,nchr+1]=-Schr[i,nchr] } if(nsample==1){Bchr=as.numeric(Bchr)}else{rownames(Bchr)=rownames(Schr)} return(Bchr) } # Input: A nsample x (nchr+1) matrix Bchr giving the shift between each segment of a given chromosome in each sample # Action: Compute the nsample x nchr matrix Schr giving the copy number for each segment in each sample # Output: The segments matrix Schr B2S <- function(Bchr) { if(class(Bchr)=="numeric"){Bchr=matrix(Bchr,nrow=1)} nsample=nrow(Bchr) nchr=ncol(Bchr)-1 Schr=matrix(nrow=nsample,ncol=nchr) for(i in 1:nsample) { Schr[i,1]=Bchr[i,1] if(nchr > 1) { for(j in 2:nchr) { Schr[i,j]=Schr[i,j-1]+Bchr[i,j] } } } if(nsample==1){Schr=as.numeric(Schr)}else{rownames(Schr)=rownames(Bchr)} return(Schr) } # Input: A nsample x (nchr+1) matrix Bchr giving the shift between each segment of a given chromosome in each sample # Action: Compute the nsample x (2*nchr+2) matrix Achr giving the amplitude matrix for the chromosome # Output: The amplitude matrix Achr B2A <- function(Bchr) { if(class(Bchr)=="numeric"){Bchr=matrix(Bchr,nrow=1)} nsample=nrow(Bchr) nbkp=ncol(Bchr) Achr=matrix(nrow=nsample,ncol=2*nbkp) for(i in 1:nsample) { for(j in 1:nbkp) { if(Bchr[i,j] >= 0){Achr[i,j]=Bchr[i,j];Achr[i,nbkp+j]=0} if(Bchr[i,j] <= 0){Achr[i,nbkp+j]=-Bchr[i,j];Achr[i,j]=0} } } if(nsample==1){Achr=as.numeric(Achr)}else{rownames(Achr)=rownames(Bchr)} return(Achr) } # Input: The amplitude matrix Achr # Action: Computes the breakpoint matrix corresponding to the amplitude matrix given # Output: The breakpoint matrix A2B <- function(Achr) { if(class(Achr)=="numeric"){Achr=matrix(Achr,nrow=1)} nsample=nrow(Achr) nbkp=ncol(Achr)/2 Bchr=matrix(nrow=nsample,ncol=nbkp) for(i in 1:nsample) { for(j in 1:nbkp) { Bchr[i,j]=Achr[i,j]-Achr[i,nbkp+j] } } if(nsample==1){Bchr=as.numeric(Bchr)}else{rownames(Bchr)=rownames(Achr)} return(Bchr) } # Input: A vector giving the chromosome for each probe, the y-coordinate at which to write the chromosome name and the size cx of the labels # Action: Add the chromosome limits on the active plot affchrom=function(chr,ord,cx,texte=TRUE) { # Détection des limites entre les chromosomes nprobe=length(chr) limchrom=c() for(i in 1:(nprobe-1)) { if(chr[i]!=chr[i+1]){limchrom=c(limchrom,i+0.5)} } # Traçage des limites sur le plot for(i in 1:(max(chr)-1)) { abline(v=limchrom[i],col="darkgrey",lwd=1) } # Ecriture du numéro de chaque chromosome labchr=as.character(unique(chr));labchr[labchr=="23"]="X";labchr[labchr=="24"]="Y" xcord=rep(-9,max(chr)) xcord[1]=limchrom[1]/2 for(i in 2:(max(chr)-1)){xcord[i]=(limchrom[i]+limchrom[i-1])/2} xcord[max(chr)]=(nprobe+limchrom[max(chr)-1])/2 ycord=rep(ord,max(chr)) if(texte==T){text(labels=labchr,x=xcord,y=ycord,cex=cx,col="darkgrey")} } ################################################################################ ############### SEGMENTATION ############################ ################################################################################ # Input: A nsample x nprobe P matrix containing the discretized copy number profiles of all samples from one patient # Action: Merge breakpoints of the same sign that are closer than a given threshold # Output: The P matrix in which very close breakpoints have been merged merge.breakpoints=function(P,th.bkp,chr) { number.merged=0 for(c in unique(chr)) { p=S2B(P[,chr==c]) Bup=which(p > 0,arr.ind=TRUE) Bdown=which(p < 0,arr.ind=TRUE) if(nrow(Bup) > 0) { for(i in 1:nrow(Bup)) { d=abs(Bup[,2]-Bup[i,2]) close=which(d > 0 & d <= th.bkp & Bup[,1]!=Bup[i,1]) if(length(close) > 0) { middle=round(mean(c(Bup[i,2],Bup[close,2]))) for(k in c(i,close)) { if(setequal(unique(p[Bup[k,1],(Bup[k,2]:middle)[-1]]),0)){ p[Bup[k,1],middle]=p[Bup[k,1],Bup[k,2]] p[Bup[k,1],Bup[k,2]]=0 Bup[k,2]=middle number.merged=number.merged+1 } } } } } if(nrow(Bdown) > 0) { for(i in 1:nrow(Bdown)) { d=abs(Bdown[,2]-Bdown[i,2]) close=which(d > 0 & d <= th.bkp & Bdown[,1]!=Bdown[i,1]) if(length(close) > 0) { middle=round(mean(c(Bdown[i,2],Bdown[close,2]))) for(k in c(i,close)) { if(setequal(unique(p[Bdown[k,1],(Bdown[k,2]:middle)[-1]]),0)){ p[Bdown[k,1],middle]=p[Bdown[k,1],Bdown[k,2]] p[Bdown[k,1],Bdown[k,2]]=0 Bdown[k,2]=middle number.merged=number.merged+1 } } } } } P[,chr==c]=B2S(p) } print(paste("->",number.merged,"breakpoints closer than",th.bkp,"probes were identified and were merged.")) return(P) } # Input: A nsample x nprobe P matrix containing the discretized copy number profiles of all samples from one patient # Action: Partition all chromosomes into segments delineated by all breakpoints found in at least one sample # Output: A data frame describing each segment segments.delineation=function(P,chr,pos,name,cytostart,cytoend,lobound=NULL,upbound=NULL) { nsample=nrow(P);nprobe=ncol(P) Segment.ID=c() Start.probe.number=c();End.probe.number=c() for(i in 1:max(chr)) { bkp=c() x=which(chr==i) if(length(x) > 1) { for(e in 1:nsample) { for(j in 2:length(x)) { if(P[e,x[j]]!=P[e,x[j-1]] & j%in%bkp==FALSE){bkp=c(bkp,j)} } } } if(length(bkp) > 0){bkp=sort(bkp)} Start.probe.number=c(Start.probe.number,x[c(1,bkp)]) End.probe.number=c(End.probe.number,x[c(bkp-1,length(x))]) Segment.ID=c(Segment.ID,paste(i,".",1:(length(bkp)+1),sep="")) } Chromosome=chr[Start.probe.number] # Note: all probes of the segment are necessarily on the same chromosome Start.probe.name=name[Start.probe.number];End.probe.name=name[End.probe.number] if(!is.null(lobound) & !is.null(upbound)){ Start.position=lobound[Start.probe.number] End.position=upbound[End.probe.number]}else{ Start.position=pos[Start.probe.number];End.position=pos[End.probe.number]} Size=End.position-Start.position Start.cytoband=cytostart[Start.probe.number];End.cytoband=cytoend[End.probe.number] return(data.frame(Segment.ID,Start.probe.name,End.probe.name,Start.probe.number,End.probe.number,Chromosome,Start.position,End.position,Size,Start.cytoband,End.cytoband)) } # Input: the nsample x nprobe P matrix and the data frame describing all segments # Action: Make the matrix containing the copy number value for each segment in each sample # Output: the nsample x nsegm segments matrix profile.segmentation=function(P,segments.table) { nsample=nrow(P);nsegm=nrow(segments.table) S=matrix(nrow=nsample,ncol=nsegm) for(s in 1:nsegm) { for(e in 1:nsample) { S[e,s]=P[e,segments.table$Start.probe.number[s]] # Note: copy number is the same on any probe of the segment } } rownames(S)=rownames(P);colnames(S)=rownames(segments) return(S) } ################################################################################### ############# TUMORS' TREE RECONSTRUCTION ################ ################################################################################### # Input: 2 vectors containing the segmented profiles of 2 samples, the data frame describing all segments and 2 lists containing the frequencies of breakpoints of each sign in the reference data set # Action: Compute the partial identity score between the 2 samples # Output: the partial identity score between the 2 samples identity.score <- function(sampleA,sampleB,segments.table,Fup,Fdown) { CBS=0;D=0 for(c in unique(segments.table$Chromosome)) { indc=which(segments.table$Chromosome==c) Schr=rbind(sampleA[indc],sampleB[indc]) Bchr=S2B(Schr);nbkp=ncol(Bchr) Bcp=numeric(nbkp) for(j in 1:nbkp) { s=sign(Bchr[1,j]) if(s!=0 & sign(Bchr[2,j])==s){Bcp[j]=s*min(abs(Bchr[,j]))} } for(j in 1:nbkp) { if(Bcp[j] > 0){CBS=CBS+abs(Bcp[j])*(1-Fup[[c]][j])} if(Bcp[j] < 0){CBS=CBS+abs(Bcp[j])*(1-Fdown[[c]][j])} } Bdiff=Bchr-matrix(rep(Bcp,2),nrow=2,byrow=TRUE) for(i in 1:2) { for(j in 1:nbkp) { if(Bdiff[i,j] > 0){D=D+Bdiff[i,j]*(1-Fup[[c]][j])} if(Bdiff[i,j] < 0){D=D+abs(Bdiff[i,j])*(1-Fdown[[c]][j])} } } } return(list(CBS=CBS,D=D)) } # Input: a 3 x nchr scenario matrix containing segmented profiles of a hypothetical common precursor (line 1) and its two descending tumors (lines 2 and 3) # Action: Evaluate the likelihood of this hypothetical scenario # Output: 0 if the scenario is impossible, 0 otherwise check.CP <- function(scenario) { scenario.segm=B2S(scenario) ok=1 for(k in which(scenario.segm[1,]!=0)) { if(scenario.segm[1,k] < -2 | scenario.segm[1,k] > 1){ok=0;break} # One segment of the common precursor goes out of authorized values if(scenario.segm[1,k]==-2 & sum(scenario.segm[2:3,k]!=0)!=0){ok=0;break} # A segment with status -2 (homozygous deletion) in the common precursor changes of status later } return(ok) } # Input: a 3 x nchr scenario matrix containing segmented profiles of a hypothetical common precursor (line 1) and its two descending tumors (lines 2 and 3) # Action: Evaluate the likelyhood of this hypothetical scenario # Output: the penalty score associated to the scenario distance.scenario <- function(scenario,Bchr,a,b,c,segments.table,Fup,Fdown) { d=0 #breakpoints that will remain in the front after the CP's profile reconstruction fr=rbind(scenario[1,],Bchr[-c(a,b),]) for(j in 1:ncol(fr)) { if(sum(fr[,j] < 0) > 0){d=d+(1-max(abs(fr[fr[,j] < 0,j]))*Fdown[[c]][j])} if(sum(fr[,j] > 0) > 0){d=d+(1-max(fr[fr[,j] > 0,j])*Fup[[c]][j])} } #breakpoints necessary to go from the CP to each tumor spe=scenario[2:3,] bkp=which(spe!=0,arr.ind=TRUE) for(i in 1:nrow(bkp)) { val=spe[bkp[i,1],bkp[i,2]] if(val > 0){F=Fup[[c]]}else{F=Fdown[[c]]} d=d+abs(val)*(1-F[bkp[i,2]]) } return(d) } # Input: the front of the tree, indices of the 2 tumors with the most breakpoints in common, and the segments table # Action: compute the most likely profile of the 2 tumors' common precursor # Output: the common precursor's profile, and the aberrations that occured on each edge CP.reconstruction <- function(front,closest,segments.table,Fup,Fdown) { a=closest[1];b=closest[2] # THE PARTICULAR CASE OF AMPLICONS # Common amplicons and amplicons specific to one sample are stored ac=which(front[a,]==2 & front[b,]==2) aspeA=which(front[a,]==2 & front[b,]!=2) aspeB=which(front[a,]!=2 & front[b,]==2) # Amplicons are then removed from the profile and asigned the value of the closest segment subst=which(front==2,arr.ind=T) if(nrow(subst)!=0){ for(i in 1:nrow(subst)) { x=subst[i,1];y=subst[i,2];before=y;after=y;before.val=NA;after.val=NA if(y > 1){ before=y-1;while(front[x,before]==2 & before > 1){before=before-1} if(segments.table$Chromosome[before]==segments.table$Chromosome[y] & front[x,before]!=2){before.val=front[x,before]}} if(y < ncol(front)){ after=y+1;while(front[x,after]==2 & after < ncol(front)){after=after+1} #recherche du segment suivant non égal à 2 if(segments.table$Chromosome[after]==segments.table$Chromosome[y] & front[x,after]!=2){after.val=front[x,after]}} if(segments.table$Start.position[y]-segments.table$End.position[before] < segments.table$Start.position[after]-segments.table$End.position[y]){front[x,y]=before.val}else{front[x,y]=after.val} if(is.na(front[x,y])){front[x,y]=0} } } # RECONSTRUCTION OF THE COMMON PRECURSOR'S PROFILE CP.profile=c();speA=c();speB=c() for(c in unique(segments.table$Chromosome)) { Schr=as.matrix(front[,segments.table$Chromosome ==c]) nchr=ncol(Schr) Bchr=S2B(Schr) Achr=B2A(Bchr) #Compute the common breakpoints amplitude vector Acp and the common breakpoints Bcp Acp=apply(Achr[closest,],2,min) Bcp=A2B(Acp) #If common breakpoints don't fully define the CP's profile, complete it if(sum(Bcp)!=0) { delta=sum(Bcp) spe=Bchr[closest,]-matrix(rep(Bcp,2),nrow=2,byrow=TRUE) poss=unique(c(which(sign(spe)==-sign(delta),arr.ind=T)[,2],which(sign(Bcp)==delta))) if(length(poss) > 1){poss=combn(poss,abs(delta))}else{poss=matrix(poss,nrow=1)} #Verify each possibility gives a realistic scenario and compute the overall distance associated to each scen.OK=numeric(ncol(poss));scen.dist=numeric(ncol(poss)) for(i in 1:ncol(poss)) { bkphyp=Bcp bkphyp[poss[,i]]=bkphyp[poss[,i]]-sign(delta) scenario=rbind(bkphyp,Bchr[c(a,b),]-matrix(rep(bkphyp,2),nrow=2,byrow=T)) scen.OK[i]=check.CP(scenario) scen.dist[i]=distance.scenario(scenario,Bchr,a,b,c,segments.table,Fup,Fdown) } #Keep the possible scenario that minimizes the distance keep=which(scen.OK==1)[which.min(scen.dist[scen.OK==1])] Bcp[poss[,keep]]=Bcp[poss[,keep]]-sign(delta) } spe=Bchr[closest,]-matrix(rep(Bcp,2),nrow=2,byrow=TRUE) #Go back to segments Scp=B2S(Bcp) CP.profile=c(CP.profile,Scp) speA=c(speA,B2S(spe[1,])) speB=c(speB,B2S(spe[2,])) } CP.profile[ac]=2;speA[aspeA]=2;speB[aspeB]=2 list(CPprof=CP.profile,indiv1=speA,indiv2=speB) } # Input: indices a and b of 2 segments # Action: formats the interval between segments a and b in a compact way # Output: a character string representing the genomic interval between segments a and b format.cytoband=function(a,b,segments.table) { chr.a=segments.table$Chromosome[a] chr.b=segments.table$Chromosome[b] cyto.a=as.character(segments.table$Start.cytoband[a]) cyto.b=as.character(segments.table$End.cytoband[b]) tmp=unlist(strsplit(cyto.a,""));arm.a=tmp[grep("[p,q]",tmp)] tmp=unlist(strsplit(cyto.b,""));arm.b=tmp[grep("[p,q]",tmp)] if(a > 1){ chr.bef=segments.table$Chromosome[a-1] tmp=unlist(strsplit(as.character(segments.table$End.cytoband[a-1]),""));arm.bef=tmp[grep("[p,q]",tmp)]}else{chr.bef=0;arm.bef=""} if(b < nrow(segments.table)){ chr.aft=segments.table$Chromosome[b+1] tmp=unlist(strsplit(as.character(segments.table$Start.cytoband[b+1]),""));arm.aft=tmp[grep("[p,q]",tmp)]}else{chr.aft=0;arm.aft=""} if(chr.bef!=chr.a & chr.aft!=chr.b){ # the aberration spans a whole chromosome if(chr.a==23){chr.a="X"};if(chr.a==24){chr.a="Y"};return(as.character(chr.a))}else{ if((chr.bef!=chr.a | arm.bef!=arm.a) & (chr.aft!=chr.b | arm.aft!=arm.b)){ # the aberration spans a whole chromosome arm return(paste(chr.a,arm.a,sep="")) }else{ aband=unlist(strsplit(cyto.a,"\\."))[1] bband=unlist(strsplit(cyto.b,"\\."))[1] if(aband==bband){return(aband)}else{ bband=unlist(strsplit(bband,"[p,q]"))[2] if(arm.a!=arm.b){bband=paste(arm.b,bband,sep="")} return(paste(aband,bband,sep="-")) } } } } # Input: a vector ev of the segments modifications that occur on an edge # Action: format the events associated to the edge # Output: a list of chromosomal events that occur on the edge format.events <- function(ev,segments.table) { cytostart=as.character(segments.table$Start.cytoband);cytoend=as.character(segments.table$End.cytoband) aber=matrix(nrow=0,ncol=3,dimnames=list(c(),c("Start","End","Type"))) for(c in unique(segments.table$Chromosome)) { g=as.matrix(intervals(which(ev > 0 & segments.table$Chromosome==c)));if(nrow(g) > 0){g=cbind(g,rep(1,nrow(g)))}else{g=matrix(nrow=0,ncol=3)} a=as.matrix(intervals(which(ev==2 & segments.table$Chromosome==c)));if(nrow(a) > 0){a=cbind(a,rep(2,nrow(a)))}else{a=matrix(nrow=0,ncol=3)} l=as.matrix(intervals(which(ev < 0 & segments.table$Chromosome==c)));if(nrow(l) > 0){l=cbind(l,rep(-1,nrow(l)))}else{l=matrix(nrow=0,ncol=3)} d=as.matrix(intervals(which(ev==-2 & segments.table$Chromosome==c)));if(nrow(d) > 0){d=cbind(d,rep(-2,nrow(d)))}else{d=matrix(nrow=0,ncol=3)} rmg=numeric(0);rml=numeric(0) if(nrow(a) > 0){ for(i in 1:nrow(g)){if(sum(a[,1]==g[i,1] & a[,2]==g[i,2])!=0){rmg=c(rmg,i)}} } if(nrow(d) > 0){ for(i in 1:nrow(l)){if(sum(d[,1]==l[i,1] & d[,2]==l[i,2])!=0){rml=c(rml,i)}} } if(length(rmg) > 0){g=g[-rmg,]};if(length(rml) > 0){l=l[-rml,]} aber=rbind(aber,g,a,l,d) } listev.segments=listev.cyto=character(0) if(nrow(aber) > 0) { for(i in 1:nrow(aber)) { if(aber[i,3]==1){signe="+"}else{if(aber[i,3]==-1){signe="-"}else{if(aber[i,3]==2){signe="++"}else{if(aber[i,3]==-2){signe="--"}}}} if(cytostart[aber[i,1]]!=cytoend[aber[i,2]]){listev.cyto=c(listev.cyto,paste(signe,format.cytoband(aber[i,1],aber[i,2],segments.table),sep=""))}else{listev.cyto=c(listev.cyto,paste(signe,unlist(strsplit(cytostart[aber[i,1]],"\\."))[1],sep=""))} if(aber[i,1]!=aber[i,2]){listev.segments=c(listev.segments,paste(signe,paste(segments.table$Segment.ID[aber[i,1]],segments.table$Segment.ID[aber[i,2]],sep=":"),sep=""))}else{listev.segments=c(listev.segments,paste(signe,segments.table$Segment.ID[aber[i,1]],sep=""))} } } list(segments=listev.segments,cyto=listev.cyto) } # Input: the matrix of segmented profiles S and the table describing segments # Action: reconstructs the tumors'tree and the sequence of chromosomal aberrations involved # Output: a list of 3 vectors: the nodes, the edges, and the events associated to each edge tree.reconstruction=function(S,segments.table,Fup,Fdown) { # Initiation Nodes=list();for(i in 1:nrow(S)){Nodes[[i]]=S[i,]};names(Nodes)=rownames(S) Edges=data.frame(0,0);colnames(Edges)=c("from","to") Events.cyto=Events.segments=list() left=1:length(Nodes) # indices of nodes that are in the front ncp=0 # number of common precursors inferred while(length(left) > 1) { # Find the two nodes in the front with the most breakpoints in common Front=Nodes[left] nn=length(Front) C=matrix(nrow=nn,ncol=nn);D=matrix(nrow=nn,ncol=nn) for(i in 1:(nn-1)) { for(j in (i+1):nn) { idscore=identity.score(Front[[i]],Front[[j]],segments.table,Fup,Fdown) C[i,j]=idscore$CBS D[i,j]=idscore$D } } closest=which(C==max(C,na.rm=TRUE),arr.ind=TRUE)[which.min(D[which.max(C)]),] # Reconstruct their common precursor's profile cp=CP.reconstruction(t(as.matrix(as.data.frame(Front))),closest,segments.table,Fup,Fdown) # Add the node, edges and events to the tree if(setequal(cp$indiv1,0)){ # if one of the node is the common precursor, draw an edge from it to the other node Edges=rbind(Edges,match(names(Front)[closest],names(Nodes))) formatted.events=format.events(cp$indiv2,segments.table) Events.cyto=append(Events.cyto,list(formatted.events$cyto)) Events.segments=append(Events.segments,list(formatted.events$segments)) }else{if(setequal(cp$indiv2,0)){ Edges=rbind(Edges,match(names(Front)[closest[2:1]],names(Nodes))) formatted.events=format.events(cp$indiv1,segments.table) Events.cyto=append(Events.cyto,list(formatted.events$cyto)) Events.segments=append(Events.segments,list(formatted.events$segments)) }else{ Nodes[[length(Nodes)+1]]=cp$CPprof ncp=ncp+1;names(Nodes)[length(Nodes)]=ncp Edges=rbind(Edges,c(length(Nodes),which(names(Nodes)==names(Front)[closest[1]]))) formatted.events=format.events(cp$indiv1,segments.table) Events.cyto=append(Events.cyto,list(formatted.events$cyto)) Events.segments=append(Events.segments,list(formatted.events$segments)) Edges=rbind(Edges,c(length(Nodes),which(names(Nodes)==names(Front)[closest[2]]))) formatted.events=format.events(cp$indiv2,segments.table) Events.cyto=append(Events.cyto,list(formatted.events$cyto)) Events.segments=append(Events.segments,list(formatted.events$segments)) } } left=setdiff((1:length(Nodes)),Edges$to) } if(ncp > 1){names(Nodes)[(length(Nodes)):(length(Nodes)-ncp+1)]=c("Common precursor",paste("Common precursor",2:ncp))}else{if(ncp==1){names(Nodes)[length(Nodes)]="Common precursor"}} # Add the "normal tissue" node if(setequal(unique(Nodes[[length(Nodes)]]),0)){names(Nodes)[length(Nodes)]="Normal tissue"}else{ Nodes=append(list(root=numeric(ncol(S))),Nodes);Edges=Edges+1;names(Nodes)[1]="Normal tissue" LCA=setdiff(1:length(Nodes),Edges$to) Edges=rbind(Edges,c(1,LCA)) formatted.events=format.events(Nodes[[LCA]],segments.table) Events.cyto=append(Events.cyto,list(formatted.events$cyto)) Events.segments=append(Events.segments,list(formatted.events$segments)) } # Format edges as a list Edges.list=list() for(i in 2:nrow(Edges)) { Edges.list=append(Edges.list,list(c(from=Edges[i,1],to=Edges[i,2]))) } list(nodes=Nodes, edges=Edges.list, events.cyto=Events.cyto, events.segments=Events.segments) } #################################################################### ############# EXPORT RESULTS ############# #################################################################### export2dot=function(tumors.tree, dot.file.segments, dot.file.cyto) { # Export the tree with cytoband labels unlink(dot.file.cyto) sink(dot.file.cyto) Edges=tumors.tree$edges Nodes=tumors.tree$nodes Events.cyto=tumors.tree$events.cyto cat("digraph G { \n") cat("\t ratio=fill\n") cat("\t size=\"20,20\"\n") cat("\t","node[fontsize=100,style=\"setlinewidth(8)\"]") n=length(Edges) for (i in 1:n) { from=Nodes[Edges[[i]]["from"]] labfrom=names(from) to=Nodes[Edges[[i]]["to"]] labto=names(to) evt=Events.cyto[[i]] # Do a new line for edge labels every 4 aberrations labevt=character(0) maxline=40 rest=evt while(length(rest)!=0) { eol=max(which(cumsum(nchar(rest)) <= maxline)) labevt=paste(c(labevt,paste(rest[1:eol],collapse=","),"\\l"),collapse="") if(eol!=length(rest)){rest=rest[(eol+1):length(rest)]}else{rest=c()} } if (length(labevt)==0) labevt="No event" cat ("\t \"",labfrom, "\"->\"", labto, "\"[label=\"",labevt,"\"",",fontsize=100,style=\"setlinewidth(8)\",arrowsize=2","]\n", sep="") } cat("}\n") sink() # Export the tree with segments labels unlink(dot.file.segments) sink(dot.file.segments) Edges=tumors.tree$edges Nodes=tumors.tree$nodes Events.segments=tumors.tree$events.segments cat("digraph G { \n") cat("\t ratio=fill\n") cat("\t size=\"20,20\"\n") cat("\t","node[fontsize=100,style=\"setlinewidth(8)\"]") n=length(Edges) for (i in 1:n) { from=Nodes[Edges[[i]]["from"]] labfrom=names(from) to=Nodes[Edges[[i]]["to"]] labto=names(to) evt=Events.segments[[i]] # Do a new line for edge labels every 4 aberrations labevt=character(0) maxline=40 rest=evt while(length(rest)!=0) { eol=max(which(cumsum(nchar(rest)) <= maxline)) labevt=paste(c(labevt,paste(rest[1:eol],collapse=","),"\\l"),collapse="") if(eol!=length(rest)){rest=rest[(eol+1):length(rest)]}else{rest=c()} } if (length(labevt)==0) labevt="No event" cat ("\t \"",labfrom, "\"->\"", labto, "\"[label=\"",labevt,"\"",",fontsize=100,style=\"setlinewidth(8)\",arrowsize=2","]\n", sep="") } cat("}\n") sink() } plotbkp=function(tumors.tree,segments.table,chr,arm,cyto,P.val,P,bkp.file) { # Find cytoband coordinates tmp=unlist(strsplit(unique(cyto),split="\\.")) cytolab=unique(tmp[grep("[p,q]",tmp)]) tel=union(grep("ptel",cytolab),grep("qtel",cytolab)) if(length(tel) > 0){cytolab=cytolab[-tel]} ncyto=length(cytolab);colcyto=rep(c("white","grey"),ncyto) cytodeb=cytofin=numeric(ncyto) for(i in 1:ncyto) { int=intervals(grep(paste("^",cytolab[i],sep=""),cyto));if(nrow(int) > 1){print(i)} cytodeb[i]=int$from cytofin[i]=int$to } # Identification of common breakpoints nsample=nrow(P);nprobe=ncol(P) normal.node=which(names(tumors.tree$nodes)=='Normal tissue') normal.edge=which(unlist(tumors.tree$edges)==normal.node) if(length(normal.edge)==1){LCA=unlist(tumors.tree$edges)[normal.edge+1]}else{LCA=normal.node} # index of the last common ancestor v=tumors.tree$nodes[[LCA]] if(setequal(unique(v),0)){combkp.start.visible=combkp.end.visible=combkp.start.inferred=combkp.end.inferred=c()}else{ if(v[1]!=0){start=1;end=1}else{start=c();end=c()} for(i in 2:nrow(segments.table)){if(v[i]!=0){if(v[i]==v[i-1] & segments.table$Chromosome[i]==segments.table$Chromosome[i-1]){end[length(end)]=i}else{start=c(start,i);end=c(end,i)}}} combkp.start=sort(segments.table$Start.probe.number[start]) combkp.end=sort(segments.table$End.probe.number[end]) # Differenciating common breakpoints still visible in the profiles from common breakpoints inferred by TuMult to balance chromosomes combkp.start.visible=combkp.start.inferred=combkp.end.visible=combkp.end.inferred=c() for(k in 1:length(combkp.start)){ if(combkp.start[k]==1 | chr[max(1,combkp.start[k]-1)]!=chr[combkp.start[k]]){bef=rep(0,nsample)}else{bef=P[,combkp.start[k]-1]} if(abs(sum(sign(P[,combkp.start[k]]-bef)))==nsample){combkp.start.visible=c(combkp.start.visible,combkp.start[k])}else{combkp.start.inferred=c(combkp.start.inferred,combkp.start[k])}} for(k in 1:length(combkp.end)){ if(combkp.end[k]==nprobe | chr[min(nprobe,combkp.end[k]+1)]!=chr[combkp.end[k]]){aft=rep(0,nsample)}else{aft=P[,combkp.end[k]+1]} if(abs(sum(sign(aft-P[,combkp.end[k]])))==nsample){combkp.end.visible=c(combkp.end.visible,combkp.end[k])}else{combkp.end.inferred=c(combkp.end.inferred,combkp.end[k])}} } pdf(bkp.file,width=12,height=8) # Page 1: Entire profiles combkp.visible=c(combkp.start.visible-0.5,combkp.end.visible+0.5);combkp.inferred=c(combkp.start.inferred-0.5,combkp.end.inferred+0.5) M=max(abs(P.val),na.rm=TRUE);my.ylim=c(-M,M) mypch="." cx=1;cx.points=5*nsample/log10(nprobe);ld=1;colchrom=rep(c("white","grey"),12) m=matrix(1:nsample,nsample) layout(m) par(mai=c(0,par()$mai[2],0,par()$mai[4])) mycol=c("blue","green","yellow","red","purple")[P[1,]+3] plot(P.val[1,],col=mycol,pch=mypch,xlab="",ylab=rownames(P)[1],xaxt="n",cex.axis=cx,cex.lab=cx,cex=cx.points,ylim=my.ylim,bty="n",las=1) for(c in unique(chr)) { p.arm=intervals(which(chr==c & arm=="p"));q.arm=intervals(which(chr==c & arm=="q")) if(nrow(p.arm)!=0){rect(p.arm$from,my.ylim[1],p.arm$to,my.ylim[1]+0.2,col=colchrom[c])} if(nrow(q.arm)!=0){rect(q.arm$from,my.ylim[1],q.arm$to,my.ylim[1]+0.2,col=colchrom[c])} } affchrom(chr,my.ylim[2]-0.1,cx) if(length(combkp.visible) > 0){for(k in 1:length(combkp.visible)){segments(combkp.visible[k],par("usr")[3],combkp.visible[k],my.ylim[2]-0.3,lwd=ld,lty=2)}} if(length(combkp.inferred) > 0){for(k in 1:length(combkp.inferred)){segments(combkp.inferred[k],par("usr")[3],combkp.inferred[k],my.ylim[2]-0.3,lwd=ld,lty=3)}} if(nsample > 2){ for(i in 2:(nsample-1)) { mycol=c("blue","green","yellow","red","purple")[P[i,]+3] plot(P.val[i,],col=mycol,pch=mypch,xlab="",ylab=rownames(P)[i],xaxt="n",cex.axis=cx,cex.lab=cx,cex=cx.points,ylim=my.ylim,bty="n",las=1) for(c in unique(chr)) { p.arm=intervals(which(chr==c & arm=="p"));q.arm=intervals(which(chr==c & arm=="q")) if(nrow(p.arm)!=0){rect(p.arm$from,my.ylim[1],p.arm$to,my.ylim[1]+0.2,col=colchrom[c])} if(nrow(q.arm)!=0){rect(q.arm$from,my.ylim[1],q.arm$to,my.ylim[1]+0.2,col=colchrom[c])} } affchrom(chr,my.ylim[1]+0.4,cx,texte=FALSE) abline(v=combkp.visible,lwd=ld,lty=2);abline(v=combkp.inferred,lwd=ld,lty=3) } } mycol=c("blue","green","yellow","red","purple")[P[nsample,]+3] plot(P.val[nsample,],col=mycol,pch=mypch,xlab="",ylab=rownames(P)[nsample],xaxt="n",cex.axis=cx,cex.lab=cx,cex=cx.points,ylim=my.ylim,bty="n",las=1) for(c in unique(chr)) { p.arm=intervals(which(chr==c & arm=="p"));q.arm=intervals(which(chr==c & arm=="q")) if(nrow(p.arm)!=0){rect(p.arm$from,my.ylim[1],p.arm$to,my.ylim[1]+0.2,col=colchrom[c])} if(nrow(q.arm)!=0){rect(q.arm$from,my.ylim[1],q.arm$to,my.ylim[1]+0.2,col=colchrom[c])} } affchrom(chr,my.ylim[1]-0.4,cx,texte=FALSE) if(length(combkp.visible) > 0){for(k in 1:length(combkp.visible)){segments(combkp.visible[k],my.ylim[1]+0.2,combkp.visible[k],par("usr")[4],lwd=ld,lty=2)}} if(length(combkp.inferred) > 0){for(k in 1:length(combkp.inferred)){segments(combkp.inferred[k],my.ylim[1]+0.2,combkp.inferred[k],par("usr")[4],lwd=ld,lty=3)}} # One page per chromosome for(c in unique(chr)) { sel=which(chr==c);P.chr=P[,sel];P.val.chr=P.val[,sel] cx=1;cx.points=5*nrow(P.chr)/log10(ncol(P.chr));ld=1;colchrom=rep(c("white","grey"),12) m=matrix(1:nrow(P.chr),nrow(P.chr)) layout(m) par(mai=c(0,par()$mai[2],0,par()$mai[4])) mycol=c("blue","green","yellow","red","purple")[P.chr[1,]+3] plot(P.val.chr[1,],col=mycol,pch=mypch,xlab="",ylab=rownames(P.chr)[1],xaxt="n",cex.axis=cx,cex.lab=cx,cex=cx.points,ylim=my.ylim,bty="n",las=1) cytochrom=which(cytodeb%in%sel) for(k in cytochrom){rect(match(cytodeb[k],sel)-0.5,my.ylim[1],match(cytofin[k],sel)+0.5,my.ylim[1]+0.3,col=colcyto[k])} axis(side=1,at=apply(cbind(match(cytodeb[cytochrom],sel),match(cytofin[cytochrom],sel)),1,mean),labels=cytolab[cytochrom],pos=my.ylim[1],las=3,cex.axis=0.8*cx) centro=match(which(chr==c & arm=="q")[1],sel)-0.5;arrows(centro,my.ylim[1],centro,my.ylim[1]+0.8,lwd=2*ld,angle=120,length=.15) lab=c(1:22,"X","Y")[c];title(paste("Chromosome",lab),line=-1) combkp.visible=c(match(combkp.start.visible,sel)-0.5,match(combkp.end.visible,sel)+0.5) combkp.inferred=c(match(combkp.start.inferred,sel)-0.5,match(combkp.end.inferred,sel)+0.5) if(length(combkp.visible) > 0){for(k in 1:length(combkp.visible)){segments(combkp.visible[k],par("usr")[3],combkp.visible[k],my.ylim[2]-0.3,lwd=ld,lty=2)}} if(length(combkp.inferred) > 0){for(k in 1:length(combkp.inferred)){segments(combkp.inferred[k],par("usr")[3],combkp.inferred[k],my.ylim[2]-0.3,lwd=ld,lty=3)}} if(nsample > 2){ for(i in 2:(nsample-1)) { mycol=c("blue","green","yellow","red","purple")[P.chr[i,]+3] plot(P.val.chr[i,],col=mycol,pch=mypch,xlab="",ylab=rownames(P.chr)[i],xaxt="n",cex.axis=cx,cex.lab=cx,cex=cx.points,ylim=my.ylim,bty="n",las=1) cytochrom=which(cytodeb%in%sel) for(k in cytochrom){rect(match(cytodeb[k],sel)-0.5,my.ylim[1],match(cytofin[k],sel)+0.5,my.ylim[1]+0.3,col=colcyto[k])} axis(side=1,at=apply(cbind(match(cytodeb[cytochrom],sel),match(cytofin[cytochrom],sel)),1,mean),labels=cytolab[cytochrom],pos=my.ylim[1],las=3,cex.axis=0.8*cx) centro=match(which(chr==c & arm=="q")[1],sel)-0.5;arrows(centro,my.ylim[1],centro,my.ylim[1]+0.8,lwd=2*ld,angle=120,length=.15) abline(v=combkp.visible,lwd=ld,lty=2);abline(v=combkp.inferred,lwd=ld,lty=3) } } par(mai=c(0.5,par()$mai[2],0,par()$mai[4])) mycol=c("blue","green","yellow","red","purple")[P.chr[nrow(P.chr),]+3] plot(P.val.chr[nrow(P.chr),],col=mycol,pch=mypch,xlab="",ylab=rownames(P.chr)[nrow(P.chr)],xaxt="n",cex.axis=cx,cex.lab=cx,cex=cx.points,ylim=my.ylim,bty="n",las=1) cytochrom=which(cytodeb%in%sel) for(k in cytochrom){rect(match(cytodeb[k],sel)-0.5,my.ylim[1],match(cytofin[k],sel)+0.5,my.ylim[1]+0.3,col=colcyto[k])} axis(side=1,at=apply(cbind(match(cytodeb[cytochrom],sel),match(cytofin[cytochrom],sel)),1,mean),labels=cytolab[cytochrom],pos=my.ylim[1],las=3,cex.axis=0.8*cx) centro=match(which(chr==c & arm=="q")[1],sel)-0.5;arrows(centro,my.ylim[1],centro,my.ylim[1]+0.8,lwd=2*ld,angle=120,length=.15) if(length(combkp.visible) > 0){for(k in 1:length(combkp.visible)){segments(combkp.visible[k],my.ylim[1]+0.2,combkp.visible[k],par("usr")[4],lwd=ld,lty=2)}} if(length(combkp.inferred) > 0){for(k in 1:length(combkp.inferred)){segments(combkp.inferred[k],my.ylim[1]+0.2,combkp.inferred[k],par("usr")[4],lwd=ld,lty=3)}} } dev.off() } #################################################################### ############## MAIN PROGRAM ############## #################################################################### # Read probes information print("Loading the probes table...") probes.table=read.csv(Probes.file,as.is=TRUE,fill=FALSE) chr=probes.table$Chromosome;lobound=probes.table$StartPosition;upbound=probes.table$EndPosition;pos=probes.table$Position;name=probes.table$Name chr[chr=="X"]=23;chr[chr=="Y"]=24;chr=as.numeric(chr) if(is.null(pos)){pos=(lobound+upbound)/2} cyto=probes.table[,grep("Cytoband",colnames(probes.table))[1]] arm=character(nrow(probes.table)) arm[grep("p",cyto)]="p";arm[grep("q",cyto)]="q" if("StartCytoband"%in%colnames(probes.table) & "StartCytoband"%in%colnames(probes.table)){cytostart=probes.table$StartCytoband;cytoend=probes.table$EndCytoband}else{cytostart=cytoend=cyto} print(paste("->","Table of",nrow(probes.table),"probes read.")) # Load the reference dataset print("Loading the reference data set...") load(Reference.file) nref=nrow(ref) print(paste("->",nref,"reference profiles read.")) # Output files pat=unlist(strsplit(Prof.file,split=".csv"))[1] dot.file.segments=paste(pat,"_tree_segments.dot",sep="") dot.file.cyto=paste(pat,"_tree_cytobands.dot",sep="") segments.file=paste(pat,"_segments.csv",sep="") tree.file=paste(pat,"_tree.Rdata",sep="") bkp.file=paste(pat,"_common_breakpoints.pdf",sep="") # Read the tumors profile table print("Loading the tumors profiles...") profile=as.matrix(read.csv(Prof.file,check.names=FALSE,fill=FALSE)) P=profile[,grep("status",colnames(profile))] P.val=profile[,grep("value",colnames(profile))] sample.names=unlist(strsplit(colnames(P),".status")) colnames(P)=sample.names;colnames(P.val)=sample.names for(i in 1:ncol(P)){P[,i]=as.numeric(P[,i]);P.val[,i]=as.numeric(P.val[,i])} P=t(P);P.val=t(P.val) # Replace NAs by the value of the closest probe number.NA=sum(is.na(P)) for(i in 1:nrow(P)) { for(j in 1:ncol(P)) { if(is.na(P[i,j])){ x=which(chr==chr[j] & !is.na(P[i,])) P[i,j]=P[i,x[which.min(abs(pos[x]-pos[j]))]] } } } print(paste("->",nrow(P),"profiles read.")) # Merge breakpoints closer than the threshold th.bkp print(paste("Merging breakpoints closer than",th.bkp,"probes...")) P=merge.breakpoints(P,th.bkp,chr) # Segmentation of the tumors profiles print("Delineating homogenous segments...") segments.table=segments.delineation(P,chr,pos,name,cytostart,cytoend,lobound,upbound) S=profile.segmentation(P,segments.table) print(paste("->",nrow(segments.table),"homogenous segments delineated.")) # Generation of the amplitude matrix and calculation of the breakpoints frequencies in the reference data set A=matrix(nrow=nrow(P),ncol=0);F=c();Fup=list();Fdown=list() for(c in unique(chr)) { indc=which(segments.table$Chromosome==c) Schr=as.matrix(S[,indc]) Bchr=S2B(Schr) Achr=B2A(Bchr) A=cbind(A,Achr) Fup[[c]]=sum(ref[,segments.table$Start.probe.number[indc[1]]] > 0,na.rm=TRUE)/nref Fdown[[c]]=sum(ref[,segments.table$Start.probe.number[indc[1]]] < 0,na.rm=TRUE)/nref if(length(indc) > 1) { for(i in 2:length(indc)) { Fup[[c]]=c(Fup[[c]],sum(ref[,segments.table$Start.probe.number[indc[i]]] > ref[,segments.table$End.probe.number[indc[i-1]]],na.rm=TRUE)/nref) Fdown[[c]]=c(Fdown[[c]],sum(ref[,segments.table$Start.probe.number[indc[i]]] < ref[,segments.table$End.probe.number[indc[i-1]]],na.rm=TRUE)/nref) } } Fup[[c]]=c(Fup[[c]],sum(ref[,segments.table$End.probe.number[length(indc)]] < 0,na.rm=TRUE)/nref) Fdown[[c]]=c(Fdown[[c]],sum(ref[,segments.table$End.probe.number[length(indc)]] > 0,na.rm=TRUE)/nref) F=c(F,Fup[[c]],Fdown[[c]]) } # Reconstruct tumors lineage and the sequence of chromosomal aberrations print("Reconstructing the tumor progression tree.") tumors.tree=tree.reconstruction(S,segments.table,Fup,Fdown) print("-> Tumor progression tree reconstructed.") # Export results print("Exporting results.") export2dot(tumors.tree,dot.file.segments,dot.file.cyto) # tree in .dot format write.csv(cbind(Number=1:nrow(segments.table),segments.table),segments.file,row.names=FALSE,quote=FALSE) # segments.table in .csv format save(tumors.tree,file=tree.file) # Rdata of the tumors tree plotbkp(tumors.tree,segments.table,chr,arm,cyto,P.val,P,bkp.file) # Plot of the profiles and the common breakpoints print("Done.")