## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>"
)

## ----echo=TRUE, eval=FALSE----------------------------------------------------
# 
# if(!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install("survClust")

## -----------------------------------------------------------------------------
library(survClust)
library(survival)
library(BiocParallel)

#mutation data
uvm_dat[[1]][1:5,1:5]

#copy number data
uvm_dat[[2]][1:5,1:5]

#TCGA UVM clinical data
head(uvm_survdat)


## ----echo=TRUE, eval=TRUE-----------------------------------------------------

cv_rounds = 10

#function to do cross validation 
uvm_all_cvrounds<-function(kk){
    this.fold<-3
    fit<-list()
    for (i in seq_len(cv_rounds)){
        fit[[i]] <- cv_survclust(uvm_dat,uvm_survdat,kk,this.fold)
        print(paste0("finished ", i, " rounds for k= ", kk))
    }
    return(fit)
}


## ----echo=TRUE, eval=FALSE----------------------------------------------------
# ptm <- Sys.time()
# cv.fit<-bplapply(2:7, uvm_all_cvrounds)
# ptm2 <- Sys.time()
# 
# #> ptm
# #[1] "2022-09-05 20:54:21 EDT"
# #> ptm2
# #[1] "2022-09-05 21:01:12 EDT"
# 
# 

## -----------------------------------------------------------------------------
lapply(uvm_dat, function(x) dim(x))

## -----------------------------------------------------------------------------

#for k=2, 1st round of cross validation
names(uvm_survClust_cv.fit[[1]][[1]])


## ----fig.width=8, fig.height=8, fig.cap= "survClust analysis of TCGA UVM mutation and Copy Number data"----

ss_stats <- getStats(uvm_survClust_cv.fit, kk=7, cvr=10)
plotStats(ss_stats, 2:7)

## ----message=FALSE, fig.cap= "survClust KM analysis for integrated TCGA UVM Mutation and Copy Number for k=4"----

k4 <- cv_voting(uvm_survClust_cv.fit, getDist(uvm_dat, uvm_survdat), pick_k=4)
table(k4)

plot(survfit(Surv(uvm_survdat[,1], uvm_survdat[,2])~k4), mark.time=TRUE, col=1:4)

## -----------------------------------------------------------------------------

mut_k4_test <- apply(uvm_dat[[1]],2,function(x) fisher.test(x,k4)$p.value)
head(sort(p.adjust(mut_k4_test)))

## ----echo = FALSE, fig.cap="TCGA UVM mutation features for k=4"---------------
htmltools::img(src = knitr::image_uri("uvm_mut_example.png"), 
               style = 'margin-left: auto;margin-right: auto')

## ----fig.width=5, fig.height=6, fig.cap= "TCGA UVM Copy Number k=4"-----------

cn_imagedata <- uvm_dat[[2]]
cn_imagedata[cn_imagedata < -1.5] <- -1.5
cn_imagedata[cn_imagedata > 1.5] <- 1.5

oo <- order(k4)
cn_imagedata <- cn_imagedata[oo,]
cn_imagedata <- cn_imagedata[,ncol(cn_imagedata):1]
#image(cn_imagedata,col=gplots::bluered(50),axes=F)

#image y labels - chr names
cnames <- colnames(cn_imagedata)
cnames <- unlist(lapply(strsplit(cnames, "\\."), function(x) x[1]))
tt <- table(cnames)
nn <- paste0("chr",1:22)

chr.labels <- rep(NA, length(cnames))

index <- 1
chr.labels[1] <- "1"

for(i in seq_len(length(nn)-1)) {
    index <- index + tt[nn[i]]
    chr.labels[index] <- gsub("chr","",nn[i+1])
}

idx <- which(!(is.na(chr.labels)))

image(cn_imagedata,col=gplots::bluered(50),axes=FALSE)

axis(2, at = 1 - (idx/length(cnames)), labels = chr.labels[idx], las=1, cex.axis=0.8)
abline(v = c(cumsum(prop.table(table(k4)))))
abline(h=c(0,1))

## -----------------------------------------------------------------------------

#function to do cross validation 
sim_cvrounds<-function(kk){
    this.fold<-3
    fit<-list()
    for (i in seq_len(cv_rounds)){
        fit[[i]] <- cv_survclust(simdat, simsurvdat,kk,this.fold)
        print(paste0("finished ", i, " rounds for k= ", kk))
    }
    return(fit)
}


ptm <- Sys.time()
sim_cv.fit<-bplapply(2:7, sim_cvrounds)
ptm2 <- Sys.time()

ptm
ptm2

## ----fig.width=8, fig.height=8, fig.cap= "survClust analysis of simulated dataset"----

ss_stats <- getStats(sim_cv.fit, kk=7, cvr=10)
plotStats(ss_stats, 2:7)

## ----message=FALSE, fig.cap= "survClust k=3 class labels KM analysis for simulated dataset "----

k3 <- cv_voting(sim_cv.fit, getDist(simdat, simsurvdat), pick_k=3)

sim_class_labels <- c(rep(1, 50), rep(2,50), rep(3,50))

table(k3, sim_class_labels)

plot(survfit(Surv(simsurvdat[,1], simsurvdat[,2]) ~ k3), mark.time=TRUE, col=1:3)

## -----------------------------------------------------------------------------

#function to do cross validation 
cvrounds_mut <- function(kk){
    this.fold<-3
    fit<-list()
    for (i in seq_len(cv_rounds)){
        fit[[i]] <- cv_survclust(uvm_mut_dat, uvm_survdat,kk,this.fold, type="mut")
        print(paste0("finished ", i, " rounds for k= ", kk))
    }
    return(fit)
}

#let's create a list object with just the mutation data 
uvm_mut_dat <- list()
uvm_mut_dat[[1]] <- uvm_dat[[1]]

ptm <- Sys.time()
uvm_mut_cv.fit<-bplapply(2:7, cvrounds_mut)
ptm2 <- Sys.time()


## ----fig.width=8, fig.height=8, fig.cap= "survClust analysis of TCGA UVM mutation data alone"----

ss_stats <- getStats(uvm_mut_cv.fit, kk=7, cvr=10)
plotStats(ss_stats, 2:7)

## ----fig.width=4, fig.height=4, message=FALSE, fig.cap= "survClust k=3 class labels KM analysis for TCGA UVM mutation data alone"----

k4 <- cv_voting(uvm_mut_cv.fit, getDist(uvm_mut_dat, uvm_survdat), pick_k=4)
plot(survfit(Surv(uvm_survdat[,1], uvm_survdat[,2]) ~ k4), mark.time=TRUE, col=2:5)

## -----------------------------------------------------------------------------

mut_k4_test <- apply(uvm_mut_dat[[1]],2,function(x) fisher.test(x,k4)$p.value)
head(sort(p.adjust(mut_k4_test)))

## ----eval=FALSE, echo=TRUE----------------------------------------------------
# 
# # DO NOT RUN. Use provided dataset
# #Process mutation maf data
# #Download data from - https://gdc.cancer.gov/about-data/publications/pancanatlas
# 
# maf <- data.table::fread("mc3.v0.2.8.PUBLIC.maf.gz", header = TRUE)
# maf_filter <- maf %>% filter(FILTER == "PASS",
#                             Variant_Classification != "Silent")
# 
# # few lines of code in tidyR to convert maf to a binary file
# maf_binary <- maf_filter %>%
#     select(Tumor_Sample_Barcode, Hugo_Symbol) %>%
#     distinct() %>%
#     pivot_wider(names_from = "Hugo_Symbol",
#                 values_from = 'Hugo_Symbol',
#                 values_fill = 0, values_fn = function(x) 1)
# 
# maf_binary$tcga_short <- substr(maf_binary$Tumor_Sample_Barcode, 1, 12)
# 
# # Process clinical file
# tcga_clin <- readxl::read_excel("TCGA-CDR-SupplementalTableS1.xlsx", sheet=1, col_names = TRUE)
# 
# uvm_clin <- tcga_clin %>% filter(type == "UVM")
# uvm_maf_binary <- maf_binary %>%
#     filter(tcga_short %in% uvm_clin$bcr_patient_barcode) %>%
#     select(-Tumor_Sample_Barcode)
# rnames <- uvm_maf_binary$tcga_short
# 
# uvm_maf <- uvm_maf_binary %>% select(-tcga_short) %>%
#     apply(., 2, as.numeric)
# 
# # Remove singletons
# gene_sum <- apply(uvm_maf,2,sum)
# idx <- which(gene_sum > 1)
# 
# uvm_maf <- uvm_maf[,idx]
# rownames(uvm_maf) <- rnames
# 
# 
# uvm_survdat <- uvm_clin %>% select(OS.time, OS) %>%
#     apply(., 2, as.numeric)
# 
# rownames(uvm_survdat) <- uvm_clin$bcr_patient_barcode
# 
# # process CN
# library(cluster)#pam function for derive medoid
# library(GenomicRanges) #interval overlap to remove CNV
# library(iClusterPlus)
# 
# seg <- read.delim(file="broad.mit.edu_PANCAN_Genome_Wide_SNP_6_whitelisted.seg", header=TRUE,sep="\t", as.is=TRUE)
# 
# pp <- substr(seg$Sample,13,16)
# seg.idx <- c(grep("-01A",pp),grep("-01B",pp),grep("-03A",pp))
# 
# #only take tumors
# seg.idx <- c(grep("-01A",pp),grep("-01B",pp))
# seg <- seg[seg.idx,]
# 
# seg$Sample <- substr(seg[,1],1,12)
# 
# uvm_seg <- seg[seg$Sample %in% uvm_clin$bcr_patient_barcode,]
# 
# colnames(uvm_seg) <- c("Sample", "Chromosome", "Start", "End", "Num_Probes", "Segment_Mean")
# 
# # pass epsilon as 0.001 default or user
# reducedMseg <- CNregions(ss_seg,epsilon=0.001,adaptive=FALSE,rmCNV=FALSE, cnv=NULL, frac.overlap=0.5, rmSmallseg=TRUE, nProbes=75)
# 
# uvm_dat <- list(uvm_mut = uvm_maf, uvm_cn = uvm_seg)
# 

## ----echo=TRUE, eval=FALSE----------------------------------------------------
# set.seed(112)
# n1 <- 50 #class1
# n2 <- 50 #class2
# n3 <- 50 #class3
# n <- n1+n2+n3
# p <- 15 #survival related features (10%)
# q <- 120 #noise
# 
# #class1 ~ N(1.5,1), class2 ~ N(0,1), class3 ~ N(-1.5,1)
# 
# sample_names <- paste0("S",1:n)
# feature_names <- paste0("features", 1:n)
# 
# #final matrix
# x_big <- NULL
# 
# ################
# # sample 15 informant features
# 
# #simulating class1
# x1a <- matrix(rnorm(n1*p, 1.5, 1), ncol=p)
# 
# #simulating class2
# x2a <- matrix(rnorm(n2*p), ncol=p)
# 
# 
# #simulating class3
# x3a <- matrix(rnorm(n3*p, -1.5,1), ncol=p)
# 
# #this concluded that part shaded in red of the matrix -
# #corresponding to related to survival and molecularly distinct
# xa <- rbind(x1a,x2a,x3a)
# 
# ################
# # sample 15 other informant features, but scramble them.
# 
# permute.idx<-sample(1:length(sample_names),length(sample_names))
# 
# x1b <- matrix(rnorm(n1*p, 1.5, 1), ncol=p)
# x2b <- matrix(rnorm(n2*p), ncol=p)
# x3b <- matrix(rnorm(n3*p, -1.5,1), ncol=p)
# 
# #this concluded that part shaded in blue of the matrix -
# #containing the molecular distinct features but not related to survival
# xb <- rbind(x1b,x2b,x3b)
# 
# 
# #this concludes the area shaded area in grey which corresponds to noise
# xc <- matrix(rnorm(n*q), ncol=q)
# 
# x_big <- cbind(xa,xb[permute.idx,], xc)
# 
# rownames(x_big) <- sample_names
# colnames(x_big) <- feature_names
# simdat <- list()
# simdat[[1]] <- x_big
# 
# #the three classes will have a median survival of 4.5, 3.25 and 2 yrs respectively
# set.seed(112)
# med_surv_class1 <- log(2)/4.5
# med_surv_class2 <- log(2)/3.25
# med_surv_class3 <- log(2)/2
# 
# surv_dist_class1 <- rexp(n1,rate=med_surv_class1)
# censor_events_class1 <- runif(n1,0,10)
# 
# surv_dist_class2 <- rexp(n2,rate=med_surv_class2)
# censor_events_class2 <- runif(n2,0,10)
# 
# surv_dist_class3 <- rexp(n3,rate=med_surv_class3)
# censor_events_class3 <- runif(n3,0,10)
# 
# surv_time_class1 <- pmin(surv_dist_class1,censor_events_class1)
# surv_time_class2 <- pmin(surv_dist_class2,censor_events_class2)
# surv_time_class3 <- pmin(surv_dist_class3,censor_events_class3)
# 
# event <- c((surv_time_class1==surv_dist_class1),
#           (surv_time_class2==surv_dist_class2),
#           (surv_time_class3==surv_dist_class3))
# 
# time <- c(surv_time_class1, surv_time_class2, surv_time_class3)
# 
# survdat <- cbind(time, event)
# 
# simsurvdat <- cbind(time, event)
# rownames(simsurvdat) <- sample_names

## -----------------------------------------------------------------------------
sessionInfo()