## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)

## ----load packages------------------------------------------------------------
suppressPackageStartupMessages({
  library(censcyt)
  library(ggplot2)
  library(SummarizedExperiment)
  library(tidyr)
})

## ----data_simulation covariate------------------------------------------------
set.seed(1234)

nr_samples <- 50
nr_clusters <- 6
# the covariate is simulated from an exponential distribution:
X_true <- rexp(nr_samples)
# the censoring time is also sampled from an exponential distribution:
C <- rexp(nr_samples)
# the actual observed value is the minimum of the two:
X_obs <- pmin(X_true,C)
# additionally, we have the event indicator:
Event_ind <- ifelse(X_true>C,0,1)
# proportion censored:
(length(Event_ind)-sum(Event_ind))/length(Event_ind)

## ----data_simulation multicluster---------------------------------------------
# number of cells per sample
sizes <- round(runif(nr_samples,1e3,1e4))
# alpha parameters of the dirichlet-multinomial distribution. 
alphas <- runif(nr_clusters,1,10)
# main simulation function
simulation_output <- simulate_multicluster(alphas = alphas,
                                           sizes = sizes,
                                           covariate = X_obs,
                                           nr_diff = 2,
                                           return_summarized_experiment = TRUE,
                                           slope = list(0.9))
# counts as a SummarizedExperiment object
d_counts <- simulation_output$counts

## ----diffplot, fig.cap="Relative cell population abundance vs. survival time (X_obs). Event_ind indicates if X_obs is censored or observed.", fig.wide=TRUE----

# vector indicating if a cluster has a modeled association or not
is_diff_cluster <- ifelse(is.na(simulation_output$row_data$paired),FALSE,TRUE)
# first convert to proportions:
proportion <- as.data.frame(t(apply(assay(d_counts),2,function(x)x/sum(x))))
names(proportion) <- paste0("Nr: ",names(proportion), " - ", 
                            ifelse(is_diff_cluster,"DA","non DA"))
# then to long format for plotting
counts_long <-
  pivot_longer(proportion, 
               cols= seq_len(nr_clusters),
               names_to = "cluster_id",
               values_to = "proportion")
# add observed (partly censored) variable
counts_long$X_obs <- rep(X_obs,each=nr_clusters)
# add event indicator
counts_long$Event_ind <- factor(rep(Event_ind,each=nr_clusters),
                                levels = c(0,1),
                                labels=c("censored","observed"))

ggplot(counts_long) +
  geom_point(aes(x=X_obs,y=proportion,color=Event_ind)) +
  facet_wrap(~cluster_id)

## ----experiment_info----------------------------------------------------------
# all covariates and sample ids
experiment_info <- data.frame(sample_id = seq_len(nr_samples),
                              X_obs = X_obs, 
                              Event_ind = Event_ind)

## ----formula_contrast---------------------------------------------------------
da_formula <- createFormula(experiment_info = experiment_info,
                            cols_fixed = "X_obs",
                            cols_random = "sample_id",
                            event_indicator = "Event_ind")
# also create contrast matrix for testing
contrast <- matrix(c(0,1))

## ----differential_testing, message=TRUE---------------------------------------
# test with 50 repetitions with method risk set imputation (rs)
da_out <- testDA_censoredGLMM(d_counts = d_counts,
                              formula = da_formula,
                              contrast = contrast,
                              mi_reps = 30,
                              imputation_method = "km",
                              verbose = TRUE,
                              BPPARAM=BiocParallel::MulticoreParam(workers = 1))

## ----differential_testing evaluation, message=TRUE----------------------------
topTable(da_out)

# compare to actual differential clusters:
which(!is.na(simulation_output$row_data$paired))

## ----wrapper function create data---------------------------------------------
# Function to create expression data (one sample)
fcs_sim <- function(n = 2000, mean = 0, sd = 1, ncol = 10, cof = 5) {
  d <- matrix(sinh(rnorm(n*ncol, mean, sd)) * cof,ncol=ncol)
  # each marker is heavily expressed in one population, i.e. this should result 
  # in 'ncol' clusters
  for(i in seq_len(ncol)){
    d[seq(n/ncol*(i-1)+1,n/ncol*(i)),i] <- 
      sinh(rnorm(n/ncol, mean+2, sd)) * cof
  }
  colnames(d) <- paste0("marker", sprintf("%02d", 1:ncol))
  d
}

# create multiple expression data samples with DA signal
comb_sim <- function(X, nr_samples = 10, mean = 0, sd = 1, ncol = 10, cof = 5){
  # Create random data (without differential signal)
  d_input <- lapply(seq_len(nr_samples), 
                    function(i) fcs_sim(mean = mean, 
                                        sd = sd,
                                        ncol = ncol,
                                        cof = cof))
  # Add differential abundance (DA) signal
  for(i in seq_len(nr_samples)){
    # number of cells in cluster 1
    n_da <- round((plogis(X_true[i]))*300)
    # set to random expression
    d_input[[i]][seq_len(n_da), ] <- 
      matrix(sinh(rnorm(n_da*ncol, mean, sd)) * cof, ncol=ncol)
    # increase expression for cluster 1
    d_input[[i]][seq_len(n_da) ,1] <- 
      sinh(rnorm(n_da, mean + 2, sd)) * cof
  }
  d_input
}

# set parameter and simulat
d_input <- comb_sim(X_true, nr_samples = 50)


## ----wrapper function marker info---------------------------------------------
marker_info <- data.frame(
  channel_name = paste0("channel", sprintf("%03d", seq_len(10))),
  marker_name = paste0("marker", sprintf("%02d", seq_len(10))),
  marker_class = factor(c(rep("type", 10)),
                        levels = c("type", "state", "none")),
  stringsAsFactors = FALSE
)

## ----run wrapper function-----------------------------------------------------
# Run wrapper function
out_DA <- censcyt(d_input, 
                  experiment_info, 
                  marker_info,
                  formula = da_formula, 
                  contrast = contrast,
                  meta_clustering = TRUE, 
                  meta_k = 10,
                  seed_clustering = 123, 
                  verbose = FALSE, 
                  mi_reps = 10,
                  imputation_method = "km",
                  BPPARAM=BiocParallel::MulticoreParam(workers = 1))

## ----run wrapper result, fig.width=10-----------------------------------------

# Display results for top DA clusters
topTable(out_DA, format_vals = TRUE)

# Plot heatmap for DA tests
plotHeatmap(out_DA, 
            analysis_type = "DA", 
            sample_order = order(X_true))

## ----sessionInfo, echo=FALSE--------------------------------------------------
sessionInfo()