## ----knitr options, echo = FALSE, include = FALSE-----------------------------
# Pick the best available PNG graphics device for the figures:
# prefer `ragg`, fall back to `Cairo`, and finally to the base `png` device
plotDev <- "png"
if (requireNamespace("ragg", quietly = TRUE)) {
  plotDev <- "ragg_png"
} else if (requireNamespace("Cairo", quietly = TRUE)) {
  plotDev <- "cairo_png"
}

# Default chunk options for the whole vignette
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7L,
  fig.height = 5L,
  dev = plotDev,
  fig.crop = FALSE,
  dpi = 100L
)

## ----preamble, message=FALSE, warning=FALSE-----------------------------------
# Attach the packages used throughout the vignette: `COTAN` for the analysis
# itself, `zeallot` for the `%<-%` multiple-assignment operator used below
# to unpack multi-value returns
library(COTAN)
library(zeallot)

# necessary to solve precedence of overloads
conflicted::conflict_prefer("%<-%", "zeallot")

# enable multi-processing (not on Windows)
# the previous option state is kept so it can be restored in the clean-up
prevOptState <- options(parallelly.fork.enable = TRUE)

## ----setup-and-log------------------------------------------------------------
# Scratch directory that will hold the downloaded data and all outputs
dataDir <- file.path(tempdir(), "COTAN_vignette_data")
dir.create(dataDir, recursive = TRUE, showWarnings = FALSE)

print(dataDir)

# Outputs are written next to the inputs
outDir <- dataDir

# Log-level 2 was chosen to showcase better how the package works
# In normal usage a level of 0 or 1 is more appropriate
setLoggingLevel(2L)

# This file will contain all the logs produced by the package
# as if at the highest logging level
logFilePath <- file.path(outDir, "vignette_uniform_clustering.log")
setLoggingFile(logFilePath)

## ----download-data------------------------------------------------------------
# GEO accession of the E17.5 mouse cortex Drop-seq dataset
GEO <- "GSM2861514"

dir.create(file.path(dataDir, GEO), showWarnings = FALSE)

datasetFileName <-
  file.path(dataDir, GEO, "GSM2861514_E175_All_Cells_DGE.txt.gz")


# retries up to `maxAttempts` times to get the dataset (GEO can be flaky)
maxAttempts <- 5L
ok <- FALSE

for (attempt in seq_len(maxAttempts)) {
  if (!file.exists(datasetFileName)) {
    # try() so a transient network failure does not abort the vignette;
    # success is determined via file.exists() below, so the (previously
    # unused) return value of try() is simply discarded
    try(
      GEOquery::getGEOSuppFiles(
        GEO = GEO,
        makeDirectory = TRUE,
        baseDir = dataDir,
        filter_regex = base::basename(datasetFileName),
        fetch_files = TRUE
      ),
      silent = TRUE
    )
  }

  ok <- file.exists(datasetFileName)
  if (ok) {
    break
  }

  if (attempt < maxAttempts) {
    Sys.sleep(1) # brief pause before the next attempt
  }
}

assertthat::assert_that(
  ok,
  msg = paste0(
    "Failed to retrieve file '", datasetFileName,
    "' after ", maxAttempts, " attempts."
  )
)

# tab-separated counts table; first column holds the row names
# (presumably genes on rows and cells on columns -- TODO confirm)
rawDataset <- read.csv(datasetFileName, sep = "\t", row.names = 1L)

print(dim(rawDataset))

## ----create-cotan-object------------------------------------------------------
# Dataset condition tag: also used later to name output files/directories
cond <- "mouse_cortex_E17.5"

# Wrap the raw counts matrix in a `COTAN` object
obj <- COTAN(raw = rawDataset)
# Attach the dataset meta-data (GEO accession, sequencing method, condition)
obj <-
  initializeMetaDataset(
    obj,
    GEO = GEO,
    sequencingMethod = "Drop_seq",
    sampleCondition = cond
  )

# Echo the stored condition tag to the log as a sanity check
logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])),
        logLevel = 1L)

## ----add-origin-to-the-cotan-object-------------------------------------------
# mark cells origin
# `vignette.cells.origin` is a data-set shipped with the package
# (presumably one origin label per cell barcode -- see the package data docs)
data(vignette.cells.origin)
head(vignette.cells.origin, 18)

# Store the origin labels as a named condition inside the `COTAN` object
obj <- 
  addCondition(obj, condName = "origin", conditions = vignette.cells.origin)

# The labels are now kept by the object, so the loose copy can be dropped
rm(vignette.cells.origin)

## ----drop-low-quality-cells---------------------------------------------------
# use previously established results to determine
# which cells were dropped in the cleaning stage
data(vignette.split.clusters)

# Evaluate getCells() once (was called twice in a single expression) and
# keep only the cells that are absent from the pre-computed clusterization,
# i.e. those that the cleaning stage discarded
allCells <- getCells(obj)
cellsToDrop <- allCells[!(allCells %in% names(vignette.split.clusters))]

obj <- dropGenesCells(obj, cells = cellsToDrop)

# Log the remaining number of cells

logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)

# Cross-tabulate the surviving cells against their annotated origin
table(getCondition(obj, condName = "origin"))

rm(vignette.split.clusters)

## ----calculate-and-store-coex-matrix------------------------------------------
# Estimate the model parameters; `calcCoex = FALSE` skips the expensive
# co-expression matrix computation, which is not needed for this vignette.
# NOTE(review): `deviceStr = "cuda"` requests GPU acceleration -- presumably
# the package falls back to CPU when no CUDA device is available; confirm
obj <-
  proceedToCoex(
    obj,
    calcCoex = FALSE,
    optimizeForSpeed = TRUE,
    cores = 3L,
    deviceStr = "cuda"
  )

## ----create-ut-clusterization, eval=FALSE, include=TRUE-----------------------
# # This code is too computationally heavy to be used in a vignette [~20 min.]
# # So the result was stored and will be re-loaded in the next chunk
# 
# 
# initialClusters <- NULL
# 
# if (TRUE) {
# #   # read from the last file among those named partial_clusterization_XX.csv
#   mergeDir <- file.path(outDir, cond, "reclustering")
# 
#   resFiles <-
#     list.files(
#       path = mergeDir,
#       pattern = "partial_clusterization_.*csv",
#       full.names = TRUE
#     )
# 
#   if (length(resFiles) != 0L) {
#     prevClustRes <-
#       read.csv(resFiles[length(resFiles)], header = TRUE, row.names = 1L)
#     initialClusters <- getColumnFromDF(prevClustRes, 1L)
#     initialClusters[is.na(initialClusters)] <- "-1"
#     initialClusters <- niceFactorLevels(initialClusters)
#   }
# }
# 
# 
# # default constructed checker is OK
# advChecker <- new("AdvancedGDIUniformityCheck")
# 
# c(splitClusters, splitCoexDF) %<-%
#   cellsUniformClustering(
#     obj,
#     initialResolution = 0.8,
#     checker = advChecker,
#     dataMethod = "LogLikelihood", # of observed against expected data
# #     useCoexEigen = TRUE, # project on `COEX` eigenvectors instead of doing a `PCA`
#     numReducedComp = 50L,
#     minimumUTClusterSize = 50L, # too large values imply many un-clustered cells
#     initialClusters = initialClusters,
#     initialIteration = 1L,
#     optimizeForSpeed = TRUE,
#     deviceStr = "cuda",
#     cores = 3L,
#     saveObj = TRUE, # save intermediate data for log and restart
#     outDir = outDir
#   )
# 
# # add the newly found clusterization to the `COTAN` object
# obj <-
#   addClusterization(
#     obj,
#     clName = "split",
#     clusters = splitClusters,
#     coexDF = splitCoexDF
#   )
# 
# table(splitClusters)

## ----load-pre-calculated-clusterization_split---------------------------------
# Load the pre-computed "split" clusterization shipped with the package,
# replacing the heavy `cellsUniformClustering()` run shown (but not
# evaluated) in the previous chunk
data("vignette.split.clusters", package = "COTAN")
splitClusters <- vignette.split.clusters

# explicitly calculate the DEA of the clusterization to store it
splitCoexDF <- DEAOnClusters(obj, clusters = splitClusters)

# Attach the clusterization and its DEA data-frame to the `COTAN` object;
# `override = FALSE` presumably refuses to replace an existing "split"
# clusterization -- confirm against addClusterization() docs
obj <-
  addClusterization(
    obj,
    clName = "split",
    clusters = splitClusters,
    coexDF = splitCoexDF,
    override = FALSE
  )

## ----extract-clusterization-summary_split-------------------------------------
# Summarize the "split" clusterization, broken down by cell origin;
# `%<-%` (zeallot) unpacks the returned pair into a data object and a plot
c(summaryData, summaryPlot) %<-%
  clustersSummaryPlot(
    obj,
    clName = "split",
    plotTitle = "split summary",
    condName = "origin"
  )

head(summaryData, n = 10L)

## ----summary-as-plot----------------------------------------------------------
plot(summaryPlot)

## ----define-layers-specific-genes---------------------------------------------
# these are some genes associated to each cortical layer
# (two marker genes per layer / progenitor group)
layersGenes <- setNames(
  list(
    c("Reln", "Lhx5"),    # layer 1
    c("Cux1", "Satb2"),   # layers 2/3
    c("Rorb", "Sox5"),    # layer 4
    c("Bcl11b", "Fezf2"), # layers 5/6
    c("Hes1", "Vim")      # progenitors
  ),
  c("L1", "L2/3", "L4", "L5/6", "Prog")
)

## ----clusterization-data-visualization_split----------------------------------
# Heatmap of the layer marker genes against the "split" clusters;
# unpacks into the heatmap plus the score and p-value data-frames
# (`adjustmentMethod = "holm"` selects the multiple-testing correction)
c(splitHeatmap, splitScoreDF, splitPValueDF) %<-%
  clustersMarkersHeatmapPlot(
    obj,
    groupMarkers = layersGenes,
    clName = "split",
    kCuts = 2L,
    adjustmentMethod = "holm"
  )

# `ComplexHeatmap` objects need an explicit draw() call to be rendered
ComplexHeatmap::draw(splitHeatmap)

## ----clusterization-umap-plot-via-drm_split, eval=FALSE, include=TRUE---------
# c(umapPlot, cellsRDM) %<-%
#   cellsUMAPPlot(
#     obj,
#     clName = "split",
#     clusters = NULL,
#     useCoexEigen = TRUE, # Aligning these to `split` generation gives best res.
#     dataMethod = "LogLikelihood", # "AdjBinarized" is also a good choice
#     numComp = 50L, # number of components of the data reduced matrix
#     colors = NULL, # a list of user defined colors for the various clusters
#     numNeighbors = 30L, # UMAP related parameters
#     minPointsDist = 0.3 # UMAP related parameters
#   )
# 
# plot(umapPlot)

## ----clusterization-umap-plot-via-drm_split_as-seurat-------------------------
# UMAP of the "split" clusterization computed Seurat-style: log-normalized
# data restricted to the top highly-variable genes, instead of the
# `COEX`-eigenvector projection shown (not evaluated) in the previous chunk
c(umapPlot, cellsRDM) %<-%
  cellsUMAPPlot(
    obj,
    clName = "split",
    clusters = NULL,
    useCoexEigen = FALSE,
    dataMethod = "LogNormalized", 
    numComp = 50L, # number of components of the data reduced matrix
    genesSel = "HVG_Seurat", # highly-variable genes, Seurat selection method
    numGenes = 500L, # number of genes kept for the reduction
    colors = NULL, # a list of user defined colors for the various clusters
    numNeighbors = 30L, # UMAP related parameters
    minPointsDist = 0.3 # UMAP related parameters
  )

plot(umapPlot)

## ----merge-clusters-if-ut, eval=FALSE, include=TRUE---------------------------
# 
# checkersList <-
#   list(
#     advChecker,
#     shiftCheckerThresholds(advChecker, 0.01), # Sensible threshold shifts really
#     shiftCheckerThresholds(advChecker, 0.03)  # depend on the dataset
#   )
# 
# prevCheckRes <- data.frame()
# 
# # The following code shows the case when one wants to re-use the already
# # calculated checks from the saved intermediate outputs.
# # This can apply in cases one wants to add a new checker to the list above or
# # in the cases the execution is interrupted prematurely.
# #
# # Note that a similar facility exists for the `cellsUniformClustering()`
# # function above.
# #
# if (TRUE) {
#   # read from the last file among those named all_check_results_XX.csv
#   mergeDir <- file.path(outDir, cond, "leafs_merge")
# 
#   resFiles <-
#     list.files(
#       path = mergeDir,
#       pattern = "all_check_results_.*csv",
#       full.names = TRUE
#     )
# 
#   if (length(resFiles) != 0L) {
#     prevCheckRes <-
#       read.csv(resFiles[length(resFiles)], header = TRUE, row.names = 1L)
#   }
# }
# 
# c(mergedClusters, mergedCoexDF) %<-%
#   mergeUniformCellsClusters(
#     obj,
#     clusters = splitClusters,
#     checkers = checkersList,
#     allCheckResults = prevCheckRes,
#     optimizeForSpeed = TRUE,
#     deviceStr = "cuda",
#     cores = 6L,
#     useDEA = TRUE,
#     saveObj = TRUE,
#     outDir = outDir
#   )
# 
# # add the newly found clusterization to the `COTAN` object
# obj <-
#   addClusterization(
#     obj,
#     clName = "merge",
#     override = TRUE,
#     clusters = mergedClusters,
#     coexDF = mergedCoexDF
#   )
# 
# table(mergedClusters)

## ----load-pre-calculated-clusterization_merge---------------------------------
# Load the pre-computed "merge" clusterization shipped with the package;
# subsetting by getCells(obj) aligns the labels to the cells (and order)
# actually present in the object after the cleaning step
data("vignette.merge.clusters", package = "COTAN")
mergedClusters <- vignette.merge.clusters[getCells(obj)]

# explicitly calculate the DEA of the clusterization to store it
mergedCoexDF <- DEAOnClusters(obj, clusters = mergedClusters)

# Store the "merge" clusterization alongside the "split" one
obj <-
  addClusterization(
    obj,
    clName = "merge",
    clusters = mergedClusters,
    coexDF = mergedCoexDF,
    override = FALSE
  )

# this gives immediate visual information about which clusters were merged
table(splitClusters, mergedClusters)

## ----extract-clusterization-summary_merge-------------------------------------
# Same summary as for "split", now for the merged clusterization
c(summaryData, summaryPlot) %<-%
  clustersSummaryPlot(
    obj,
    clName = "merge",
    plotTitle = "merge summary",
    condName = "origin"
  )

head(summaryData, n = 10L)

plot(summaryPlot)

## ----clusterization-data-visualization_merge----------------------------------
# Marker-genes heatmap for the merged clusters; `condNameList` adds the
# cells' origin as a side annotation of the heatmap
c(mergeHeatmap, mergeScoresDF, mergePValuesDF) %<-%
  clustersMarkersHeatmapPlot(
    obj,
    groupMarkers = layersGenes,
    clName = "merge",
    condNameList = list("origin")
  )

ComplexHeatmap::draw(mergeHeatmap)

## ----clusterization-data-visualization_merge-vs-split-------------------------
# Compare "merge" against "split": the split clusterization is passed in as
# an extra condition so it shows up as a side annotation of the heatmap.
# NOTE(review): the `..` placeholder presumably discards the remaining
# returned values (zeallot uses `.` elsewhere in this file for a single
# skip) -- confirm it does not just create a variable literally named `..`
c(mergeVsSplitHeatmap, ..) %<-%
  clustersMarkersHeatmapPlot(
    obj,
    clName = "merge",
    condNameList = list("split"),
    conditionsList = list(splitClusters)
  )

ComplexHeatmap::draw(mergeVsSplitHeatmap)

## ----clusterization-umap-plot-via-drm_merge, eval=FALSE, include=TRUE---------
# c(umapPlot, .) %<-%
#   cellsUMAPPlot(
#     obj,
#     clName = "merge",
#     clusters = NULL,
#     useCoexEigen = TRUE,
#     dataMethod = "LogLikelihood", # "AdjBinarized" is also a good choice
#     numComp = 50L, # number of components of the data reduced matrix
#     colors = NULL, # a list of user defined colors for the various clusters
#     numNeighbors = 30L, # UMAP related parameters
#     minPointsDist = 0.3 # UMAP related parameters
#   )
# 
# plot(umapPlot)

## ----clusterization-umap-plot-via-drm_merge_as-seurat-------------------------
# Seurat-style UMAP of the merged clusterization; this time the
# highly-variable genes are picked with the Scanpy selection method
c(umapPlot, .) %<-%
  cellsUMAPPlot(
    obj,
    clName = "merge",
    clusters = NULL,
    useCoexEigen = FALSE,
    dataMethod = "LogNormalized", # "AdjBinarized" is also a good choice
    numComp = 50L, # number of components of the data reduced matrix
    genesSel = "HVG_Scanpy", # highly-variable genes, Scanpy selection method
    numGenes = 500L, # number of genes kept for the reduction
    colors = NULL, # a list of user defined colors for the various clusters
    numNeighbors = 30L, # UMAP related parameters
    minPointsDist = 0.3 # UMAP related parameters
  )

plot(umapPlot)

## ----clean-up-----------------------------------------------------------------
# Remove the artifacts produced while running the vignette so repeated runs
# do not accumulate files in the temporary directory.

# Build each path once instead of repeating the same file.path() expression
filesToRemove <- c(
  file.path(outDir, paste0(cond, ".cotan.RDS")),
  file.path(outDir, paste0(cond, "_times.csv"))
)
for (fileToRemove in filesToRemove) {
  if (file.exists(fileToRemove)) {
    # delete file if it exists
    file.remove(fileToRemove)
  }
}

if (dir.exists(file.path(outDir, cond))) {
  unlink(file.path(outDir, cond), recursive = TRUE)
}
# if (dir.exists(file.path(outDir, GEO))) {
#   unlink(file.path(outDir, GEO), recursive = TRUE)
# }

# stop logging to file
setLoggingFile("")

# guard the removal: an unconditional file.remove() warns when the target
# does not exist (e.g. if logging to file was never actually started)
logFile <- file.path(outDir, "vignette_uniform_clustering.log")
if (file.exists(logFile)) {
  file.remove(logFile)
}

# restore the options changed in the preamble
options(prevOptState)

## ----closure------------------------------------------------------------------
# Record when the vignette finished and the full session information
print(Sys.time())

print(sessionInfo())

