## ----knitr options, echo = FALSE, include = FALSE-----------------------------
plotDev <- if (requireNamespace("ragg", quietly = TRUE)) {
  "ragg_png"
} else if (requireNamespace("Cairo", quietly = TRUE)) {
  "cairo_png"
} else {
  "png"
}

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7L,
  fig.height = 5L,
  dev = plotDev,
  fig.crop = FALSE,
  dpi = 100L
)

## ----preamble, message=FALSE, warning=FALSE-----------------------------------
library(COTAN)
library(zeallot)

# necessary to solve precedence of overloads
conflicted::conflict_prefer("%<-%", "zeallot")

# enable multi-processing (not on Windows)
prevOptState <- options(parallelly.fork.enable = TRUE)

## ----setup-and-log------------------------------------------------------------
dataDir <- file.path(tempdir(), "COTAN_vignette_data")
dir.create(dataDir, recursive = TRUE, showWarnings = FALSE)

print(dataDir)

outDir <- dataDir

# Log-level 2 was chosen to showcase better how the package works
# In normal usage a level of 0 or 1 is more appropriate
setLoggingLevel(2L)

# This file will contain all the logs produced by the package
# as if at the highest logging level
setLoggingFile(file.path(outDir, "vignette_genes_clustering.log"))

## ----download-data------------------------------------------------------------
GEO <- "GSM2861514"

dir.create(file.path(dataDir, GEO), showWarnings = FALSE)

datasetFileName <-
  file.path(dataDir, GEO, "GSM2861514_E175_All_Cells_DGE.txt.gz")


# retries up to 5 times to get the dataset
attempts <- 0L
maxAttempts <- 5L
ok <- FALSE

while (attempts < maxAttempts && !ok) {
  attempts <- attempts + 1L

  if (!file.exists(datasetFileName)) {
    res <- try(
      GEOquery::getGEOSuppFiles(
        GEO = GEO,
        makeDirectory = TRUE,
        baseDir = dataDir,
        filter_regex = base::basename(datasetFileName),
        fetch_files = TRUE
      ),
      silent = TRUE
    )
  }

  ok <- file.exists(datasetFileName)

  if (!ok && attempts < maxAttempts) {
    Sys.sleep(1)
  }
}

assertthat::assert_that(
  ok,
  msg = paste0(
    "Failed to retrieve file '", datasetFileName,
    "' after ", maxAttempts, " attempts."
  )
)

rawDataset <- read.csv(datasetFileName, sep = "\t", row.names = 1L)

print(dim(rawDataset))

## ----create-cotan-object------------------------------------------------------
cond <- "mouse_cortex_E17.5"

obj <- COTAN(raw = rawDataset)
obj <-
  initializeMetaDataset(
    obj,
    GEO = GEO,
    sequencingMethod = "Drop_seq",
    sampleCondition = cond
  )

logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])),
        logLevel = 1L)

## ----add-origin-to-the-cotan-object-------------------------------------------
# mark cells origin
data(vignette.cells.origin)
head(vignette.cells.origin, 18)

obj <- 
  addCondition(obj, condName = "origin", conditions = vignette.cells.origin)

rm(vignette.cells.origin)

## ----drop-low-quality-cells---------------------------------------------------
# use previously established results to determine
# which cells were dropped in the cleaning stage
data(vignette.split.clusters)

cellsToDrop <-
  getCells(obj)[!(getCells(obj) %in% names(vignette.split.clusters))]

obj <- dropGenesCells(obj, cells = cellsToDrop)

# Log the remaining number of cells

logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)

table(getCondition(obj, condName = "origin"))

rm(vignette.split.clusters)

## ----calculate-and-store-coex-matrix------------------------------------------
obj <-
  proceedToCoex(
    obj,
    calcCoex = TRUE,
    optimizeForSpeed = TRUE,
    cores = 3L,
    deviceStr = "cuda"
  )

## ----contingency-tables-and-coex-for-gene-pair--------------------------------
cat("Contingency Tables:")
print(contingencyTables(obj, g1 = "Satb2", g2 = "Bcl11b"))

cat(paste("Corresponding COEX value:",
          getGenesCoex(obj, zeroDiagonal = FALSE)["Satb2", "Bcl11b"]))

## ----display-small-chunk-of-coex-matrix---------------------------------------
# Extract the whole matrix
coex <- getGenesCoex(obj, zeroDiagonal = FALSE)
coex[1000L:1005L, 1000L:1005L]

## ----get-coex-sub-matrix------------------------------------------------------
# For a partial matrix
coex_sm <- getGenesCoex(obj, genes = c("Satb2", "Bcl11b", "Fezf2"),
                        zeroDiagonal = FALSE)
coex_sm[2001:2010, ]

## ----calculate-and-store-gdi--------------------------------------------------
gdiDF <- calculateGDI(obj, cores = 3L)

# Show the most *differentially expressed* genes
perm <- order(gdiDF[["GDI"]], decreasing = TRUE)
head(gdiDF[perm, ], n = 10)

# This will store only the $GDI column
obj <- storeGDI(obj, genesGDI = gdiDF)

## ----genes-list-definition----------------------------------------------------
genesLists <- list(
  # Neural Progenitor Genes
  "NPGs" = c("Nes", "Vim", "Sox2", "Sox1", "Notch1", "Hes1", "Hes5", "Pax6"),
  # Pan Neural Genes
  "PNGs" = c("Map2", "Tubb3", "Neurod1", "Nefm", "Nefl", "Dcx", "Tbr1"),
  # House Keeping
  "hk"   = c("Calm1", "Cox6b1", "Ppia", "Rpl18", "Cox7c", "Erh", "H3f3a",
             "Taf1", "Taf2", "Gapdh", "Actb", "Golph3", "Zfr", "Sub1",
             "Tars", "Amacr")
)

## ----gdi-plot-with-marked-genes-----------------------------------------------
GDIPlot(obj, genes = genesLists, GDIThreshold = 1.40)

## ----save-cotan-obj, eval=FALSE-----------------------------------------------
# # saving the structure
# saveRDS(obj, file = file.path(outDir, paste0(cond, ".cotan.RDS")))

## ----coex-heatmap-plot-for-selected-genes-------------------------------------
plot(heatmapPlot(obj, genesLists = genesLists, sets = 1L:3L, cores = 3L))

## ----define-layers-specific-genes---------------------------------------------
layersGenes <- list(
  "L1"   = c("Reln",   "Lhx5"),
  "L2/3" = c("Cux1",   "Satb2"),
  "L4"   = c("Rorb",   "Sox5"),
  "L5/6" = c("Bcl11b", "Fezf2"),
  "Prog" = c("Hes1",    "Vim", "Dummy")
)

## ----coex-heatmap-plot-for-primary-marker-genes-------------------------------
layersMarkers <- c("Satb2", "Bcl11b", "Vim", "Hes1")

genesHeatmapPlot(
  obj,
  primaryMarkers = layersMarkers,
  pValueThreshold = 0.0001,
  symmetric = TRUE,
  cores = 3L
)

## ----coex-heatmap-plot-two-genes-sets, eval=FALSE, include=TRUE---------------
# genesHeatmapPlot(
#   obj,
#   primaryMarkers = c("Satb2", "Bcl11b", "Fezf2"),
#   secondaryMarkers = c("Gabra3", "Meg3", "Cux1", "Neurod6"),
#   pValueThreshold = 0.001,
#   symmetric = FALSE,
#   cores = 3L
# )

## ----establish-genes-clusters-------------------------------------------------
c(gSpace, eigPlot, pcaGenesClDF, treePlot) %<-%
  establishGenesClusters(
    obj,
    groupMarkers = layersGenes,
    numGenesPerMarker = 25L,
    kCuts = 5L
  )

plot(eigPlot)

## ----genes-tree-plot----------------------------------------------------------
plot(treePlot)

## ----genes-umap-plot----------------------------------------------------------
colSelection <- vapply(pcaGenesClDF, is.numeric, logical(1L))

genesUmapPl <-
  UMAPPlot(
    pcaGenesClDF[, colSelection, drop = FALSE],
    clusters = getColumnFromDF(pcaGenesClDF, "hclust"),
    elements = layersGenes,
    title = "Genes' clusters UMAP Plot",
    numNeighbors = 32L,
    minPointsDist = 0.25
  )

plot(genesUmapPl)

## ----final-clean-up-----------------------------------------------------------
if (file.exists(file.path(outDir, paste0(cond, ".cotan.RDS")))) {
  # delete file if it exists
  file.remove(file.path(outDir, paste0(cond, ".cotan.RDS")))
}
if (file.exists(file.path(outDir, paste0(cond, "_times.csv")))) {
  # delete file if it exists
  file.remove(file.path(outDir, paste0(cond, "_times.csv")))
}
if (dir.exists(file.path(outDir, cond))) {
  unlink(file.path(outDir, cond), recursive = TRUE)
}
# if (dir.exists(file.path(outDir, GEO))) {
#   unlink(file.path(outDir, GEO), recursive = TRUE)
# }

# stop logging to file
setLoggingFile("")
file.remove(file.path(outDir, "vignette_genes_clustering.log"))

options(prevOptState)

## ----closure------------------------------------------------------------------
Sys.time()

sessionInfo()

