## ----knitr options, echo = FALSE, include = FALSE-----------------------------
plotDev <- if (requireNamespace("ragg", quietly = TRUE)) {
  "ragg_png"
} else if (requireNamespace("Cairo", quietly = TRUE)) {
  "cairo_png"
} else {
  "png"
}

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7L,
  fig.height = 5L,
  dev = plotDev,
  fig.crop = FALSE,
  dpi = 100L
)

## ----preamble, message=FALSE, warning=FALSE-----------------------------------
library(COTAN)
library(zeallot)

# necessary to solve precedence of overloads
conflicted::conflict_prefer("%<-%", "zeallot")

# enable multi-processing (not on Windows)
prevOptState <- options(parallelly.fork.enable = TRUE)

## ----setup-and-log------------------------------------------------------------
dataDir <- file.path(tempdir(), "COTAN_vignette_data")
dir.create(dataDir, recursive = TRUE, showWarnings = FALSE)

print(dataDir)

outDir <- dataDir

# Log-level 2 was chosen to showcase better how the package works
# In normal usage a level of 0 or 1 is more appropriate
setLoggingLevel(2L)

# This file will contain all the logs produced by the package
# as if at the highest logging level
setLoggingFile(file.path(outDir, "vignette_dataset_cleaning.log"))

## ----download-data------------------------------------------------------------
GEO <- "GSM2861514"

dir.create(file.path(dataDir, GEO), showWarnings = FALSE)

datasetFileName <-
  file.path(dataDir, GEO, "GSM2861514_E175_All_Cells_DGE.txt.gz")

# retries up to 5 times to get the dataset
attempts <- 0L
maxAttempts <- 5L
ok <- FALSE

while (attempts < maxAttempts && !ok) {
  attempts <- attempts + 1L

  if (!file.exists(datasetFileName)) {
    res <- try(
      GEOquery::getGEOSuppFiles(
        GEO = GEO,
        makeDirectory = TRUE,
        baseDir = dataDir,
        filter_regex = base::basename(datasetFileName),
        fetch_files = TRUE
      ),
      silent = TRUE
    )
  }

  ok <- file.exists(datasetFileName)

  if (!ok && attempts < maxAttempts) {
    Sys.sleep(1)
  }
}

assertthat::assert_that(
  ok,
  msg = paste0(
    "Failed to retrieve file '", datasetFileName,
    "' after ", maxAttempts, " attempts."
  )
)

rawDataset <- read.csv(datasetFileName, sep = "\t", row.names = 1L)

print(dim(rawDataset))

## ----create-cotan-object------------------------------------------------------
cond <- "mouse_cortex_E17.5"

obj <- COTAN(raw = rawDataset)
obj <-
  initializeMetaDataset(
    obj,
    GEO = GEO,
    sequencingMethod = "Drop_seq",
    sampleCondition = cond
  )

logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])),
        logLevel = 1L)

## ----conversions-to-and-from-sce----------------------------------------------
sceObj <- convertToSingleCellExperiment(obj)

obj2 <- convertFromSingleCellExperiment(
  sceObj,
  assayName = "counts",   # the assay that contains the raw counts
  genesNamesPattern = "", # reg-exp to identify genes' names column
  cellsIDsPattern = "",   # reg-exp to identify cells' ID column
  clNamesPattern = "",    # reg-exp to identify clusterizations columns
  condNamesPattern = ""   # reg-exp to identify cells' conditions columns
)

rm(sceObj, obj2)

## ----add-origin-to-the-cotan-object-------------------------------------------
# mark cells origin
data(vignette.cells.origin)
head(vignette.cells.origin, 18)

obj <- 
  addCondition(obj, condName = "origin", conditions = vignette.cells.origin)

rm(vignette.cells.origin)

# Sometimes the condition is encoded in the cells ID using some separator
# so it is useful to use the following function to retrieve it
if (FALSE) { # This is not the case with the dataset currently in use
  batch <- COTAN::conditionsFromNames(
    names = getCells(obj), splitPattern = "_", fragmentNum = 2L)
  names(batch) <- getCells(obj)
  obj <- addCondition(obj, condName = "batch", conditions = batch)
}

## ----ecd-plot-----------------------------------------------------------------
plot(ECDPlot(obj, condName = "origin"))

## ----cells-size-plot----------------------------------------------------------
plot(cellSizePlot(obj, condName = "origin"))

## ----number-detected-genes----------------------------------------------------
plot(genesSizePlot(obj, condName = "origin"))

## ----scatter-plot-------------------------------------------------------------
plot(scatterPlot(obj, condName = "origin"))

## ----cull-probable-doublets---------------------------------------------------
# Store the used threshold in the COTAN object for later reference
cellsSizeThr <- 6000L
obj <- addElementToMetaDataset(obj, "Cells size threshold", cellsSizeThr)

cellsToRem <- getCells(obj)[getCellsSize(obj) > cellsSizeThr]
obj <- dropGenesCells(obj, cells = cellsToRem)

plot(cellSizePlot(obj, condName = "origin"))

## ----cull-probable-doublets_2-------------------------------------------------
# Store the used threshold in the COTAN object for later reference
genesSizeHighThr <- 3000L
obj <-
  addElementToMetaDataset(obj, "Num genes high threshold", genesSizeHighThr)

cellsToRem <- getCells(obj)[getNumExpressedGenes(obj) > genesSizeHighThr]
obj <- dropGenesCells(obj, cells = cellsToRem)

plot(genesSizePlot(obj, condName = "origin"))

## ----cull-probable-dead-cells-------------------------------------------------
# Store the used threshold in the COTAN object for later reference
genesSizeLowThr <- 500L
obj <- addElementToMetaDataset(obj, "Num genes low threshold", genesSizeLowThr)

cellsToRem <- getCells(obj)[getNumExpressedGenes(obj) < genesSizeLowThr]
obj <- dropGenesCells(obj, cells = cellsToRem)

plot(genesSizePlot(obj, condName = "origin"))

## ----calculate-mitochondrial-genes-percentage---------------------------------
c(mitPlot, mitSizes) %<-%
  mitochondrialPercentagePlot(obj, genePrefix = "^mt-", condName = "origin")

plot(mitPlot)

## ----cull-probable-dead-cells_2-----------------------------------------------
# Store the used threshold in the COTAN object for later reference
mitPercThr <- 3.5
obj <- addElementToMetaDataset(obj, "Mitoc. perc. threshold", mitPercThr)

cellsToRem <- rownames(mitSizes)[mitSizes[["mit.percentage"]] > mitPercThr]
obj <- dropGenesCells(obj, cells = cellsToRem)

c(mitPlot, mitSizes) %<-%
  mitochondrialPercentagePlot(obj, genePrefix = "^mt-", condName = "origin")

plot(mitPlot)

## ----scatter-plot_2-----------------------------------------------------------
plot(scatterPlot(obj, condName = "origin"))

## ----log-current-dataset-status-----------------------------------------------
logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)

getDims(obj)

## ----calculate-b-group-from-pca-----------------------------------------------
obj <- addElementToMetaDataset(obj, "Num drop B group", 0L)

obj <- clean(obj)

c(pcaCellsPlot, pcaCellsData, genesPlot,
  UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj)

plot(pcaCellsPlot)

## ----show-most-expressed-genes-b-group----------------------------------------
plot(genesPlot)

## ----cells-efficiency-pca-plot------------------------------------------------
plot(UDEPlot)

## ----cells-efficiency-plot----------------------------------------------------
plot(nuPlot)

## ----cells-efficiency-plot_low-tail-------------------------------------------
plot(zoomedNuPlot)

## ----cull-b-group-and-recalculate---------------------------------------------
cellsToRem <- rownames(pcaCellsData)[pcaCellsData[["groups"]] == "B"]
obj <- dropGenesCells(obj, cells = cellsToRem)

# Store how many times the B group was dropped
obj <- addElementToMetaDataset(obj, "Num drop B group", 1L)

## ----cull-low-efficiency-cells------------------------------------------------
udeLowThr <- 0.34
obj <- addElementToMetaDataset(obj, "Low UDE cells' threshold", udeLowThr)

obj <- estimateNuLinear(obj)

cellsToRem <- getCells(obj)[getNu(obj) < udeLowThr]
obj <- dropGenesCells(obj, cells = cellsToRem)

## ----raw-plots----------------------------------------------------------------
plot(ECDPlot(obj, condName = "origin"))

plot(cellSizePlot(obj, condName = "origin"))

plot(genesSizePlot(obj, condName = "origin"))

plot(scatterPlot(obj, condName = "origin"))

c(mitPlot, mitSizes) %<-%
  mitochondrialPercentagePlot(obj, genePrefix = "^mt-", condName = "origin")

plot(mitPlot)

## ----recalculate-clean-plots--------------------------------------------------
obj <- clean(obj)

c(pcaCellsPlot, pcaCellsData, genesPlot,
  UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj)

## ----show-clean-plots---------------------------------------------------------
plot(pcaCellsPlot)

plot(genesPlot)

plot(UDEPlot)

plot(nuPlot)
plot(zoomedNuPlot)

# It is also possible to directly plot the PCA results
plot(pcaCellsData)

## ----log-updated-dataset-status-----------------------------------------------
logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)

table(getCondition(obj, condName = "origin"))

## ----final-clean-up-----------------------------------------------------------
if (file.exists(file.path(outDir, paste0(cond, ".cotan.RDS")))) {
  # delete file if it exists
  file.remove(file.path(outDir, paste0(cond, ".cotan.RDS")))
}
if (file.exists(file.path(outDir, paste0(cond, "_times.csv")))) {
  # delete file if it exists
  file.remove(file.path(outDir, paste0(cond, "_times.csv")))
}
if (dir.exists(file.path(outDir, cond))) {
  unlink(file.path(outDir, cond), recursive = TRUE)
}
# if (dir.exists(file.path(outDir, GEO))) {
#   unlink(file.path(outDir, GEO), recursive = TRUE)
# }

# stop logging to file
setLoggingFile("")
file.remove(file.path(outDir, "vignette_dataset_cleaning.log"))

options(prevOptState)

## ----closure------------------------------------------------------------------
Sys.time()

sessionInfo()

