## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE, fig.align = "center")

## ----packages, message=FALSE, warning=FALSE-----------------------------------
# Packages
library("scran")
library("SingleR")
library("ggplot2")
library("scRNAseq")
library("Coralysis")
library("SingleCellExperiment")

## ----data preprocessing, message=FALSE, warning=FALSE-------------------------
### Objects used and QC steps from SingleR book:
# https://bioconductor.org/books/release/SingleRBook/sc-mode.html

## Reference: muraro-pancreas-2016
# Import data
ref <- MuraroPancreasData()

# Remove unnecessary data & labels
altExps(ref) <- NULL
ref <- ref[, !is.na(ref$label) & ref$label != "unclear"]

# Normalise
ref <- logNormCounts(ref)
counts(ref) <- NULL


## Query: grun-pancreas-2016
# Import data
query <- GrunPancreasData()

# Remove unnecessary data & labels
query <- addPerCellQC(query)
qc <- quickPerCellQC(colData(query),
    percent_subsets = "altexps_ERCC_percent",
    batch = query$donor,
    subset = query$donor %in% c("D17", "D7", "D2")
)
query <- query[, !qc$discard]
altExps(query) <- NULL

# Normalise
query <- logNormCounts(query)
counts(query) <- NULL

# Feature selection with 'scran' package for reference
nhvg <- 500
set.seed(123)
top.hvg <- getTopHVGs(ref, n = nhvg)

# Subset object
ref <- ref[top.hvg, ]

# Filter out genes for query:
# not required, only done to reduce query size object
query <- query[top.hvg, ]

## ----train reference, message=TRUE, warning=FALSE-----------------------------
# Train the reference
set.seed(123)
ref <- RunParallelDivisiveICP(
    object = ref, L = 10, k = 8,
    build.train.params = list(nhvg = 500), # default 2000
    threads = 2
) # runs without 'batch.label'

# Compute reference PCA
ref <- RunPCA(ref, return.model = TRUE, pca.method = "stats")

# Compute reference UMAP
set.seed(123)
ref <- RunUMAP(ref,
    return.model = TRUE,
    umap.method = "uwot", dims = 1:15,
    n_neighbors = 30, min_dist = 0.3
)

## ----viz reference, fig.width=6, fig.height=7---------------------------------
# Vizualize reference
ref.celltype.plot <- PlotDimRed(
    object = ref,
    color.by = "label",
    dimred = "UMAP",
    point.size = 0.01,
    legend.nrow = 3,
    seed.color = 17
) +
    ggtitle("Reference: ground-truth labels")
ref.celltype.plot

## ----reference-mapping--------------------------------------------------------
# Reference-mapping
set.seed(1024)
query <- ReferenceMapping(
    ref = ref, query = query, ref.label = "label",
    project.umap = TRUE, dimred.name.prefix = "ref",
    k.nn = 5, label.prune.cutoff = 0.5
)

## ----prediction accuracy scores - coral_labels--------------------------------
# Compare against SingleR
preds.singler <- SingleR(test = query, ref = ref, labels = ref$label, de.method = "wilcox")
colData(query) <- cbind(colData(query), preds.singler)

# Confusion matrix 'coral_labels'
preds_x_truth <- table(query$coral_labels, query$labels)

# Accuracy for 'coral_labels'
acc <- sum(diag(preds_x_truth)) / sum(preds_x_truth)

# Print confusion matrix
preds_x_truth

## ----prediction accuracy scores - pruned_coral_labels-------------------------
# Confusion matrix for 'pruned_coral_labels'
pruned_preds_x_truth <- table(query$pruned_coral_labels, query$labels)


# Accuracy for 'pruned_coral_labels'
pruned_acc <- sum(diag(pruned_preds_x_truth)) / sum(pruned_preds_x_truth)

# Print confusion matrix
pruned_preds_x_truth

## ----prediction viz, fig.width=7.5, fig.height=10-----------------------------
# Plot query and reference UMAP side-by-side # fig.height=7.25
# with ground-truth & predicted cell labels
use.color <- c("acinar" = "#FF7F00", "alpha" = "#CCCCCC", "beta" = "#FDCDAC", 
               "delta" = "#E6F5C9", "duct" = "#E31A1C", "endothelial" = "#377EB8", 
               "epsilon" = "#7FC97F", "mesenchymal" = "#F0027F", "pp" = "#B3CDE3")
query.singler.plot <- PlotDimRed(
    object = query,
    color.by = "labels",
    dimred = "refUMAP",
    point.size = 0.25,
    point.stroke = 0.25,
    legend.nrow = 3,
    use.color = use.color
) +
    ggtitle("Query: SingleR predictions")
query.predicted.plot <- PlotDimRed(
    object = query,
    color.by = "coral_labels",
    dimred = "refUMAP",
    point.size = 0.25,
    point.stroke = 0.25,
    legend.nrow = 3,
    use.color = use.color
) +
    ggtitle("Query: Coralysis predictions")
query.pruned.plot <- PlotDimRed(
    object = query,
    color.by = "pruned_coral_labels",
    dimred = "refUMAP",
    point.size = 0.25,
    point.stroke = 0.25,
    legend.nrow = 3,
    use.color = use.color
) +
    ggtitle("Query: Coralysis predictions pruned")
query.confidence.plot <- PlotExpression(
    object = query,
    color.by = "coral_probability",
    dimred = "refUMAP",
    point.size = 0.25,
    point.stroke = 0.1,
    color.scale = "viridis"
) +
    ggtitle("Query: Coralysis confidence")
cowplot::plot_grid(ref.celltype.plot, query.singler.plot,
    query.predicted.plot, query.pruned.plot,
    query.confidence.plot,
    ncol = 2, align = "vh"
)

## ----rsession-----------------------------------------------------------------
# R session
sessionInfo()