## ----installation, eval=FALSE-------------------------------------------------
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install("SETA")

## ----setup, message=FALSE, warning=FALSE, echo=TRUE---------------------------
library(SingleCellExperiment)
library(SETA)
library(ggplot2)
library(dplyr)
library(TabulaMurisSenisData)
library(reshape2)
# library(tidytext)

## ----load data, message=FALSE, warning=FALSE, echo=TRUE-----------------------
sce <- TabulaMurisSenisDroplet(tissues = "Lung")$Lung

# Two samples in this dataset have had certain celltype lineages depleted. 
# For compositional comparisons, we remove these for simplicity.
sce <- sce[, colData(sce)$subtissue != "immune-endo-depleted"]

## ----tabular data exploration-------------------------------------------------
table(sce$free_annotation, sce$age)
table(sce$free_annotation, sce$sex)


## ----palettes-----------------------------------------------------------------
# Set up a color palette for plots
# 3-group distinct categoricals
age_palette <- c(
    "1m" = "#90EE90",
    "3m" = "#4CBB17",
    "18m" = "#228B22",
    "21m" = "#355E3B",
    "30m" = "#023020")

## ----data wrangling, message=FALSE, warning=FALSE, echo=TRUE------------------
# Extract the taxonomic counts matrix using custom metadata column names
# (Users can specify these column names according to their dataset.)
df <- data.frame(colData(sce))

# Fix special characters in mouse.id before working with the data
df$mouse.id <- gsub("/", "", df$mouse.id)

taxa_counts <- setaCounts(
    df,
    cell_type_col = "free_annotation",
    sample_col = "mouse.id",
    bc_col = "cell")
head(taxa_counts)

## ----transformation, echo=TRUE------------------------------------------------
# CLR Transformation with a pseudocount of 1 (default)
clr_out <- setaCLR(taxa_counts, pseudocount = 1)
print(clr_out$counts[1:5, 1:5])

## ----ALR----------------------------------------------------------------------
# ALR Transformation: using "NK cell" as the reference cell type
if ("Natural Killer" %in% colnames(taxa_counts)) {
    alr_out <- setaALR(taxa_counts, ref = "Natural Killer", pseudocount = 1)
    print(alr_out$counts[1:5, 1:5])
} else {
    message("Reference 'NK cell' not found in taxa_counts. 
            Please choose an available cell type.")
}

## ----ILR----------------------------------------------------------------------
ilr_out <- setaILR(taxa_counts, boxcox_p = 0, pseudocount = 1)
print(ilr_out$counts[1:5, 1:5])

## ----percent------------------------------------------------------------------
pct_out <- setaPercent(taxa_counts)
print(pct_out$counts[1:5, 1:5])

## ----logCPM-------------------------------------------------------------------
lcpm_out <- setaLogCPM(taxa_counts, pseudocount = 1)
print(lcpm_out$counts[1:5, 1:5])

## ----latent spaces, echo=TRUE-------------------------------------------------
latent_pca <- setaLatent(clr_out, method = "PCA", dims = 5)
# PCA Latent Space Coordinates:
head(latent_pca$latentSpace)
# Variance Explained:
latent_pca$varExplained

## ----variance explained, message=FALSE, warning=FALSE, echo=TRUE--------------
ve_df <- data.frame(
    PC = factor(seq_along(latent_pca$varExplained), 
                levels = seq_along(latent_pca$varExplained)),
    VarianceExplained = cumsum(latent_pca$varExplained)
)

## ----variance plot, fig.width = 6, fig.height = 4-----------------------------
ggplot(ve_df, aes(x = PC, y = VarianceExplained)) +
    geom_line(color = "#2ca1db", size = 1.5, group = 1) +
    geom_point(color = "#f44323", size = 3) +
    labs(title = "Cumulative Variance Explained by Principal Components",
        x = "Principal Component", y = "Cumulative Variance Explained (%)") +
    theme_minimal() +
    ylim(c(0, 1)) +
    theme(text = element_text(size = 12))

## ----PCA scatter--------------------------------------------------------------
# Extract latent space coordinates from PCA result.
pca_coords <- latent_pca$latentSpace

# Extract metadata
meta_df <- setaMetadata(
    df,
    sample_col = "mouse.id",
    meta_cols = c("age", "sex"))

# Merge latent coordinates with metadata
pca_plot_df <- cbind(pca_coords, meta_df)

## ----PCA scatter plot, fig.width = 6, fig.height = 4--------------------------
# Create the scatter plot
ggplot(pca_plot_df, aes(x = PC1, y = PC2, color = age)) +
    geom_text(aes(label = sample_id), size = 5) +
    scale_color_manual(
        values = age_palette
    ) +
    labs(title = "PCA Scatter Plot",
        x = "PC1", y = "PC2", color = "Age") +
    theme_linedraw() +
    xlab(sprintf("PC1 (%s%%)", signif(latent_pca$varExplained[1], 4) * 100)) +
    ylab(sprintf("PC2 (%s%%)", signif(latent_pca$varExplained[2], 4) * 100))

## ----loadings, message=FALSE, warning=FALSE, echo=TRUE------------------------
loadings_df <- as.data.frame(latent_pca$loadings)
loadings_df$Celltype <- rownames(loadings_df)

# Melt the loadings into long format for `ggplot2`
loadings_long <- melt(loadings_df, id.vars = "Celltype",
                    variable.name = "PC", value.name = "Loading")

## ----loadings viz, fig.width = 12, fig.height = 9-----------------------------
# Subset rows where PC is "PC1" or "PC2"
loading_df <- loadings_long[loadings_long$PC %in% c("PC1", "PC2"), ]

# Append PC to Celltype to mimic tidytext::reorder_within()
loading_df$Celltype_PC <- paste(loading_df$Celltype, loading_df$PC, sep = "___")

# Reorder by Loading *within each PC*
loading_df$Celltype_PC <- with(loading_df, reorder(Celltype_PC, Loading))

ggplot(loading_df,
        aes(x = Loading, y = Celltype, label = Celltype, fill = Loading)) +
    geom_col() +
    facet_wrap(~PC, scales = "free") +
    # scale_y_reordered() +
    labs(title = "PC Loadings",
        x = "Loading",
        y = "Cell Type") +
    theme_linedraw() +
    scale_fill_viridis_c() +
    expand_limits(x = c(-0.2, 0.4))

## ----packages used------------------------------------------------------------
sessionInfo()

