## ----installation, eval=FALSE-------------------------------------------------
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install("SETA")

## ----load packages, message=FALSE, warning=FALSE------------------------------
library(SingleCellExperiment)
library(SETA)
library(ggplot2)
library(ggraph)
library(tidygraph)
library(dplyr)
library(ggplot2)
library(patchwork)
library(TabulaMurisSenisData)

## ----palettes-----------------------------------------------------------------
# Set up a color palette for plots
# 9-group distinct categoricals
d_palette <- c("#3B9AB2", "#6BAFAF", "#9BBDAC", "#C9C171",
                "#EBCC2A", "#E6A80F", "#E98903", "#F84C00", "#F21A00",
                "#B10026", "#67000D")

## ----load data, echo = FALSE--------------------------------------------------
sce <- TabulaMurisSenisDroplet(tissues = "Lung")$Lung

sce <- sce[, colData(sce)$subtissue != "immune-endo-depleted"]

sce$mouse.id <- gsub("/", "", sce$mouse.id)


## ----tabular data exploration-------------------------------------------------
table(sce$free_annotation, sce$age)

## ----add lineage information--------------------------------------------------
# We'll make a small mapping from original free_annotation -> broader 'Lineage'
lineage_map <- c(
    "Adventitial Fibroblast" = "Fibroblast",
    "Airway Smooth Muscle" = "SMC",
    "Alveolar Epithelial Type 2" = "Epithelial",
    "Alveolar Fibroblast" = "Fibroblast",
    "Alveolar Macrophage" = "Myeloid",
    "Artery" = "Endothelial",
    "B" = "B cell",
    "Basophil" = "Granulocyte",
    "Ccr7+ Dendritic" = "Myeloid",
    "CD4+ T" = "T cell",
    "CD8+ T" = "T cell",
    "Capillary" = "Endothelial",
    "Capillary Aerocyte" = "Endothelial",
    "Ciliated" = "Epithelial",
    "Classical Monocyte" = "Myeloid",
    "Club" = "Epithelial",
    "Intermediate Monocyte" = "Myeloid",
    "Interstitial Macrophage" = "Myeloid",
    "Lympatic" = "Endothelial",
    "Ly6g5b+ T" = "T cell",
    "Myeloid Dendritic Type 1" = "Myeloid",
    "Myeloid Dendritic Type 2" = "Myeloid",
    "Myofibroblast" = "Fibroblast",
    "Natural Killer" = "NK",
    "Natural Killer T" = "NK",
    "Neuroendocrine" = "Neuroendocrine",
    "Neutrophil" = "Granulocyte",
    "Nonclassical Monocyte" = "Myeloid",
    "Pericyte" = "Endothelial",
    "Plasma" = "Plasma Cell",
    "Plasmacytoid Dendritic" = "Myeloid",
    "Proliferating Alveolar Macrophage" = "Proliferating",
    "Proliferating Classical Monocyte" = "Proliferating",
    "Proliferating Dendritic" = "Proliferating",
    "Proliferating NK" = "Proliferating",
    "Proliferating T" = "Proliferating",
    "Regulatory T" = "T cell",
    "Vein" = "Endothelial",
    "Zbtb32+ B" = "B cell"
)

# Add a new column in  named 'Lineage'
free_annotations <- as.character(sce$free_annotation)
sce$Lineage <- lineage_map[free_annotations]

## ----create taxonomy DF-------------------------------------------------------
# In a Seurat Object, barcodes are in the rownames of the metadata.
# Similar to setaCounts, taxonomy DF creation can use rownames as bc.
# Then we ensure we have columns for 'free_annotation' and 'Lineage'.
print(head(sce$free_annotation))
print(head(sce$Lineage))

# We want a data frame that includes bc + 'free_annotation' + 'Lineage'
# Then we'll pass it to setaTaxonomyDF
tax_df <- setaTaxonomyDF(
    obj = data.frame(colData(sce)),
    resolution_cols = c("Lineage", "free_annotation"),
    bc_col = "cell"
)

# Each row is a unique 'free_annotation'.
# The last column 'free_annotation' is the finer label.
# cols must be entered in order of increasing resolution (broadest to finest)
tax_df

## ----ggraph, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 6------
# Create a tidygraph object using SETA built-in utils

tbl_g <- taxonomy_to_tbl_graph(
    tax_df,
    columns = c("Lineage", "free_annotation"))

# Plot with ggraph
ggraph(tbl_g, layout = "dendrogram") +
    geom_edge_diagonal2(aes(color = node.Lineage)) +
    geom_node_text(aes(filter = leaf,
                        label = free_annotation, color = Lineage),
                   nudge_y = 0.1, vjust = 0.5, hjust = 0, size = 5) +
    geom_node_text(aes(filter = !leaf,
                        label = Lineage),
                    color = 'black',
                    size = 5,
                    repel = TRUE) +
    guides(edge_colour = guide_legend(title = "Lineage"),
            color = guide_legend(title = "Lineage")) +
    theme_linedraw(base_size = 16) +
    scale_edge_color_manual(values = d_palette) +
    scale_color_manual(values = d_palette) +
    scale_y_reverse(breaks = seq(0, 2, by = 1),
                    labels = c("Type", "Lineage", "Root")) +
    theme(axis.text.y = element_blank()) +
    expand_limits(y = -5) +
    coord_flip() +
    ggtitle("SETA: Taxonomy")

## ----counts and metadata------------------------------------------------------
# Create sample x free_annotation counts:
taxa_counts <- setaCounts(
    data.frame(colData(sce)),
    cell_type_col = "free_annotation",
    sample_col = "mouse.id",
    bc_col = "rownames"
)

# Create SETA sample-level metadata
meta_df <- setaMetadata(data.frame(colData(sce)), sample_col = "mouse.id",
                        meta_cols = c("age", "sex"))


bal_out <- setaTransform(
    counts     = taxa_counts,
    method     = "balance",
    balances   = list(
        epi_vs_myeloid = list(
            num   = "Epithelial",
            denom = "Myeloid"
        )
    ),
    taxonomyDF   = tax_df,
    taxonomy_col = "Lineage"
)

# merge the balances with sample-level metadata
bal_df <- data.frame(
    sample_id       = rownames(bal_out$counts),
    epi_vs_myeloid  = as.numeric(bal_out$counts[, "epi_vs_myeloid"]),
    stringsAsFactors = FALSE
) |>
    dplyr::left_join(meta_df, by = "sample_id")

# Compare age groups by epithelial to myeloid ratios
library(ggplot2)
ggplot(bal_df, aes(x = age, y = epi_vs_myeloid, fill = age)) +
    geom_boxplot(outlier.shape = NA, alpha = 0.4) +
    geom_jitter(width = 0.15, size = 2) +
    scale_fill_manual(values = d_palette) +
    labs(title = "Epithelial / Myeloid balance across age groups",
        y = "log GM(Epithelial) - log GM(Myeloid)",
        x = "Age (months)") +
    theme_minimal(base_size = 14) +
    theme(legend.position = "none")

## ----reference frame calculation----------------------------------------------
# We'll transform them with a 'Lineage' grouping.
# The taxonomyDF uses rownames = free_annotation
# so that colnames(taxa_counts) align with rownames(tax_df).
refframe_out <- setaTransform(
    counts            = taxa_counts,
    method            = "CLR",
    taxonomyDF        = tax_df,
    taxonomy_col      = "Lineage",
    within_resolution = TRUE
)

refframe_out$within_resolution
refframe_out$grouping_var

# Compare to a standard global CLR transform:
global_out <- setaTransform(taxa_counts, method = "CLR")

## ----latent reference frames, fig.width = 10, fig.height = 6------------------
# A) Global CLR
latent_global <- setaLatent(global_out, method = "PCA", dims = 2)
pca_global <- latent_global$latentSpace
pca_global$sample_id <- rownames(pca_global)

# B) Within-Lineage CLR
latent_lineage <- setaLatent(refframe_out, method = "PCA", dims = 2)
pca_lineage <- latent_lineage$latentSpace
pca_lineage$sample_id <- rownames(pca_lineage)

# Merge with metadata
pca_global <- left_join(pca_global, meta_df)
pca_lineage <- left_join(pca_lineage, meta_df)

# Plot side by side
p1 <- ggplot(pca_global, aes(x = PC1, y = PC2, color = age)) +
    geom_text(aes(label = sample_id)) +
    scale_color_manual(
        values = d_palette
    ) +
    labs(title = "Global CLR PCA",
         x = "PC1", y = "PC2", color = "Age (Months)") +
    theme_minimal() +
    xlab(sprintf("PC1 (%s%%)",
                signif(latent_global$varExplained[1], 4) * 100)) +
    ylab(sprintf("PC2 (%s%%)",
                signif(latent_global$varExplained[2], 4) * 100)) +
    theme(legend.position = 'none')

p2 <- ggplot(pca_lineage, aes(x = PC1, y = PC2, color = age)) +
    geom_text(aes(label = sample_id)) +
    scale_color_manual(
        values = d_palette
    ) +
    labs(title = "Within-Lineage CLR PCA",
         x = "PC1", y = "PC2", color = "Age (Months)") +
    theme_minimal() +
    xlab(sprintf("PC1 (%s%%)",
                signif(latent_lineage$varExplained[1], 4) * 100)) +
    ylab(sprintf("PC2 (%s%%)",
                signif(latent_lineage$varExplained[2], 4) * 100))


p1 + p2

## ----libraries used-----------------------------------------------------------
sessionInfo()

