## ----setup, include = FALSE-------------------------------------------------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    crop = NULL ## Related to https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016656.html
)

## ----vignetteSetup, echo=FALSE, message=FALSE, warning = FALSE--------------------------------------------------------
## Bib setup
library("RefManageR")

## Write bibliography information
bib <- c(
    R = citation(),
    BiocStyle = citation("BiocStyle")[1],
    knitr = citation("knitr")[1],
    RefManageR = citation("RefManageR")[1],
    rmarkdown = citation("rmarkdown")[1],
    sessioninfo = citation("sessioninfo")[1],
    testthat = citation("testthat")[1],
    DeconvoBuddies = citation("DeconvoBuddies")[1]
)

## ----"install", eval = FALSE------------------------------------------------------------------------------------------
# if (!requireNamespace("BiocManager", quietly = TRUE)) {
#     install.packages("BiocManager")
# }
# 
# BiocManager::install("DeconvoBuddies")
# 
# ## Check that you have a valid Bioconductor installation
# BiocManager::valid()

## ----"load_packages", message=FALSE, warning=FALSE--------------------------------------------------------------------
## Packages for different types of RNA-seq data structures in R
library("SummarizedExperiment")
library("SingleCellExperiment")
library("Biobase")

## For downloading data
library("spatialLIBD")

## Other helper packages for this vignette
library("dplyr")
library("tidyr")
library("tibble")
library("ggplot2")

## Our main package
library("DeconvoBuddies")

## ----"load rse_gene"--------------------------------------------------------------------------------------------------
## use fetch deconvo data to load rse_gene
rse_gene <- fetch_deconvo_data("rse_gene")
rse_gene
# lobstr::obj_size(rse_gene)
# 41.16 MB

## Use gene "Symbol" as identifiers for the genes in rownames(rse_gene)
rownames(rse_gene) <- rowData(rse_gene)$Symbol

## bulk RNA seq samples were sequenced with different library types,
## and RNA extractions
table(rse_gene$library_type, rse_gene$library_prep)

## ----"load snRNA-seq"-------------------------------------------------------------------------------------------------
## Use spatialLIBD to fetch the snRNA-seq dataset used in this project
sce_path_zip <- fetch_deconvo_data("sce")

## unzip and load the data
sce_path <- unzip(sce_path_zip, exdir = tempdir())
sce <- HDF5Array::loadHDF5SummarizedExperiment(
    file.path(tempdir(), "sce_DLPFC_annotated")
)

# lobstr::obj_size(sce)
# 172.28 MB

## exclude Ambiguous cell type
sce <- sce[, sce$cellType_broad_hc != "Ambiguous"]
sce$cellType_broad_hc <- droplevels(sce$cellType_broad_hc)

## Check the number of genes by the number of nuclei that we
## have to work with:
dim(sce)

## Check the broad cell type distribution
table(sce$cellType_broad_hc)

## We're going to subset to the first 5k genes to save memory
## just for this example. You wouldn't do this on a full analysis.
sce <- sce[seq_len(5000), ]

## ----"access RNAScope proportions"------------------------------------------------------------------------------------
# Access the RNAScope proportion data.frame
data("RNAScope_prop")
head(RNAScope_prop)

## plot the RNAScope compositions
plot_composition_bar(
    prop_long = RNAScope_prop,
    sample_col = "SAMPLE_ID",
    x_col = "SAMPLE_ID",
    add_text = FALSE
) +
    facet_wrap(~Combo, nrow = 2, scales = "free_x")

## ----"Run Mean Ratio"-------------------------------------------------------------------------------------------------
# calculate the Mean Ratio of genes for each cell type
marker_stats <- get_mean_ratio(sce,
    cellType_col = "cellType_broad_hc",
    gene_ensembl = "gene_id",
    gene_name = "gene_name"
)

# check the top gene ranked gene for each cell type
marker_stats |>
    group_by(cellType.target) |>
    slice(1)

## ----"plot marker genes"----------------------------------------------------------------------------------------------
# plot expression across cell type the top 4 Excit marker genes
plot_marker_express(sce,
    stats = marker_stats,
    cell_type = "Excit",
    cellType_col = "cellType_broad_hc",
    rank_col = "MeanRatio.rank",
    anno_col = "MeanRatio.anno",
    gene_col = "gene"
)

## ----"marker_list"----------------------------------------------------------------------------------------------------
# select top 25 marker genes for each cell type, that are also in rse_gene
marker_genes <- marker_stats |>
    filter(MeanRatio.rank <= 25 & gene %in% rownames(rse_gene))

# check how many genes for each cell type (some genes are not in both datasets)
marker_genes |> count(cellType.target)

# create a vector of marker genes to subset data before deconvolution
marker_genes <- marker_genes |> pull(gene)

## ----"prep data as ExpressionSet"-------------------------------------------------------------------------------------
# NOT RUN - no longer running Bisque - see next section for details

if(FALSE){
## convert bulk data to Expression set, sub-setting to marker genes
## include sample ID
exp_set_bulk <- Biobase::ExpressionSet(
    assayData = assays(rse_gene[marker_genes, ])$counts,
    phenoData = AnnotatedDataFrame(
        as.data.frame(colData(rse_gene))[c("SAMPLE_ID")]
    )
)

## convert snRNA-seq data to Expression set, sub-setting to marker genes
## include cell type and donor information
exp_set_sce <- Biobase::ExpressionSet(
    assayData = as.matrix(assays(sce[marker_genes, ])$counts),
    phenoData = AnnotatedDataFrame(
        as.data.frame(colData(sce))[, c("cellType_broad_hc", "BrNum")]
    )
)

## check for nuclei with 0 marker expression
zero_cell_filter <- colSums(exprs(exp_set_sce)) != 0
message("Exclude ", sum(!zero_cell_filter), " cells")

exp_set_sce <- exp_set_sce[, zero_cell_filter]
}


## ----"run Bisque"-----------------------------------------------------------------------------------------------------
## NOT RUN - Bisque unavailable on CRAN

## Run Bisque with bulk and single cell ExpressionSet inputs
## For running deconvolution

# But the development version can be installed from github via:
# devtools::install_github("cozygene/bisque")

if(FALSE){
# library("BisqueRNA")

est_prop <- ReferenceBasedDecomposition(
    bulk.eset = exp_set_bulk,
    sc.eset = exp_set_sce,
    cell.types = "cellType_broad_hc",
    subject.names = "BrNum",
    use.overlap = FALSE
)

## Examine the output from Bisque, transpose to make it easier to work with
est_prop <- t(est_prop$bulk.props)

}


## ----"deconvo output"-------------------------------------------------------------------------------------------------
## Load the pre-computed estimated proportions
data("est_prop")

## sample x cell type matrix
head(est_prop)

## ----"composition plots"----------------------------------------------------------------------------------------------
## add Phenotype data to proportion estimates
pd <- colData(rse_gene) |>
    as.data.frame() |>
    select(SAMPLE_ID, Sample, library_combo)

## make proportion estimates long so they are ggplot2 friendly
prop_long <- est_prop |>
    as.data.frame() |>
    tibble::rownames_to_column("SAMPLE_ID") |>
    tidyr::pivot_longer(!SAMPLE_ID, names_to = "cell_type", values_to = "prop") |>
    left_join(pd)

## create composition bar plots

## for all library preparations by sample n=110
## Remove the SAMPLE_ID names since they are very long using ggplot2::theme()
plot_composition_bar(
    prop_long = prop_long,
    sample_col = "SAMPLE_ID",
    x_col = "SAMPLE_ID",
    add_text = FALSE
) +
    theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

## Average by brain donor
plot_composition_bar(
    prop_long = prop_long,
    sample_col = "SAMPLE_ID",
    x_col = "Sample",
    add_text = FALSE
)

## Each brain donor has up to 6 unique RNA library type and RNA extraction
## combinations
table(prop_long$Sample) / length(unique(prop_long$cell_type))

## Here are the 6 "SAMPLE_ID" values for brain donor with ID "Br8667_mid"
unique(prop_long$SAMPLE_ID[prop_long$Sample == "Br8667_mid"])

## ----"compare prop"---------------------------------------------------------------------------------------------------
## Combine Oligo and OPC into OligoOPC
prop_long_opc <- prop_long |>
    mutate(cell_type = gsub("Oligo|OPC", "OligoOPC", cell_type)) |>
    group_by(SAMPLE_ID, Sample, library_combo, cell_type) |>
    summarize(prop = sum(prop)) |>
    ungroup()

prop_long_opc |> count(cell_type)

## Join RNAScope/IF and Bisque cell type proportions
prop_compare <- prop_long_opc |>
    inner_join(
        RNAScope_prop |>
            select(Sample, cell_type, prop_RNAScope = prop, prop_sn),
        by = c("Sample", "cell_type")
    )

## ----"proportion scatter"---------------------------------------------------------------------------------------------
## compute correlation with RNAScope/IF proportions
cor(prop_compare$prop, prop_compare$prop_RNAScope)

## Scatter plot with RNAScope/IF proportions
prop_compare |>
    ggplot(aes(x = prop_RNAScope, y = prop, color = cell_type, shape = library_combo)) +
    geom_point() +
    geom_abline()


## correlation with snRNA-seq proportion
cor(prop_compare$prop, prop_compare$prop_sn)

## Scatter plot with RNAScope/IF proportions
prop_compare |>
    ggplot(aes(x = prop_sn, y = prop, color = cell_type, shape = library_combo)) +
    geom_point() +
    geom_abline()

## ----"prep and run hspe"----------------------------------------------------------------------------------------------
if (FALSE) {
    ## Install hspe
    # if (!requireNamespace("hspe", quietly = TRUE)) {
    #     ## Install version 0.1 which is the one listed on the main documentation
    #     ## at https://github.com/gjhunt/hspe/tree/main?tab=readme-ov-file#software
    #     remotes::install_url("https://github.com/gjhunt/hspe/raw/main/hspe_0.1.tar.gz")
    #     ## Alternatively, install from the latest version on GitHub with:
    #     # remotes::install_github("gjhunt/hspe", subdir = "lib_hspe")
    #
    #     ## As of 2024-08-23, it's been 3 years since files were last modified
    #     ## at https://github.com/gjhunt/hspe/tree/main/lib_hspe.
    # }
    # library("hspe")

    # pseudobulk the sce data by sample + cell type
    sce_pb <- spatialLIBD::registration_pseudobulk(sce,
        var_registration = "cellType_broad_hc",
        var_sample_id = "Sample"
    )


    ## extract the gene expression from the bulk rse_gene
    mixture_samples <- t(assays(rse_gene)$logcounts)
    mixture_samples[1:5, 1:5]

    ## create a vector of indexes of the different cell types
    pure_samples <- rafalib::splitit(sce_pb$cellType_broad_hc)

    ## extract the the pseudobulked logcounts
    reference_samples <- t(assays(sce_pb)$logcounts)
    reference_samples[1:5, 1:5]

    ## check the number of genes match in the bulk (mixture) and single cell (reference)
    ncol(mixture_samples) == ncol(reference_samples)

    ## run hspe
    est_prop_hspe <- hspe(
        Y = mixture_samples,
        reference = reference_samples,
        pure_samples = pure_samples,
        markers = marker_genes,
        seed = 10524
    )
}

## ----reproduce3, echo=FALSE-------------------------------------------------------------------------------------------
## Session info
library("sessioninfo")
options(width = 120)
session_info()

## ----vignetteBiblio, results = "asis", echo = FALSE, warning = FALSE, message = FALSE---------------------------------
## Print bibliography
PrintBibliography(bib, .opts = list(hyperlink = "to.doc", style = "html"))