## ----setup, include = FALSE---------------------------------------------------

knitr::opts_chunk$set(
  fig.path = "figures/",        
  dev = "png",                  
  dpi = 300,                     
  fig.width = 7,               
  fig.height = 4,               
  fig.align = "center",          
  out.width = "70%",           
  message = FALSE,              
  warning = FALSE               
)


## ----installation, eval=FALSE-------------------------------------------------
# 
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("DspikeIn")
# 
# library(DspikeIn)

## ----eval=TRUE----------------------------------------------------------------

library(SummarizedExperiment)
library(DspikeIn)

# DspikeIn requirements
# --------------------
# DspikeIn operates on a taxonomy table with exactly seven ranks:
# Kingdom  Phylum  Class  Order  Family  Genus  Species

expected <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
actual <- colnames(as.data.frame(SummarizedExperiment::rowData(tse)))

if (!identical(expected, actual)) {
  stop(
    paste0(
      "DspikeIn requires exactly 7 taxonomic ranks in this order:\n  ",
      paste(expected, collapse = "  "),
      "\n\nYour taxonomy columns are:\n  ",
      paste(actual, collapse = ", "),
      "\n\nPlease remove or rename extra ranks (e.g., 'Strain', 'Subfamily') ",
      "to match the required format."
    )
  )
}


## ----eval=TRUE----------------------------------------------------------------

library(phyloseq)
# Get path to external data folder
extdata_path <- system.file("extdata", package = "DspikeIn")
list.files(extdata_path)
# and
data("physeq_16SOTU", package = "DspikeIn")
length(taxa_names(physeq_16SOTU))
new_ids <- paste0("OTU", seq_along(taxa_names(physeq_16SOTU)))
taxa_names(physeq_16SOTU) <- new_ids

# Verify
head(taxa_names(physeq_16SOTU))
# [1] "OTU1" "OTU2" "OTU3" "OTU4" "OTU5" "OTU6"


tse_16SOTU <- convert_phyloseq_to_tse(physeq_16SOTU)
tse_16SOTU <- tidy_phyloseq_tse(tse_16SOTU)


## -----------------------------------------------------------------------------

# Use the Neighbor-Joining method  based on a Jukes-Cantor distance matrix and plot the tree with bootstrap values.
# we compare the Sanger read of Tetragenococcus halophilus with the FASTA sequence of Tetragenococcus halophilus from our phyloseq object.

library(Biostrings)
library(TreeSummarizedExperiment)
library(SummarizedExperiment)
library(DspikeIn)


# Filter TSE object to keep only Bacteria and Archaea
tse_16SOTU <- tse_16SOTU[
  rowData(tse_16SOTU)$Kingdom %in% c("Bacteria", "Archaea"),]


library(Biostrings)
# 
# # Subset the TSE object to include only Tetragenococcus
# tetragenococcus_tse <- tse_16SOTU[
#   rowData(tse_16SOTU)$Genus == "Tetragenococcus" &
#     !is.na(rownames(tse_16SOTU)) &
#     rownames(tse_16SOTU) != "", ]
# 
# ref_sequences <- referenceSeq(tetragenococcus_tse)
# 
# # Convert to DNAStringSet if needed
# ref_sequences <- Biostrings::DNAStringSet(ref_sequences)
# Biostrings::writeXStringSet(ref_sequences, filepath = "Sample.fasta")

ref_fasta <- system.file("extdata", "Ref.fasta", package = "DspikeIn")
sample_fasta <- system.file("extdata", "Sample.fasta", package = "DspikeIn")


result <- validate_spikein_clade(
  reference_fasta = ref_fasta,
  sample_fasta = sample_fasta,
  bootstrap = 200,
  output_prefix = NULL)

result$tree_plot

## ----fig.align='center', fig.width=7, fig.height=5, out.width='50%', dpi=120----

library(ggstar)
library(ggplot2)

# Filter your object to only include spike-in taxa beforehand:
# We changed the OTU IDs for easy detection
# Big stars = detected in many samples
# Small stars = rarely detected
# log10(Mean Abundance) Bars= Color intensity reflects mean abundance.
# The log-transformed average abundance of each OTU across all samples where it appears.
# Extreme blue may signal unintended over-representation.
# Metadata Ring = factor of your interest e.g. Animal.type
# Each OTU is colored by where it was observed.
# Branch length numbers=	Actual evolutionary distances (small = very similar)


library(DspikeIn)
library(TreeSummarizedExperiment)

# ---- 1. Subset taxa where Genus is Tetragenococcus ----
spikein_tse <- tse_16SOTU[
  rowData(tse_16SOTU)$Genus == "Tetragenococcus", ]


# ---- 2. Diagnostic plot (tree-based) ----
ps <- plot_spikein_tree_diagnostic(
  obj = spikein_tse,
  metadata_var = "Animal.type",
  save_plot = FALSE )

print(ps)

## -----------------------------------------------------------------------------
#                      PREREQUISITE FOR 16S & CALCULATE SPIKED %

# Define spiked species and related parameters**

library(flextable)
library(DspikeIn)
library(TreeSummarizedExperiment)
library(SummarizedExperiment)
library(dplyr)

# Define spike-in parameters
spiked_cells <- 1847
species_name <- spiked_species <- c("Tetragenococcus_halophilus", "Tetragenococcus_sp.")

merged_spiked_species <- "Tetragenococcus"

# If you prefer Genus-level matching
# species_name <- spiked_species <- c("Tetragenococcus")
# merged_spiked_species <- "Tetragenococcus"


# --------------------------------------------
# Subset taxa for the spike-in species
tse_16SOTU_spiked_taxa <- tse_16SOTU[rowData(tse_16SOTU)$Species %in% species_name, ]

# Subset samples based on spike-in volume (e.g., "1" or "2")
tse_16SOTU_spiked_samples <- tse_16SOTU[, colData(tse_16SOTU)$spiked.volume %in% c("2", "1") ]

# --------------------------------------------
# Merge spiked OTUs using the DspikeIn function
# merges monophyletic ASVs/OTUs
# The function Pre_processing_species() merges ASVs/OTUs of an spiked species using "sum" or "max" methods, 
# preserving taxonomic, phylogenetic, and sequencing data.

output_rds <- file.path(tempdir(), "merged_tse_sum.rds")

Spiked_16S_sum_scaled <- Pre_processing_species(
  tse_16SOTU_spiked_samples,
  species_name,
  merge_method = "sum",
  output_file = output_rds)


## -----------------------------------------------------------------------------
# * Calculate the percentage of spiked species retrieval per sample*

library(mia)         
library(dplyr)
library(SummarizedExperiment)

# Subset TSE to remove blanks 
Spiked_16S_sum_scaled_filtered <- Spiked_16S_sum_scaled[, colData(Spiked_16S_sum_scaled)$sample.or.blank != "blank"]

# Calculate spike-in retrieval percentage
Perc <- calculate_spike_percentage(
  Spiked_16S_sum_scaled_filtered,
  merged_spiked_species,
 #output_file = "output_tse.docx",
  passed_range = c(0.1, 20)
)

head(Perc)


## ----fig.align='center', fig.width=7, fig.height=5, out.width='50%', dpi=120----

# The acceptable range of spiked species retrieval is system-dependent
# Spiked species become centroid of the community (Distance to Centroid)
# Spiked species become dominant and imbalance the community (Evenness)

# What range of spiked species retrieval is appropriate for your system?
# Calculate Pielou's Evenness using Shannon index and species richness (Observed)
# Hill number q = 1 = exp(Shannon index), representing the effective number of equally abundant species. Hill is weighted by relative abundance, so dominant species have more influence.
# Unlike Pielou's evenness, this metric is not normalized by richness and it shows Effective number of common species

library(TreeSummarizedExperiment)
library(SummarizedExperiment)
library(dplyr)
library(tibble)
library(microbiome)
library(mia)
library(vegan)
library(S4Vectors)
library(ggplot2)


# --- 1. Extract current metadata from TSE ---
metadata <- colData(Spiked_16S_sum_scaled) %>%
  as.data.frame() %>%
  rownames_to_column("Sample")

# --- 2. Add spike-in reads (Perc) ---
# Ensure Perc has 'Sample' column and matching format
metadata <- dplyr::left_join(metadata, Perc, by = "Sample")

# --- 3. Estimate alpha diversity indices and extract ---
Spiked_16S_sum_scaled <- mia::addAlpha(
  Spiked_16S_sum_scaled,
  index = c("observed", "shannon", "pielou", "hill")
)

alpha_df <- colData(Spiked_16S_sum_scaled) %>%
  as.data.frame() %>%
  rownames_to_column("Sample") %>%
  select(Sample, observed, shannon, pielou, hill)

metadata <- dplyr::left_join(metadata, alpha_df, by = "Sample")

# --- 4. Compute Bray-Curtis distance to centroid ---
otu_mat <- assay(Spiked_16S_sum_scaled, "counts")
otu_mat <- t(otu_mat)  # samples as rows
otu_mat_rel <- vegan::decostand(otu_mat, method = "total")

centroid_profile <- colMeans(otu_mat_rel)

dist_to_centroid <- apply(otu_mat_rel, 1, function(x) {
  vegan::vegdist(rbind(x, centroid_profile), method = "bray")[1]
})

# Match and assign distances
metadata$Dist_to_Centroid <- dist_to_centroid[metadata$Sample]

# --- 5. Assign updated metadata to TSE ---
metadata <- column_to_rownames(metadata, var = "Sample")
metadata <- metadata[colnames(Spiked_16S_sum_scaled), , drop = FALSE]  # ensure correct order and size

colData(Spiked_16S_sum_scaled) <- S4Vectors::DataFrame(metadata)


# 4. Regression Plots: Diversity vs. Spike-in Reads
# ============================================================================

# Pielous Evenness
plot_object_pielou <- regression_plot(
  data = metadata,
  x_var = "pielou",
  y_var = "Spiked_Reads",
  custom_range = c(0.1, 20, 30, 40, 50, 60, 100),
  plot_title = "Pielous Evenness vs. Spike-in Reads"
)

# Hill Number (q = 1)
plot_object_hill <- regression_plot(
  data = metadata,
  x_var = "hill",
  y_var = "Spiked_Reads",
  custom_range = c(0.1, 10, 20, 30, 100),
  plot_title = "Hill Number (q = 1) vs. Spike-in Reads"
)

plot_object_hill


# Distance to Centroid
plot_object_centroid <- regression_plot(
  data = metadata,
  x_var = "Dist_to_Centroid",
  y_var = "Spiked_Reads",
  custom_range = c(0.1, 20, 30, 40, 50, 60, 100),
  plot_title = "Distance to Global Centroid vs. Spike-in Reads"
)


# Interpretation
# ============================================================================
# - Pielou's evenness is normalized by richness; useful for detecting imbalance.
# - Hill number q = 1 gives effective number of common species; sensitive to dominance.
# - Distance to centroid in full Bray-Curtis space shows deviation from the average community.


## ----eval=FALSE---------------------------------------------------------------
# 
# #                      PREREQUISITE FOR ITS & CALCULATE SPIKED %
# 
# 
# # Define spiked species and related parameters**
# 
# 
# # Define the spiked species
# # spiked_cells <- 733
# # species_name <- spiked_species <- merged_spiked_species <- "Dekkera_bruxellensis"
# 
# # Subset taxa for spiked species
# # Dekkera <- phyloseq::subset_taxa(
# # physeq_ITSOTU,
# # Species %in% species_name)
# 
# # hashcodes <- row.names(phyloseq::tax_table(Dekkera))
# 
# # Subset samples based on spiked volume
# # physeq_ITSOTU_spiked <- phyloseq::subset_samples(physeq_ITSOTU, spiked.volume %in% c("2", "1"))
# 
# # if TSE format
# # tse_ITSOTU <- convert_phyloseq_to_tse(physeq_ITSOTU)
# # physeq_ITSOTU_spiked <- tse_ITSOTU[, tse_ITSOTU$spiked.volume %in% c("2", "1")]

## -----------------------------------------------------------------------------

#                      CALCULATE SCALING FACTORS

result <- calculate_spikeIn_factors(
  Spiked_16S_sum_scaled_filtered,
  spiked_cells = spiked_cells,
  merged_spiked_species = species_name)

# View extracted outputs
result$spiked_species_reads # Merged spiked species name
result$total_reads # Total reads detected for the spike
scaling_factors <- result$scaling_factors
head(scaling_factors) # Vector of scaling factors per sample

## -----------------------------------------------------------------------------


#             Convert relative counts to absolute counts


# absolute counts=relative countssample-specific scaling factor

# Convert to absolute counts
library(DspikeIn)
absolute <- convert_to_absolute_counts(Spiked_16S_sum_scaled_filtered, scaling_factors)

# Extract processed data
absolute_counts <- absolute$absolute_counts
tse_absolute <- absolute$obj_adj

tse_absolute <- tidy_phyloseq_tse(tse_absolute)

# View absolute count data
head(tse_absolute)

## -----------------------------------------------------------------------------


#                      CALCULATE SPIKE PERCENTAGE & summary stat

# ** Calculate spike percentage & Generate summary statistics for absolute counts**
# absolute_counts is a data.frame of OTU/ASV table
# Generate summary statistics for absolute counts
post_eval_summary <- calculate_summary_stats_table(absolute_counts)

# You may want to Back normal to check calculation accuracy
# the scaling factor was computed based on spiked species reads and fixed cell count.
# Multiplying the spiked species read count by this scaling factor restores the exact spiked cell count.
# lets check it
# BackNormal <- calculate_spike_percentage(
#  tse_absolute,
#  merged_spiked_species,
#  passed_range = c(0.1, 20))


## -----------------------------------------------------------------------------
# * Calculate the percentage of spiked species retrieval per sample*

library(mia)         
library(dplyr)
library(SummarizedExperiment)

# Subset TSE to remove blanks 
tse_absolute_filtered <- tse_absolute[, colData(tse_absolute)$sample.or.blank != "blank"]

# Generate conclusion report
conc <- conclusion(
  tse_absolute_filtered,
  merged_spiked_species,
  max_passed_range = 20,
  output_path = output_path)

conc$full_report

# Filter to keep only the samples that passed
passed_samples <- Perc$Sample[Perc$Result == "passed"]

# Subset the TSE object to include only passed samples
tse_passed <- tse_absolute_filtered[, colnames(tse_absolute_filtered) %in% passed_samples]
dim(tse_passed)


## ----fig.align='center', fig.width=7, fig.height=5, out.width='70%', dpi=120----


pps_Abs <- DspikeIn::get_long_format_data(tse_passed)

# calculation for relative abundance needs sum of total reads
# total_reads <- sum(pps_Abs$Abundance)

# Generate an alluvial plot

alluvial_plot_abs <- alluvial_plot(
  data = pps_Abs,
  axes = c("Animal.type", "Ecoregion.III","Diet", "Animal.ecomode"),
  abundance_threshold = 5000,
  fill_variable = "Class",
  silent = TRUE,
  abundance_type = "absolute",
  top_taxa = 10,
  text_size = 3,
  legend_ncol = 1,
  custom_colors = DspikeIn::color_palette$MG_Awesome  # Use the color palette from DspikeIn
)


## -----------------------------------------------------------------------------
# you may need to normalize/transform your data to reduce biases

ps <- tse_passed

# TC Normalization
result_TC <- normalization_set(ps, method = "TC", groups = "Host.species")
normalized_ps_TC <- result_TC$dat.normed
scaling_factors_TC <- result_TC$scaling.factor

# UQ Normalization
# result_UQ <- normalization_set(ps, method = "UQ", groups = "Host.species")
# normalized_ps_UQ <- result_UQ$dat.normed
# scaling_factors_UQ <- result_UQ$scaling.factor
# 
# # Median Normalization
# result_med <- normalization_set(ps, method = "med", groups = "Host.species")
# normalized_ps_med <- result_med$dat.normed
# scaling_factors_med <- result_med$scaling.factor

# DESeq Normalization
# ps_n <- remove_zero_negative_count_samples(ps)
# result_DESeq <- normalization_set(ps_n, method = "DESeq", groups = "Animal.type")
# normalized_ps_DESeq <- result_DESeq$dat.normed
# scaling_factors_DESeq <- result_DESeq$scaling.factor

# Poisson Normalization
# result_Poisson <- normalization_set(ps, method = "Poisson", groups = "Host.genus")
# normalized_ps_Poisson <- result_Poisson$dat.normed
# scaling_factors_Poisson <- result_Poisson$scaling.factor

# Quantile Normalization
# result_QN <- normalization_set(ps, method = "QN")
# normalized_ps_QN <- result_QN$dat.normed
# scaling_factors_QN <- result_QN$scaling.factor

# TMM Normalization
# result_TMM <- normalization_set(ps, method = "TMM", groups = "Animal.type")
# normalized_ps_TMM <- result_TMM$dat.normed
# scaling_factors_TMM <- result_TMM$scaling.factor

# CLR Normalization
# result_clr <- normalization_set(ps, method = "clr")
# normalized_ps_clr <- result_clr$dat.normed
# scaling_factors_clr <- result_clr$scaling.factor

# Rarefying
# result_rar <- normalization_set(ps, method = "rar")
# normalized_ps_rar <- result_rar$dat.normed
# scaling_factors_rar <- result_rar$scaling.factor

# CSS Normalization
# result_css <- normalization_set(ps, method = "css")
# normalized_ps_css <- result_css$dat.normed
# scaling_factors_css <- result_css$scaling.factor
# 
# TSS Normalization
# result_tss <- normalization_set(ps, method = "tss")
# normalized_ps_tss <- result_tss$dat.normed
# scaling_factors_tss <- result_tss$scaling.factor

# RLE Normalization
# result_rle <- normalization_set(ps, method = "rle")
# normalized_ps_rle <- result_rle$dat.normed
# scaling_factors_rle <- result_rle$scaling.factor


## ----fig.align='center', fig.width=7, fig.height=5, out.width='50%', dpi=120----

normalized_tse_TC<-convert_phyloseq_to_tse(normalized_ps_TC)

rf_tse <- RandomForest_selected(
  normalized_ps_TC,
  response_var = "Host.genus",
  na_vars = c("Habitat", "Ecoregion.III", "Host.genus", "Diet")
)

ridge_tse<- ridge_plot_it(rf_tse, taxrank = "Family", top_n = 10) + scale_fill_manual(values = DspikeIn::color_palette$cool_MG)

ridge_tse

## ----fig.align='center', fig.width=11, fig.height=7, out.width='60%', dpi=120----
library(mia)
library(dplyr)
library(SummarizedExperiment)

# ---Remove unwanted taxa by Genus, Family, or Order ---
tse_filtered <- tse_absolute[
  rowData(tse_absolute)$Genus != "Tetragenococcus" &
  rowData(tse_absolute)$Family != "Chloroplast" &
  rowData(tse_absolute)$Order != "Chloroplast", ]

tse_caudate <- tse_filtered[, colData(tse_filtered)$Clade.Order == "Caudate"]
genera_keep <- c("Desmognathus", "Plethodon", "Eurycea")
tse_three_genera <- tse_caudate[, colData(tse_caudate)$Host.genus %in% genera_keep]
tse_blue_ridge <- tse_three_genera[, colData(tse_three_genera)$Ecoregion.III == "Blue Ridge"]
tse_desmog <- tse_blue_ridge[, colData(tse_blue_ridge)$Host.genus == "Desmognathus"]

# --- Differential Abundance with DESeq2 ---
results_DESeq2 <- perform_and_visualize_DA(
  obj = tse_desmog,
  method = "DESeq2",
  group_var = "Host.taxon",
  contrast = c("Desmognathus monticola", "Desmognathus imitator"),
  output_csv_path = "DA_DESeq2.csv",
  target_glom = "Genus",
  significance_level = 0.01)


results_DESeq2$results
results_DESeq2$obj_significant
# Optional: Visualization
 results_DESeq2$bar_plot


## ----eval=FALSE---------------------------------------------------------------
# 
# 
# # DspikeIn: Differential Abundance Examples
# # Using perform_and_visualize_DA() with multiple contrast scenarios
# 
# # 1. Single Contrast
# 
# res_single <- perform_and_visualize_DA(
#   obj = tse_absolute,
#   method = "DESeq2",
#   group_var = "Diet",
#   contrast = c("Insectivore", "Carnivore"),
#   target_glom = "Genus")
# 
# 
# 
# # 2. Single Factor  All Pairwise Contrasts
# col_df <- as.data.frame(colData(tse_absolute))
# col_df$Host.taxon <- factor(make.names(as.character(col_df$Host.taxon)))
# colData(tse_absolute)$Host.taxon <- col_df$Host.taxon
# 
# # Get unique DESeq2-compatible factor levels
# host_levels <- levels(col_df$Host.taxon)
# print(host_levels)
# 
# # contrast list
# contrast_named <- list(
#   Host.taxon = combn(host_levels, 2, simplify = FALSE))
# 
# # multiple pairwise contrasts
# res_multi <- perform_and_visualize_DA(
#   obj = tse_absolute,
#   method = "DESeq2",
#   group_var = "Host.taxon",
#   contrast = contrast_named,
#   target_glom = "Genus")
# 
# 
# 
# # 3. Single Factor  Selected Contrasts
# 
# contrast_list <- list(
#   c("Insectivore", "Carnivore"),
#   c("Omnivore", "Herbivore"))
# 
# res_selected <- perform_and_visualize_DA(
#   obj = tse_absolute,
#   method = "DESeq2",
#   group_var = "Diet",
#   contrast = contrast_list,
#   global_fdr = TRUE)
# 
# 
# 
# # 4. Multiple Factors  Selected Contrasts
# 
# contrast_named <- list(
#   Diet = list(
#     c("Insectivore", "Carnivore"),
#     c("Omnivore", "Carnivore") ),
#   Animal.type = list(
#     c("Frog", "Salamander") ))
# 
# res_multi_factor <- perform_and_visualize_DA(
#   obj = tse_absolute,
#   method = "DESeq2",
#   contrast = contrast_named,
#   target_glom = "Genus",
#   significance_level = 0.01,
#   global_fdr = TRUE)
# 
# 
# 
# # 5. Multiple Factors  all pairwise contrasts
# 
# colData(tse_absolute)$Host.taxon <- droplevels(factor(colData(tse_absolute)$Host.taxon))
# colData(tse_absolute)$Habitat <- droplevels(factor(colData(tse_absolute)$Habitat))
# host_levels <- levels(colData(tse_absolute)$Host.taxon)
# habitat_levels <- levels(colData(tse_absolute)$Habitat)
# 
# # Define pairwise contrasts as a named list
# contrast_named <- list(
#   Host.taxon = combn(host_levels, 2, simplify = FALSE),
#   Habitat = combn(habitat_levels, 2, simplify = FALSE))
# 
# # Define ComboGroup variable
# colData(tse_absolute)$ComboGroup <- factor(interaction(
#   colData(tse_absolute)$Animal.type,
#   colData(tse_absolute)$Diet,
#   drop = TRUE))
# 
# 
# 
# # --- Run multi-factor DESeq2 comparisons ---
# results_multi_factor <- perform_and_visualize_DA(
#   obj = tse_absolute,
#   method = "DESeq2",
#   contrast = contrast_named,
#   target_glom = "Genus",
#   significance_level = 0.01)
# 
# # --- Combined Group Interactions ---
# colData(tse_absolute)$ComboGroup <- factor(interaction(
#   colData(tse_absolute)$Animal.type,
#   colData(tse_absolute)$Diet,
#   drop = TRUE))
# 
# # --- Define custom interaction contrasts ---
# contrast_list <- list(
#   c("Salamander.Insectivore", "Lizard.Insectivore"),
#   c("Salamander.Carnivore", "Snake.Carnivore"),
#   c("Salamander.Carnivore", "Frog.Carnivore"))
# 
# # --- Run combined group comparisons ---
# res_combo <- perform_and_visualize_DA(
#   obj = tse_absolute,
#   method = "DESeq2",
#   group_var = "ComboGroup",
#   contrast = contrast_list,
#   target_glom = "Genus",
#   global_fdr = TRUE)
# 

## -----------------------------------------------------------------------------
# --------------------------------------------
#  Extract OTU matrices from TSE objects
# --------------------------------------------
# relative dataset -> tse_desmog_rel
# tse_16SOTU_spiked_samples <- tse_16SOTU[, colData(tse_16SOTU)$spiked.volume %in% c("2", "1") ]

tse_filtered_rel <- tse_16SOTU_spiked_samples[
  rowData(tse_16SOTU_spiked_samples)$Genus != "Tetragenococcus" &
  rowData(tse_16SOTU_spiked_samples)$Family != "Chloroplast" &
  rowData(tse_16SOTU_spiked_samples)$Order != "Chloroplast", ]

tse_caudate_rel <- tse_filtered_rel[, colData(tse_filtered_rel)$Clade.Order == "Caudate"]
genera_keep <- c("Desmognathus", "Plethodon", "Eurycea")
tse_three_genera_rel <- tse_caudate[, colData(tse_caudate_rel)$Host.genus %in% genera_keep]
tse_blue_ridge_rel <- tse_three_genera_rel[, colData(tse_three_genera_rel)$Ecoregion.III == "Blue Ridge"]
tse_desmog_rel <- tse_blue_ridge_rel[, colData(tse_blue_ridge_rel)$Host.genus == "Desmognathus"]

# absolute dataset -> tse_desmog

otu_rel <- assay(tse_desmog_rel, "counts") 
otu_abs <- assay(tse_desmog, "counts")        

# Make sure samples are rows and taxa are columns
otu_rel <- t(otu_rel)
otu_abs <- t(otu_abs)

# Ensure they are matrices
otu_rel <- as.matrix(otu_rel)
otu_abs <- as.matrix(otu_abs)

# --------------------------------------------
# 2. Convert to Presence/Absence 
# --------------------------------------------
otu_rel_pa <- (otu_rel > 0) * 1
otu_abs_pa <- (otu_abs > 0) * 1

# --------------------------------------------
# 3. Identify common samples & taxa
# --------------------------------------------
shared_samples <- intersect(rownames(otu_rel_pa), rownames(otu_abs_pa))
shared_taxa <- intersect(colnames(otu_rel_pa), colnames(otu_abs_pa))

# Subset to common set
otu_rel_pa <- otu_rel_pa[shared_samples, shared_taxa]
otu_abs_pa <- otu_abs_pa[shared_samples, shared_taxa]

# --------------------------------------------
# 4. Compare presence/absence profiles
# --------------------------------------------
shared_pa <- (otu_rel_pa + otu_abs_pa) == 2
total_present <- (otu_rel_pa + otu_abs_pa) >= 1

# Calculate percent agreement (global)
percent_shared <- sum(shared_pa) / sum(total_present)

cat("Percent of shared taxa detections (AA vs RA):",
    round(100 * percent_shared, 1), "%\n")


## ----fig.align='center', fig.width=8, fig.height=5, out.width='50%', dpi=120----

#   Visualization of community composition

# Subset rows where Genus is not Tetragenococcus and not NA
keep_taxa <- !is.na(rowData(tse_absolute)$Genus) & rowData(tse_absolute)$Genus != "Tetragenococcus"
# Subset the TSE by keeping only those taxa (rows)
tse_filtered <- tse_absolute[keep_taxa, ]
# Exclude blanks
tse_filtered <- tse_filtered[, colData(tse_filtered)$sample.or.blank != "blank"]
# subset Salamander samples
tse_sal <- tse_filtered[, colData(tse_filtered)$Animal.type == "Salamander"]
Prok_OTU_sal <- tidy_phyloseq_tse(tse_sal)

TB<-taxa_barplot(
  Prok_OTU_sal,
  target_glom = "Family",
  custom_tax_names = NULL,
  normalize = TRUE,
  treatment_variable = "Habitat",
  abundance_type = "relative",
  x_angle = 15,
  fill_variable = "Family",
  facet_variable = "Diet",
  top_n_taxa = 10,
  palette = DspikeIn::color_palette$cool_MG,
  legend_size = 11,
  legend_columns = 1,
  x_scale = "free",
  xlab = NULL)


## ----fig.align='center', fig.width=11, fig.height=6, out.width='80%', dpi=120----

# 1. Initialization and loading NetWorks for Comparision


# library(SpiecEasi)
# library(ggnet)
# library(igraph)
library(DspikeIn)
library(tidyr)
library(dplyr)
library(ggpubr)
library(igraph)


# herp.Bas is a merged phyloseq object for both bacterial and fungal domains
# spiec.easi(herp.Bas, method='mb', lambda.min.ratio=1e-3, nlambda=250,pulsar.select=TRUE )

Complete <- DspikeIn::load_graphml("Complete.graphml")
NoBasid <- DspikeIn::load_graphml("NoBasid.graphml")
NoHubs <- DspikeIn::load_graphml("NoHubs.graphml")

# degree_network -> please note that needs full path of your graphml
result_kk <- DspikeIn::degree_network(load_graphml("Complete.graphml"),
  save_metrics = TRUE,
  layout_type = "stress")

result_kk$metrics


 result <- DspikeIn::weight_Network(graph_path = "Complete.graphml")
 result$plot


## ----fig.align='center', fig.width=8, fig.height=6, out.width='70%', dpi=120----
# Load required libraries
library(igraph)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggrepel)
ggplot2::margin()

# 2.  Metrics Calculation

completeMetrics <- DspikeIn::node_level_metrics(Complete)
NoHubsMetrics <- DspikeIn::node_level_metrics(NoHubs)
NoBasidMetrics <- DspikeIn::node_level_metrics(NoBasid)


Complete_metrics <- completeMetrics$metrics
Nohub_metrics <- NoHubsMetrics$metrics
Nobasid_metrics <- NoBasidMetrics$metrics

Complete_metrics <- data.frame(lapply(Complete_metrics, as.character), stringsAsFactors = FALSE)
Nohub_metrics <- data.frame(lapply(Nohub_metrics, as.character), stringsAsFactors = FALSE)
Nobasid_metrics <- data.frame(lapply(Nobasid_metrics, as.character), stringsAsFactors = FALSE)


print(igraph::vcount(Complete)) # Number of nodes in Complete network
print(igraph::ecount(Complete)) # Number of edges in Complete network

print(igraph::vcount(NoBasid))
print(igraph::ecount(NoBasid))

print(igraph::vcount(NoHubs))
print(igraph::ecount(NoHubs))


metrics_scaled <- bind_rows(
  Complete_metrics %>% mutate(Network = "Complete"),
  Nohub_metrics %>% mutate(Network = "NoHubs"),
  Nobasid_metrics %>% mutate(Network = "NoBasid")
) %>%
  dplyr::mutate(dplyr::across(where(is.numeric), scale))

metrics_long_scaled <- metrics_scaled %>%
  tidyr::pivot_longer(cols = -c(Node, Network), names_to = "Metric", values_to = "Value")


# Ensure each dataset has a "Network" column before combining
completeMetrics$metrics <- completeMetrics$metrics %>% mutate(Network = "Complete Network")
NoHubsMetrics$metrics <- NoHubsMetrics$metrics %>% mutate(Network = "Network & Module Hubs Removed")
NoBasidMetrics$metrics <- NoBasidMetrics$metrics %>% mutate(Network = "Basidiobolus Subnetwork Removed")

# Combine All Data
combined_data <- bind_rows(
  completeMetrics$metrics,
  NoHubsMetrics$metrics,
  NoBasidMetrics$metrics
)


# Add Node Identifier if missing
if (!"Node" %in% colnames(combined_data)) {
  combined_data <- combined_data %>% mutate(Node = rownames(.))
}

# Convert `Network` into Factor
combined_data$Network <- factor(combined_data$Network, levels = c(
  "Complete Network",
  "Network & Module Hubs Removed",
  "Basidiobolus Subnetwork Removed"))

# Convert Data to Long Format
metrics_long <- combined_data %>%
  pivot_longer(
    cols = c("Redundancy", "Efficiency", "Betweenness"),
    names_to = "Metric", values_to = "Value")


# Define Custom Colors and Shapes
  network_colors <- c(
  "Complete Network" = "#F1E0C5",
  "Network & Module Hubs Removed" = "#D2A5A1",
  "Basidiobolus Subnetwork Removed" = "#B2C3A8")
  
network_shapes <- c(
  "Complete Network" = 21,
  "Network & Module Hubs Removed" = 22,
  "Basidiobolus Subnetwork Removed" = 23)

# Determine Top 30% of Nodes to Label/Optional
metrics_long <- metrics_long %>%
  group_by(Network, Metric) %>%
  mutate(Label = ifelse(rank(-Value, ties.method = "random") / n() <= 0.3, Node, NA))

# ?quadrant_plot() can creat plot for indivisual network
 plot4 <- quadrant_plot(completeMetrics$metrics, x_metric = "Among_Module_Connectivity", y_metric = "Degree")

 
# Customized comparision Plots
create_metric_plot <- function(metric_name, data, title) {
  data_filtered <- data %>% filter(Metric == metric_name)
  median_degree <- median(data_filtered$Degree, na.rm = TRUE)
  median_value <- median(data_filtered$Value, na.rm = TRUE)

  ggplot(data_filtered, aes(x = Degree, y = Value, fill = Network)) +
    geom_point(aes(shape = Network), size = 3, stroke = 1, color = "black") +
    geom_text_repel(aes(label = Label), size = 3, max.overlaps = 50) +
    scale_fill_manual(values = network_colors) +
    scale_shape_manual(values = network_shapes) +
    geom_vline(xintercept = median_degree, linetype = "dashed", color = "black", size = 1) +
    geom_hline(yintercept = median_value, linetype = "dashed", color = "black", size = 1) +
    labs(
      title = title,
      x = "Degree",
      y = metric_name,
      fill = "Network",
      shape = "Network" ) +
    theme_minimal() +
    theme(
      plot.title = element_text(
        hjust = 0.5, size = 16, face = "bold",
        margin = margin(t = 10, b = 20) # Moves the title downward
      ),
      axis.title = element_text(size = 14, face = "bold"),
      legend.position = "top",
      legend.title = element_text(size = 14, face = "bold"),
      legend.text = element_text(size = 12)  )
}


# Generate Plots
plot_redundancy <- create_metric_plot("Redundancy", metrics_long, "Redundancy vs. Degree Across Networks")
plot_efficiency <- create_metric_plot("Efficiency", metrics_long, "Efficiency vs. Degree Across Networks")
plot_betweenness <- create_metric_plot("Betweenness", metrics_long, "Betweenness vs. Degree Across Networks")

# Print Plots
 print(plot_betweenness)

## ----fig.align='center', fig.width=10, fig.height=8, out.width='70%', dpi=120----

# 2.  Metrics Calculation

result_Complete <- DspikeIn::node_level_metrics(Complete)
result_NoHubs <- DspikeIn::node_level_metrics(NoHubs)
result_NoBasid <- DspikeIn::node_level_metrics(NoBasid)

Complete_metrics <- result_Complete$metrics
Nohub_metrics <- result_NoHubs$metrics
Nobasid_metrics <- result_NoBasid$metrics

Complete_metrics <- data.frame(lapply(Complete_metrics, as.character), stringsAsFactors = FALSE)
Nohub_metrics <- data.frame(lapply(Nohub_metrics, as.character), stringsAsFactors = FALSE)
Nobasid_metrics <- data.frame(lapply(Nobasid_metrics, as.character), stringsAsFactors = FALSE)


print(igraph::vcount(Complete)) # Number of nodes in Complete network
print(igraph::ecount(Complete)) # Number of edges in Complete network

print(igraph::vcount(NoBasid))
print(igraph::ecount(NoBasid))

print(igraph::vcount(NoHubs))
print(igraph::ecount(NoHubs))


metrics_scaled <- bind_rows(
  Complete_metrics %>% mutate(Network = "Complete"),
  Nohub_metrics %>% mutate(Network = "NoHubs"),
  Nobasid_metrics %>% mutate(Network = "NoBasid")
) %>%
  dplyr::mutate(dplyr::across(where(is.numeric), scale))

metrics_long_scaled <- metrics_scaled %>%
  tidyr::pivot_longer(cols = -c(Node, Network), names_to = "Metric", values_to = "Value")


# Remove missing values
metrics_long_scaled <- na.omit(metrics_long_scaled)

# We visualize only six metrics
selected_metrics <- c(
  "Degree", "Closeness", "Betweenness",
  "EigenvectorCentrality", "PageRank", "Transitivity"
)

metrics_long_filtered <- metrics_long_scaled %>%
  filter(Metric %in% selected_metrics) %>%
  mutate(
    Value = as.numeric(as.character(Value)),
    Network = recode(Network,
      "Complete" = "Complete Network",
      "NoHubs" = "Network & Module Hubs Removed",
      "NoBasid" = "Basidiobolus Subnetwork Removed" ) ) %>%
  na.omit() # Remove any NA if any

metrics_long_filtered$Network <- factor(metrics_long_filtered$Network,
  levels = c(
    "Complete Network",
    "Network & Module Hubs Removed",
    "Basidiobolus Subnetwork Removed" ))

# DspikeIn::color_palette$light_MG
network_colors <- c(
  "Complete Network" = "#F1E0C5",
  "Network & Module Hubs Removed" = "#D2A5A1",
  "Basidiobolus Subnetwork Removed" = "#B2C3A8")

# statistical comparisons a vs b
comparisons <- list(
  c("Complete Network", "Network & Module Hubs Removed"),
  c("Complete Network", "Basidiobolus Subnetwork Removed"),
  c("Network & Module Hubs Removed", "Basidiobolus Subnetwork Removed"))

networks_in_data <- unique(metrics_long_filtered$Network)
comparisons <- comparisons[sapply(comparisons, function(pair) all(pair %in% networks_in_data))]

met <- ggplot(metrics_long_filtered, aes(x = Network, y = Value, fill = Network)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(aes(color = Network),
    position = position_jitter(0.2), alpha = 0.2, size = 1.5 ) +
  scale_fill_manual(values = network_colors) +
  scale_color_manual(values = network_colors) +
  facet_wrap(~Metric, scales = "free_y", labeller = label_wrap_gen(width = 20)) +
  ggpubr::stat_compare_means(method = "wilcox.test", label = "p.signif", comparisons = comparisons) +
  theme_minimal() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_text(size = 10, angle = 10, hjust = 0.8),
    strip.text = element_text(size = 12, margin = margin(t = 15, b = 5)),
    legend.position = "top",
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 13, face = "bold"),
    plot.title = element_text(size = 13, face = "bold")
  ) + labs(title = "Selected Node Metrics Across Networks", fill = "Network Type", color = "Network Type")
met

## -----------------------------------------------------------------------------
Complete <- load_graphml("Complete.graphml")
result2 <- DspikeIn::extract_neighbors(
  graph = Complete,
  target_node = "OTU69:Basidiobolus_sp", mode = "all")
head(result2$summary)

## ----fig.align='center', fig.width=8, fig.height=5, out.width='60%', dpi=120----

# Options: `"stress"` (default), `"graphopt"`, `"fr"`

# Compute degree metrics 
result <- degree_network(graph_path = Complete, save_metrics = TRUE)

# Compute network weights for different graph structures
NH <- weight_Network(graph_path = "NoHubs.graphml")
NB <- weight_Network(graph_path = "NoBasid.graphml")
C  <- weight_Network(graph_path = "Complete.graphml")

# Extract metrics from the computed network weights
CompleteM <- C$metrics
NoHubsM   <- NH$metrics
NoBasidM  <- NB$metrics

# Combine metrics into a single dataframe for comparison
df <- bind_rows(
  CompleteM %>% mutate(Group = "Complete Network"),
  NoHubsM %>% mutate(Group = "Network & Module Hubs Removed"),
  NoBasidM %>% mutate(Group = "Basidiobolus Subnetwork Removed")
) %>%
  pivot_longer(cols = -Group, names_to = "Metric", values_to = "Value")

# Aggregate total values by metric and group
df_bar <- df %>%
  group_by(Metric, Group) %>%
  summarise(Total_Value = sum(Value, na.rm = TRUE), .groups = "drop")

# Order metrics by mean value (optional)
metric_order <- df_bar %>%
  group_by(Metric) %>%
  summarise(mean_value = mean(Total_Value)) %>%
  arrange(desc(mean_value)) %>%
  pull(Metric)

df_bar$Metric <- factor(df_bar$Metric, levels = metric_order)

# Define refined color palette
network_colors <- c(
  "Complete Network" = "#F1E0C5",
  "Network & Module Hubs Removed" = "#D2A5A1",
  "Basidiobolus Subnetwork Removed" = "#B2C3A8"
)

# Create elegant bar plot
pg <- ggplot(df_bar, aes(x = Metric, y = log1p(Total_Value), fill = Group)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.8), width = 0.7, alpha = 0.9) +
  scale_fill_manual(values = network_colors) +
  theme_minimal(base_size = 14) +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text.x = element_text(size = 11, angle = 15, hjust = 1, vjust = 1, color = "gray20"),
    axis.text.y = element_text(size = 11, color = "gray25"),
    axis.title = element_text(size = 13, face = "bold"),
    plot.title = element_text(size = 15, face = "bold", hjust = 0.5, margin = ggplot2::margin(b = 10)),
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 12),
    panel.background = element_rect(fill = "white", color = NA),
    plot.background = element_rect(fill = "white", color = NA)
  ) +
  labs(
    subtitle = "Comparison of total network-level metrics (log-scaled)",
    x = NULL,
    y = "Total Value (log1p)"
  ) +
  geom_text(
    aes(label = round(log1p(Total_Value), 1)),
    position = position_dodge(width = 0.8),
    vjust = -0.3,
    size = 3.5,
    color = "gray25"
  )


## ----eval=FALSE---------------------------------------------------------------
# library(igraph)
# library(tidygraph)
# library(ggraph)
# library(DspikeIn)
# 
# 
# # AA abundance
# Complete<-load_graphml("Complete.graphml")
# 
# deg <- degree(Complete)
# 
# hist(deg, breaks = 30, main = "Degree Distribution", xlab = "Degree")
# fit <- fit_power_law(deg + 1)  # Avoid zero degrees
# print(fit)
# 
# 
# # ---- empirical network ----
# g_empirical = Complete
# #g_empirical = Complete_Rel
# 
# 
# 
# # ---- Calculate real network metrics ----
# creal <- transitivity(g_empirical, type = "global")
# E(g_empirical)$weight <- abs(E(g_empirical)$weight)
# lreal <- mean_distance(g_empirical, directed = FALSE, unconnected = TRUE, weights = E(g_empirical)$weight)
# 
# # ---- Generate 1000 Erds-Rnyi random graphs with same size ----
# set.seed(42)  # for reproducibility
# n_nodes <- vcount(g_empirical)
# n_edges <- ecount(g_empirical)
# 
# crand_vals <- numeric(1000)
# lrand_vals <- numeric(1000)
# 
# for (i in 1:1000) {
# g_rand <- sample_gnm(n = n_nodes, m = n_edges, directed = FALSE)
# if (!is_connected(g_rand)) next
# 
#   crand_vals[i] <- transitivity(g_rand, type = "global")
#   lrand_vals[i] <- mean_distance(g_rand, directed = FALSE, unconnected = TRUE)
# }
# 
# # ---- Calculate mean values across random graphs ----
# crand <- mean(crand_vals, na.rm = TRUE)
# lrand <- mean(lrand_vals, na.rm = TRUE)
# 
# # ---- Compute Small-World Index () ----
# sigma <- (creal / crand) / (lreal / lrand)
# 
# 
# cat("Global clustering coefficient (real):", creal, "\n")
# cat("Average path length (real):", lreal, "\n")
# cat("Mean clustering coefficient (random):", crand, "\n")
# cat("Mean path length (random):", lrand, "\n")
# cat("Small-World Index ():", round(sigma, 2), "\n")
# 

## -----------------------------------------------------------------------------
sessionInfo()

## ----cleanup, include = FALSE-------------------------------------------------
# =====================================================================
# FINAL CLEANUP: Remove files created during vignette execution
# =====================================================================

# Note: This cleanup ensures no large or temporary files remain after vignette execution.
# All deletions are conditional (safe) and comply with CRAN/Bioconductor policies.

files_to_delete <- c(
  "Global_Network_Metrics.csv", "absolute_counts.csv", "Summary.csv",
  "post eval summary.csv", "abundance_analysis_filtered_phyloseq.rds",
  "abundance_analysis_filtered_tse.rds",
  "abundance_analysis_high_abundance_phyloseq.rds",
  "abundance_analysis_high_abundance_tse.rds",
  "abundance_analysis_low_abundance_phyloseq.rds",
  "abundance_analysis_low_abundance_tse.rds",
  "adjusted_prevalence.rds", "core_microbiome.csv", "core_microbiome.rds",
  "comp_spikein_patristic_distances.csv", "comp_spikein_branch_lengths.pdf",
  "comp_spikein_patristic_distances_hist.pdf", "merged_data.csv", "merged_data.docx",
  "comp_spikein.nwk", "comp_spikein_tree.png",
  "comp_spikein_patristic_distances_hist.pdf",
  "post_eval_summary.csv", "post_eval_summary.docx",
  "merged_physeq_max_processed.rds", "merged_physeq_sum.rds",
  "merged_physeq_sum_processed.rds", "spikeIn_factors_output",
  "plot_Spike.percentage_Kruskal.pdf", "plot_Spike.percentage_Kruskal.png",
  "plot_Spike.reads_Kruskal.pdf", "plot_Spike.reads_Kruskal.png",
  "plot_Total.reads_Kruskal.pdf", "plot_Total.reads_Kruskal.png",
  "proportion_adjusted_physeq.rds", "proportion_adjusted_tse.rds",
  "randomsubsample_Trimmed_evenDepth.rds", "summary.csv", "summary.docx",
  "DspikeIn-manual.tex", "output.csv", "output.docx",
  "tree_with_bootstrap_and_cophenetic.png"
)

# Safely remove existing files
if (any(file.exists(files_to_delete))) {
  invisible(file.remove(files_to_delete[file.exists(files_to_delete)]))
}

# Safely remove known temporary folders
folders_to_delete <- c("spikeIn_factors_output")
for (folder in folders_to_delete) {
  if (dir.exists(folder)) unlink(folder, recursive = TRUE, force = TRUE)
}