## ----global_options, include=FALSE--------------------------------------------
knitr::opts_chunk$set(
    echo = TRUE, warning = FALSE, message = FALSE, fig.pos = "h",
    collapse = TRUE,
    comment = "#>"
)

## ----setup, include = FALSE---------------------------------------------------
chooseCRANmirror(graphics = FALSE, ind = 1)
options(tinytex.verbose = TRUE)

## ----batch_workflow, include = TRUE, fig.align = 'center', echo=FALSE, fig.cap='proBatch in batch correction workflow', out.width = '50%'----
knitr::include_graphics("Batch_effects_workflow_staircase.png")

## ----install_proBatch, fig.show='hold', eval = FALSE--------------------------
# if (!requireNamespace("BiocManager", quietly = TRUE)) {
#     install.packages("BiocManager")
# }
# BiocManager::install("proBatch")
# 
# library(proBatch)

## ----load_packages------------------------------------------------------------
require(dplyr)
require(tibble)
require(ggplot2)

## ----col_names----------------------------------------------------------------
feature_id_col <- "peptide_group_label"
measure_col <- "Intensity"
sample_id_col <- "FullRunName"
essential_columns <- c(feature_id_col, measure_col, sample_id_col)

## ----tech_bio_cols------------------------------------------------------------
technical_factors <- c("MS_batch", "digestion_batch")
biological_factors <- c("Strain", "Diet", "Sex")
biospecimen_id_col <- "EarTag"

## ----set_batch_col------------------------------------------------------------
batch_col <- "MS_batch"

## ----load_data, fig.show='hold'-----------------------------------------------
library(proBatch)
data(
    "example_proteome", "example_sample_annotation", "example_peptide_annotation",
    package = "proBatch"
)

## ----date_to_order, fig.show='hold'-------------------------------------------
generated_sample_annotation <- date_to_sample_order(
    example_sample_annotation,
    time_column = c("RunDate", "RunTime"),
    new_time_column = "generated_DateTime",
    dateTimeFormat = c("%b_%d", "%H:%M:%S"),
    new_order_col = "generated_order",
    instrument_col = NULL
)
library(knitr)
kable(generated_sample_annotation[1:5, ] %>%
    select(c("RunDate", "RunTime", "order", "generated_DateTime", "generated_order")))

## ----pep_annotation, fig.show='hold'------------------------------------------
generated_peptide_annotation <- create_peptide_annotation(
    example_proteome,
    feature_id_col = "peptide_group_label",
    protein_col = "Protein"
)

## ----reduce_proteome----------------------------------------------------------
example_proteome <- example_proteome %>% select(one_of(essential_columns))
gc()

## ----long_to_matrix-----------------------------------------------------------
example_matrix <- long_to_matrix(
    example_proteome,
    feature_id_col = "peptide_group_label",
    measure_col = "Intensity",
    sample_id_col = "FullRunName"
)

## ----log_transform------------------------------------------------------------
log_transformed_matrix <- log_transform_dm(
    example_matrix,
    log_base = 2, offset = 1
)

## ----color_scheme, fig.show='hold'--------------------------------------------
color_list <- sample_annotation_to_colors(
    example_sample_annotation,
    factor_columns = c(
        "MS_batch", "digestion_batch",
        "EarTag", "Strain",
        "Diet", "Sex"
    ),
    numeric_columns = c("DateTime", "order")
)

## ----plot_mean, fig.show='hold', fig.width=5, fig.height=2--------------------
plot_sample_mean(
    log_transformed_matrix, example_sample_annotation,
    order_col = "order",
    batch_col = batch_col, color_by_batch = TRUE, ylimits = c(12, 16.5),
    color_scheme = color_list[[batch_col]],
    base_size = 10
)

## ----plot_boxplots, fig.show='hold', fig.width=10, fig.height=5---------------
log_transformed_long <- matrix_to_long(log_transformed_matrix)
batch_col <- "MS_batch"
plot_boxplot(
    log_transformed_long, example_sample_annotation,
    batch_col = batch_col, color_scheme = color_list[[batch_col]]
)

## ----median_normalization_log, fig.show='hold'--------------------------------
median_normalized_matrix <- normalize_data_dm(
    log_transformed_matrix,
    normalize_func = "medianCentering"
)

## ----median_normalization_raw, fig.show='hold'--------------------------------
median_normalized_matrix <- normalize_data_dm(
    example_matrix,
    normalize_func = "medianCentering",
    log_base = 2, offset = 1
)

## ----quantile_norm, fig.show='hold'-------------------------------------------
quantile_normalized_matrix <- normalize_data_dm(
    log_transformed_matrix,
    normalize_func = "quantile"
)

## ----plot_mean_normalized, fig.show='hold', fig.width=5, fig.height=2---------
plot_sample_mean(
    quantile_normalized_matrix, example_sample_annotation,
    color_by_batch = TRUE, ylimits = c(12, 16),
    color_scheme = color_list[[batch_col]]
)

## ----plot_hierarchical, fig.show='hold', fig.width=10, fig.height=5-----------
selected_annotations <- c("MS_batch", "digestion_batch", "Diet")

# Plot clustering between samples
plot_hierarchical_clustering(
    quantile_normalized_matrix,
    sample_annotation = example_sample_annotation,
    color_list = color_list,
    factors_to_plot = selected_annotations,
    distance = "euclidean", agglomeration = "complete",
    label_samples = FALSE
)

## ----plot_heatmap, fig.show='hold', fig.width=10, fig.height=11---------------
plot_heatmap_diagnostic(
    quantile_normalized_matrix, example_sample_annotation,
    factors_to_plot = selected_annotations,
    cluster_cols = TRUE,
    color_list = color_list,
    show_rownames = FALSE, show_colnames = FALSE
)

## ----plot_PCA, fig.show='hold', fig.width=14, fig.height=8.5------------------
pca1 <- plot_PCA(
    quantile_normalized_matrix, example_sample_annotation,
    color_by = "MS_batch",
    plot_title = "MS batch", color_scheme = color_list[["MS_batch"]]
)
pca2 <- plot_PCA(
    quantile_normalized_matrix, example_sample_annotation,
    color_by = "digestion_batch",
    plot_title = "Digestion batch", color_scheme = color_list[["digestion_batch"]]
)
pca3 <- plot_PCA(
    quantile_normalized_matrix, example_sample_annotation,
    color_by = "Diet",
    plot_title = "Diet", color_scheme = color_list[["Diet"]]
)
pca4 <- plot_PCA(
    quantile_normalized_matrix, example_sample_annotation,
    color_by = "DateTime",
    plot_title = "DateTime", color_scheme = color_list[["DateTime"]]
)

library(gridExtra)
grid.arrange(pca1, pca2, pca3, pca4, ncol = 2, nrow = 2)

## ----plot_PCA_spec, fig.show='hold', fig.width=5, fig.height=3.5--------------
pca_spec <- plot_PCA(
    quantile_normalized_matrix, example_sample_annotation,
    color_by = "digestion_batch",
    plot_title = "Digestion batch"
)
pca_spec

## ----plot_PVCA, fig.show='hold', fig.width=5, fig.height=5--------------------
plot_PVCA(
    quantile_normalized_matrix, example_sample_annotation,
    technical_factors = technical_factors,
    biological_factors = biological_factors
)

## ----plot_spikeIn, fig.show='hold'--------------------------------------------
quantile_normalized_long <- matrix_to_long(quantile_normalized_matrix)
plot_spike_in(
    quantile_normalized_long, example_sample_annotation,
    peptide_annotation = example_peptide_annotation,
    protein_col = "Gene", spike_ins = "BOVINE_A1ag",
    plot_title = "Spike-in BOVINE protein peptides",
    color_by_batch = TRUE, color_scheme = color_list[[batch_col]],
    base_size = 8
)

## ----loess_fit, fig.show='hold', fig.width=5, fig.height=2.4------------------
loess_fit_df <- adjust_batch_trend_df(quantile_normalized_long, example_sample_annotation)

## ----loess_30, fig.show='hold', fig.width=5, fig.height=2.4-------------------
loess_fit_30 <- adjust_batch_trend_df(
    quantile_normalized_long,
    example_sample_annotation,
    span = 0.3
)

plot_with_fitting_curve(
    feature_name = "10231_QDVDVWLWQQEGSSK_2",
    fit_df = loess_fit_30, fit_value_col = "fit",
    df_long = quantile_normalized_long,
    sample_annotation = example_sample_annotation,
    color_by_batch = TRUE, color_scheme = color_list[[batch_col]],
    plot_title = "Span = 30%",
    base_size = 10 # text size
)

## ----loess_70, fig.show='hold', fig.width=5, fig.height=2.4-------------------
loess_fit_70 <- adjust_batch_trend_df(
    quantile_normalized_long, example_sample_annotation,
    span = 0.7
)
plot_with_fitting_curve(
    feature_name = "10231_QDVDVWLWQQEGSSK_2",
    fit_df = loess_fit_70, fit_value_col = "fit",
    df_long = quantile_normalized_long,
    sample_annotation = example_sample_annotation,
    color_by_batch = TRUE, color_scheme = color_list[[batch_col]],
    plot_title = "Span = 70%",
    base_size = 10 # text size
)

## ----median_batch, fig.show='hold', fig.width=3, fig.height=2.4---------------
peptide_median_df <- center_feature_batch_medians_df(
    loess_fit_df, example_sample_annotation
)
plot_single_feature(
    feature_name = "10231_QDVDVWLWQQEGSSK_2", df_long = peptide_median_df,
    sample_annotation = example_sample_annotation, measure_col = "Intensity",
    plot_title = "Feature-level Median Centered",
    base_size = 10 # text size
)

## ----comBat, fig.show='hold'--------------------------------------------------
comBat_df <- correct_with_ComBat_df(loess_fit_df, example_sample_annotation)

## ----combat_result, fig.show='hold',  fig.width=3, fig.height=2.4-------------
plot_single_feature(
    feature_name = "10231_QDVDVWLWQQEGSSK_2",
    df_long = loess_fit_df,
    sample_annotation = example_sample_annotation,
    plot_title = "Loess Fitted",
    base_size = 10 # text and dots size
)
plot_single_feature(
    feature_name = "10231_QDVDVWLWQQEGSSK_2",
    df_long = comBat_df,
    sample_annotation = example_sample_annotation,
    plot_title = "ComBat corrected",
    base_size = 10 # text and dots size
)

## ----batch_corr_general, fig.show='hold'--------------------------------------
batch_corrected_df <- correct_batch_effects_df(
    df_long = quantile_normalized_long,
    sample_annotation = example_sample_annotation,
    discrete_func = "ComBat",
    continuous_func = "loess_regression",
    abs_threshold = 5, pct_threshold = 0.20
)
batch_corrected_matrix <- long_to_matrix(batch_corrected_df)

## ----setup_corr_heatmap, fig.show='hold', fig.height=5, fig.width=8-----------
earTags <- c(
    "ET1524", "ET2078", "ET1322", "ET1566", "ET1354", "ET1420", "ET2154",
    "ET1515", "ET1506", "ET2577", "ET1681", "ET1585", "ET1518", "ET1906"
)

replicate_filenames <- example_sample_annotation %>%
    filter(MS_batch %in% c("Batch_2", "Batch_3")) %>%
    filter(EarTag %in% earTags) %>%
    pull(!!sym("FullRunName"))

## ----corr_heatmap, fig.show='hold', fig.width=10, fig.height=11---------------
p1_exp <- plot_sample_corr_heatmap(
    log_transformed_matrix,
    samples_to_plot = replicate_filenames,
    plot_title = "Correlation of selected samples"
)

## ----color_scale, fig.show='hold'---------------------------------------------
breaksList <- seq(0.7, 1, by = 0.01) # color scale of pheatmap
heatmap_colors <- colorRampPalette(
    rev(RColorBrewer::brewer.pal(n = 7, name = "RdYlBu"))
)(length(breaksList) + 1)

## ----corr_samples_heatmap, fig.show='hold', fig.height=4, fig.width=5---------
# Plot the heatmap with annotations for the chosen samples
factors_to_show <- c(batch_col, biospecimen_id_col)

p1 <- plot_sample_corr_heatmap(
    log_transformed_matrix,
    samples_to_plot = replicate_filenames,
    sample_annotation = example_sample_annotation,
    factors_to_plot = factors_to_show,
    plot_title = "Log transformed correlation matrix of\nselected replicated samples",
    color_list = color_list,
    heatmap_color = heatmap_colors, breaks = breaksList,
    cluster_rows = FALSE, cluster_cols = FALSE, fontsize = 6,
    annotation_names_col = TRUE, annotation_legend = FALSE,
    show_colnames = FALSE
)

p2 <- plot_sample_corr_heatmap(
    batch_corrected_matrix,
    samples_to_plot = replicate_filenames,
    sample_annotation = example_sample_annotation,
    factors_to_plot = factors_to_show,
    plot_title = "Batch Corrected\nselected replicated samples",
    color_list = color_list,
    heatmap_color = heatmap_colors, breaks = breaksList,
    cluster_rows = FALSE, cluster_cols = FALSE, fontsize = 6,
    annotation_names_col = TRUE, annotation_legend = FALSE,
    show_colnames = FALSE
)

## ----corr_samples_heatmap_2, fig.show='hold', fig.height=4, fig.width=10------
library(gridExtra)
grid.arrange(grobs = list(p1$gtable, p2$gtable), ncol = 2)

## ----corr_samples_distrib-----------------------------------------------------
sample_cor_raw <- plot_sample_corr_distribution(
    log_transformed_matrix,
    example_sample_annotation,
    # repeated_samples = replicate_filenames,
    batch_col = "MS_batch",
    biospecimen_id_col = "EarTag",
    plot_title = "Correlation of samples (raw)",
    plot_param = "batch_replicate"
)
raw_corr <- sample_cor_raw +
    theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        text = element_text(size = 8)
    ) +
    ylim(0.7, 1) + xlab(NULL)

sample_cor_batchCor <- plot_sample_corr_distribution(
    batch_corrected_matrix,
    example_sample_annotation,
    batch_col = "MS_batch",
    plot_title = "Batch corrected",
    plot_param = "batch_replicate"
)
corr_corr <- sample_cor_batchCor +
    theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        text = element_text(size = 8)
    ) +
    ylim(0.7, 1) + xlab(NULL)

## ----combine_corr, fig.height=2.5, fig.show='hold', fig.width=6, warning=FALSE----
grid.arrange(
    grobs = list(raw_corr, corr_corr),
    size = "first", ncol = 2
)

## ----correlation_of_peptides, fig.height=2.5, fig.show='hold', fig.width=6----
peptide_cor_raw <- plot_peptide_corr_distribution(
    log_transformed_matrix,
    example_peptide_annotation,
    protein_col = "Gene",
    plot_title = "Peptide correlation (raw)"
)

peptide_cor_batchCor <- plot_peptide_corr_distribution(
    batch_corrected_matrix,
    example_peptide_annotation,
    protein_col = "Gene",
    plot_title = "Peptide correlation (batch corrected)"
)
g2 <- peptide_cor_raw +
    theme(text = element_text(size = 8)) +
    ylim(c(-1, 1))
g3 <- peptide_cor_batchCor +
    ylim(c(-1, 1)) + theme(text = element_text(size = 8))

grid.arrange(grobs = list(g2, g3), size = "first", ncol = 2)

## ----sessionInfo, eval=TRUE---------------------------------------------------
sessionInfo()

## ----citation-----------------------------------------------------------------
citation("proBatch")

