## ----global_options, include=FALSE--------------------------------------------
knitr::opts_chunk$set(
    echo = TRUE, warning = FALSE, message = FALSE, fig.pos = "h",
    collapse = TRUE,
    comment = "#>",
    knitr.table.format = "simple"
)

## ----setup, include = FALSE---------------------------------------------------
chooseCRANmirror(graphics = FALSE, ind = 1)
options(tinytex.verbose = TRUE)

## ----install_proBatch, fig.show='hold', eval = FALSE--------------------------
# if (!requireNamespace("BiocManager", quietly = TRUE)) {
#     install.packages("BiocManager")
# }
# BiocManager::install("proBatch")

## ----load_packages, message=FALSE---------------------------------------------
library(proBatch)

library(dplyr)
library(tibble)
library(ggplot2)
library(QFeatures)

## ----load_data----------------------------------------------------------------
data("example_ecoli_data", package = "proBatch")

## ----extract-data-------------------------------------------------------------
# Extracting data
all_metadata <- example_ecoli_data$all_metadata
all_precursors <- example_ecoli_data$all_precursors
all_protein_groups <- example_ecoli_data$all_protein_groups
all_precursor_pg_match <- example_ecoli_data$all_precursor_pg_match

# Keeping only essential
rm(example_ecoli_data)

## ----build-pbf-from-matrix----------------------------------------------------
# Building from a (features x samples) matrix
pbf <- ProBatchFeatures(
    data_matrix = all_precursors, # matrix of peptide intensities (features x samples)
    sample_annotation = all_metadata, # data.frame of sample metadata
    sample_id_col = "Run", # column in metadata that matches sample IDs
    level = "peptide" # label this assay level as "peptide"
)

pbf # show basic info about the ProBatchFeatures object

## ----add-new-pbf-level--------------------------------------------------------
# Adding proteins as a new level and mapping them to precursors
#    all_precursor_pg_match has columns: "Precursor.Id", "Protein.Ids"
pbf <- pb_add_level(
    object = pbf,
    from = "peptide::raw",
    new_matrix = all_protein_groups,
    to_level = "protein", # will name "protein::raw" by default
    mapping_df = all_precursor_pg_match,
    from_id = "Precursor.Id",
    to_id = "Protein.Ids",
    map_strategy = "as_is"
)

pbf

## ----check_pbf----------------------------------------------------------------
# Check
validObject(pbf) # should be TRUE
assayLink(pbf, "protein::raw") # verify link summary

# Remove unused variables to clean up the workspace
rm(all_metadata, all_precursor_pg_match, all_precursors, all_protein_groups)

## ----nNA_calc-----------------------------------------------------------------
# pb_nNA returns a DataFrame with counts of NAs per feature/sample
na_counts <- pb_nNA(pbf)

# na_counts contains info about total number of NAs and % in the data:
na_counts[["nNA"]] %>% as_tibble()

## ----nNA_individual-----------------------------------------------------------
# check NAs per sample:
head(na_counts[["peptide::raw"]]$nNAcols) %>% as_tibble()

## ----print_na_counts----------------------------------------------------------
print(names(na_counts[["peptide::raw"]]))

## ----define_colors------------------------------------------------------------
# color scheme for lab and condition labels
color_scheme <- sample_annotation_to_colors(
    pbf,
    sample_id_col = "Run",
    factor_columns = c("Lab", "Condition"),
    numeric_columns = NULL
)

## ----plot_na_heatmap_single, fig.show='hold', fig.width=10, fig.height=4------
plot_NA_heatmap(
    pbf, # by default the last created assay
    show_rownames = FALSE, show_row_dend = FALSE,
    color_by = "Lab"
)

## ----plot_na_heatmap_multi, fig.show='hold', fig.width=10, fig.height=4-------
plot_NA_heatmap(
    pbf,
    pbf_name = c("peptide::raw", "protein::raw"),
    show_rownames = FALSE, show_row_dend = FALSE,
    color_by = "Lab"
)

## ----plot_na_frequency_panel, fig.show='hold', fig.width=7, fig.height=4------
plot_NA_frequency(
    pbf,
    pbf_name = c("peptide::raw", "protein::raw"),
    show_percent = FALSE
)

## ----apply_missing_filter-----------------------------------------------------
pbf <- pb_filterNA(
    pbf, # without specifying, the filter will be applied to all assays in the object
    inplace = TRUE, # if false, filtered matrix will be saved as a new assay.
    pNA = 0.6 # the maximal proportion of missing values per feature
)
pbf

## ----plot_na_heatmap_filtered, fig.show='hold', fig.width=7, fig.height=4-----
plot_NA_heatmap(
    pbf,
    pbf_name = c("peptide::raw", "protein::raw"),
    show_rownames = FALSE, show_row_dend = FALSE,
    color_by = "Lab", # currently only one level is supported
    border_color = NA # pheatmap parameter to remove cell borders
)

## ----run_log_transform--------------------------------------------------------
pbf <- log_transform_dm(
    pbf,
    log_base = 2, offset = 1,
    pbf_name = "protein::raw"
)

pbf <- log_transform_dm(
    pbf,
    log_base = 2, offset = 1,
    pbf_name = "peptide::raw"
)

pbf

## ----plot_logmean, fig.height=6, fig.width=8, warning=FALSE-------------------
plot_boxplot(
    pbf,
    pbf_name = c("peptide::log2_on_raw", "protein::log2_on_raw"), # plot multiple on one
    sample_id_col = "Run",
    order_col = "Run",
    batch_col = "Lab",
    color_by_batch = TRUE,
    color_scheme = color_scheme,
    base_size = 7,
    plot_ncol = 1, # how many columns use for plotting multy-assay plot
    plot_title = c( # title for each assay
        "Peptide level - log2 scale",
        "Protein level - log2 scale"
    )
)

## ----plot_na_density_panels, fig.show='hold', fig.width=10, fig.height=4------
plot_NA_density(
    pbf,
    pbf_name = c("peptide::log2_on_raw", "protein::log2_on_raw")
)

## ----plot_PCA, fig.show='hold', fig.width=8, fig.height=4---------------------
pca1 <- plot_PCA(
    pbf,
    pbf_name = "protein::log2_on_raw",
    sample_id_col = "Run",
    color_scheme = color_scheme,
    color_by = "Lab",
    shape_by = "Condition",
    fill_the_missing = NULL, # remove all rows with missing values. Can be also "remove"
    plot_title = "NA rows removed, protein, log2",
    base_size = 10, point_size = 3, point_alpha = 0.5
)

pca2 <- plot_PCA(
    pbf,
    pbf_name = "protein::log2_on_raw",
    sample_id_col = "Run",
    color_scheme = color_scheme,
    color_by = "Condition",
    shape_by = "Lab",
    fill_the_missing = 0, # default value is -1
    plot_title = "NA replaced with 0, protein, log2",
    base_size = 10, point_size = 3, point_alpha = 0.5
)

library(gridExtra)
grid.arrange(pca1, pca2, ncol = 2, nrow = 1)

## ----apply_median_normalization-----------------------------------------------
pbf <- pb_transform(
    object = pbf,
    from = "peptide::log2_on_raw",
    steps = "medianNorm"
)

pbf <- pb_transform(
    object = pbf,
    from = "protein::log2_on_raw",
    steps = "medianNorm"
)

pbf

## ----compare_median_norm_diagnostics, fig.show='hold', fig.width=8, fig.height=4----
# boxplot
plot_boxplot(
    pbf,
    sample_id_col = "Run",
    order_col = "Run",
    batch_col = "Lab",
    color_by_batch = TRUE,
    color_scheme = color_scheme,
    base_size = 7,
    pbf_name = c("protein::log2_on_raw", "protein::medianNorm_on_log2_on_raw"),
    plot_ncol = 1,
    plot_title = c(
        "Before Median Normalization, protein level",
        "After Median Normalization, protein level"
    )
)

# plot PCA plot
plot_PCA(
    pbf,
    pbf_name = c("protein::log2_on_raw", "protein::medianNorm_on_log2_on_raw"),
    sample_id_col = "Run",
    color_scheme = color_scheme,
    color_by = "Lab",
    shape_by = "Condition",
    fill_the_missing = NULL, # remove all rows with missing values. Can be also "remove"
    plot_title = c(
        "Before Median Normalization, protein level",
        "After Median Normalization, protein level"
    ),
    base_size = 10, point_size = 3, point_alpha = 0.5
)

## ----prepare_sample_annotations-----------------------------------------------
# sample annotations used by both correction methods
sa <- colData(pbf) %>% as.data.frame()

## ----apply_limma_correction---------------------------------------------------
# limma::removeBatchEffect method
pbf <- pb_transform(
    pbf,
    from = "protein::log2_on_raw",
    steps = "limmaRBE",
    params_list = list(list(
        sample_annotation = sa,
        batch_col = "Lab",
        covariates_cols = c("Condition"),
        fill_the_missing = FALSE
    ))
)

pbf <- pb_transform(
    pbf,
    from = "protein::medianNorm_on_log2_on_raw",
    steps = "limmaRBE",
    params_list = list(list(
        sample_annotation = sa,
        batch_col = "Lab",
        covariates_cols = c("Condition"),
        fill_the_missing = FALSE
    ))
)

## ----inspect_pbf_after_limma--------------------------------------------------
print(pbf)

## ----apply_combat_correction--------------------------------------------------
# sva::ComBat wrapped via pb_transform()
pbf <- pb_transform(
    pbf,
    from = "protein::log2_on_raw",
    steps = "combat",
    params_list = list(list(
        sample_annotation = sa,
        batch_col = "Lab",
        sample_id_col = "Run",
        par.prior = TRUE,
        fill_the_missing = "remove"
    ))
)

pbf <- pb_transform(
    pbf,
    from = "protein::medianNorm_on_log2_on_raw",
    steps = "combat",
    params_list = list(list(
        sample_annotation = sa,
        batch_col = "Lab",
        sample_id_col = "Run",
        par.prior = TRUE,
        fill_the_missing = "remove"
    ))
)

## ----inspect_pbf_after_combat-------------------------------------------------
print(pbf)

## ----apply_batch_correction_peptide-------------------------------------------
pbf <- pb_transform(
    pbf,
    from = "peptide::medianNorm_on_log2_on_raw",
    steps = "limmaRBE",
    params_list = list(
        list(
            sample_annotation = sa,
            batch_col = "Lab",
            covariates_cols = c("Condition"),
            fill_the_missing = FALSE
        )
    )
)

pbf <- pb_transform(
    pbf,
    from = "peptide::medianNorm_on_log2_on_raw",
    steps = "combat",
    params_list = list(
        list(
            sample_annotation = sa,
            batch_col = "Lab",
            sample_id_col = "Run",
            par.prior = TRUE,
            fill_the_missing = "remove"
        )
    )
)

pbf

## ----plot_pca_limma_series, fig.height=7, fig.show='hold', fig.width=8, message=FALSE, warning=FALSE----
plot_PCA(
    pbf,
    pbf_name = c(
        "protein::log2_on_raw",
        "protein::medianNorm_on_log2_on_raw",
        "protein::limmaRBE_on_medianNorm_on_log2_on_raw",
        "protein::combat_on_medianNorm_on_log2_on_raw"
    ),
    sample_id_col = "Run",
    color_scheme = color_scheme,
    color_by = "Lab",
    shape_by = "Condition",
    fill_the_missing = NULL,
    base_size = 8, point_size = 3, point_alpha = 0.5
)

## ----plot_pca_peptides, fig.height=7, fig.show='hold', fig.width=8, message=FALSE, warning=FALSE----
plot_PCA(
    pbf,
    pbf_name = c(
        "peptide::log2_on_raw",
        "peptide::medianNorm_on_log2_on_raw",
        "peptide::limmaRBE_on_medianNorm_on_log2_on_raw",
        "peptide::combat_on_medianNorm_on_log2_on_raw"
    ),
    sample_id_col = "Run",
    color_scheme = color_scheme,
    color_by = "Lab",
    shape_by = "Condition",
    fill_the_missing = "remove",
    base_size = 8, point_size = 3, point_alpha = 0.5
)

## ----plot_hclust_combat, fig.show='hold', fig.width=12, fig.height=4----------
plot_hierarchical_clustering(
    pbf,
    pbf_name = "protein::combat_on_medianNorm_on_log2_on_raw",
    sample_id_col = "Run",
    label_font = 0.6,
    color_list = color_scheme
)

## ----plot_hclust_limma, fig.show='hold', fig.width=12, fig.height=4-----------
plot_hierarchical_clustering(
    pbf,
    pbf_name = "protein::limmaRBE_on_medianNorm_on_log2_on_raw",
    sample_id_col = "Run",
    label_font = 0.6,
    color_list = color_scheme,
    fill_the_missing = "remove"
)

## ----plot_pvca_panels, fig.height=6, fig.show='hold', fig.width=5, message=FALSE, warning=FALSE----
plot_PVCA(
    pbf,
    pbf_name = c(
        "protein::medianNorm_on_log2_on_raw",
        "protein::combat_on_medianNorm_on_log2_on_raw"
    ),
    sample_id_col = "Run",
    technical_factors = c("Lab"),
    biological_factors = c("Condition"),
    fill_the_missing = NULL,
    base_size = 7,
    plot_ncol = 2, # the number of plots in a row
    variance_threshold = 0 # the the percentile value of weight each of the
    # covariates needs to explain
    # (the rest will be lumped together)
)

## ----show_operation_log-------------------------------------------------------
get_operation_log(pbf) %>%
    as_tibble()

## ----compute_pipeline_name----------------------------------------------------
pb_pipeline_name(pbf, "protein::combat_on_medianNorm_on_log2_on_raw")

## ----matr---------------------------------------------------------------------
extracted_matrix <- pb_assay_matrix(
    pbf,
    assay = "peptide::raw"
)
# show
extracted_matrix[1:5, 1:5]
rm(extracted_matrix)

## ----extract_to_long----------------------------------------------------------
extracted_long <- pb_as_long(
    pbf,
    sample_id_col = "Run",
    pbf_name = "peptide::raw"
)

# show
extracted_long[1:3, ]
rm(extracted_long)

## ----sessionInfo--------------------------------------------------------------
sessionInfo()

## ----citation-----------------------------------------------------------------
citation("proBatch")

