## ----load_libraries-----------------------------------------------------------
suppressPackageStartupMessages({
library("QFeatures")  
library("dplyr") 
library("tidyr")
library("ggplot2")
library("msqrob2")    
library("stringr")
library("ExploreModelMatrix")
library("kableExtra")
library("ComplexHeatmap")
library("scater")
})

## ----file_location------------------------------------------------------------
peptideFile <- "https://github.com/statOmics/msqrob2data/raw/refs/heads/main/dda/cptacAvsB_lab3/peptides.txt"

## ----import_data--------------------------------------------------------------
peptides <- data.table::fread(peptideFile, 
                              check.names = TRUE, 
                              integer64 = "double")
head(peptides)

## ----extract_quantCols--------------------------------------------------------
quantCols <- grep("Intensity[.]", names(peptides), value = TRUE)

## ----add_species_info---------------------------------------------------------
peptides <- peptides |> 
  mutate(species = grepl(pattern = "UPS",Proteins) |> 
           as.factor() |>
           recode("TRUE"="ups","FALSE" = "yeast"))

## ----make_annotation----------------------------------------------------------
(annot <- data.frame(quantCols = quantCols) |> #1.
  mutate(sampleId = gsub("Intensity[.]6","", quantCols), #2. 
         condition = substr(sampleId,1,1), #3.
         rep = substr(sampleId, 3, 3)) #4.
)

## ----convert_to_QFeatures-----------------------------------------------------
(qf <- readQFeatures(assayData = peptides, 
                     colData = annot,
                     name = "peptides",
                     fnames = "Sequence")
)

## ----convert0_to_NA-----------------------------------------------------------
qf <- zeroIsNA(qf, "peptides") # convert 0 to NA

## ----log2_transformation------------------------------------------------------
qf <- logTransform(qf, base = 2, i = "peptides", name = "peptides_log")

## ----failed_protein_inference_filtering---------------------------------------
qf <- filterFeatures(
  qf, ~ Proteins != "" & ## Remove failed protein inference
    !grepl(";", Proteins)) ## Remove protein groups

## ----filtering_decoys_contaminants--------------------------------------------
qf <- filterFeatures(qf, ~ Reverse != "+" & ## Remove decoys
                          Potential.contaminant != "+") ## Remove contaminants

## ----assess_NA_features-------------------------------------------------------
nNaRes <- nNA(qf, "peptides")
range(nNaRes$nNAcols$pNA)

## ----plot_NA_features---------------------------------------------------------
data.frame(nNaRes$nNArows) |> 
  ggplot() +
  aes(x = nNA) +
  geom_histogram() +
  ggtitle("Number of missing values for each peptide") +
  theme_bw()

## ----filter_features_with_many_NA---------------------------------------------
nObs <- 3
n <- ncol(qf[["peptides_log"]])
(qf <- filterNA(qf, i = "peptides_log", pNA = (n - nObs) / n))

## ----normalisation_needed-----------------------------------------------------
qf[, , "peptides_log"] |> #1.
  longForm(colvars = colnames(colData(qf))) |>  #2. 
  data.frame() |> 
  filter(!is.na(value)) |>
  ggplot() + #3.
  aes(x = value,
      colour = condition,
      group = colname) +
  geom_density() +
  theme_minimal()

## ----normalize_peptides-------------------------------------------------------
qf <- sweep( #Subtract log2 norm factor column-by-column (MARGIN = 2)
  qf,
  MARGIN = 2,
  STATS = nfLogMedian(qf,"peptides_log"),
  i = "peptides_log",
  name = "peptides_norm"
)

## ----assess_normalisation-----------------------------------------------------
qf[, , "peptides_norm"] |>
  longForm(colvars = colnames(colData(qf))) |> 
  data.frame() |> 
  filter(!is.na(value)) |>
  ggplot() + 
  aes(x = value,
      colour = condition,
      group = colname) +
  geom_density() +
  labs(subtitle = "Normalised log2 precursor intensities") +
  theme_minimal()

## ----summarize_peptides_to_proteins, warning=FALSE----------------------------
qf <- aggregateFeatures(qf,
    i = "peptides_norm", 
    fcol = "Proteins", 
    na.rm = TRUE,
    name = "proteins"
)

## ----qc_normalisation_peptides------------------------------------------------
qf[, , "peptides_norm"] |>
  longForm(colvars = colnames(colData(qf))) |> 
  data.frame() |> 
  filter(!is.na(value)) |>
  ggplot() + 
  aes(x = value,
      colour = condition,
      group = colname) +
  geom_density() +
  theme_minimal() +
   labs(subtitle = "Normalised log2 precursor intensities")

## ----qc_normalisation_peptides_boxplot----------------------------------------
qf[, , "peptides_norm"] |>
  longForm(colvars = colnames(colData(qf))) |> 
  data.frame() |> 
  filter(!is.na(value)) |>
  ggplot() + 
  aes(x = sampleId,
      y = value,
      colour = condition,
      group = colname) +
  xlab("sample") +
  geom_boxplot() +
  theme_minimal() +
   labs(subtitle = "Normalised log2 precursor intensities")

## ----qc_normalisation_proteins------------------------------------------------
qf[, , "proteins"] |>
  longForm(colvars = colnames(colData(qf))) |> 
  data.frame() |> 
  filter(!is.na(value)) |>
  ggplot() + 
  aes(x = value,
      colour = condition,
      group = colname) +
  geom_density() +
  theme_minimal() +
   labs(subtitle = "Normalised log2 protein intensities")

## ----qc_normalisation_proteins_boxplot----------------------------------------
qf[, , "proteins"] |>
  longForm(colvars = colnames(colData(qf))) |> 
  data.frame() |> 
  filter(!is.na(value)) |>
  ggplot() + 
  aes(x = sampleId,
      y = value,
      colour = condition,
      group = colname) +
  xlab("sample") +
  geom_boxplot() +
  theme_minimal() +
   labs(subtitle = "Normalised log2 protein intensities")

## ----qc_identifications-------------------------------------------------------
qf[,,"peptides_norm"] |> 
  longForm(colvars = colnames(colData(qf)), 
           rowvars= c("Sequence", 
                      "Proteins")) |>
  data.frame() |>
  filter(!is.na(value)) |>
  group_by(condition, sampleId) |>
  summarise(Precursors = length(unique(Sequence)),
            `Protein Groups` = length(unique(Proteins))) |> 
  pivot_longer(-(1:2),
               names_to = "Feature",
               values_to = "IDs") |> 
  ggplot(aes(x = sampleId, y = IDs, fill = condition)) +
  geom_col() +
  #scale_fill_observable() +
  facet_wrap(~Feature,
             scales = "free_y") +
  labs(title = "Peptide and protein group identificiations per sample",
       x = "Sample",
       y = "Identifications") +
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 90))

## ----qc_mds_proteins----------------------------------------------------------
getWithColData(qf, "proteins") |> 
  as("SingleCellExperiment") |> 
  runMDS(exprs_values = 1) |> 
  plotMDS(colour_by = "condition", shape_by = "rep", point_size = 3)

## ----qc_correlation_proteins--------------------------------------------------
corMat <- qf[["proteins"]] |> 
  assay() |>
  cor(method = "spearman", use = "pairwise.complete.obs") 
 
colnames(corMat) <- qf$sampleId
rownames(corMat) <- qf$sampleId
corMat |> 
  ggcorrplot::ggcorrplot() +
  scale_fill_viridis_c() +
  labs(title = "Correlation matrix",
       fill = "Correlation") +
  theme(axis.text.x = element_text(angle = 90))

## ----define_model-------------------------------------------------------------
model <- ~ condition

## ----fit_models, warning=FALSE------------------------------------------------
  qf <- msqrob(
    qf,  
    i = "proteins",
    formula = model,
    robust = TRUE)

## ----print_models-------------------------------------------------------------
models <- rowData(qf[["proteins"]])[["msqrobModels"]]
models[1:3]

## ----explore_model_matrix-----------------------------------------------------
vd <- ExploreModelMatrix::VisualizeDesign(
    sampleData =  colData(qf),
    designFormula = model,
    textSizeFitted = 4
)
vd$plotlist

## ----make_contrasts-----------------------------------------------------------
(L <- makeContrast(
  "conditionB = 0",
  parameterNames = colnames(vd$designmatrix)
))

## ----hypthesis_tests----------------------------------------------------------
qf <- hypothesisTest(qf, i = "proteins", contrast = L)

## ----collect_results----------------------------------------------------------
inferences <- 
  msqrobCollect(qf[["proteins"]], L) 
head(inferences)

## ----significant_table, results='asis'----------------------------------------
alpha <- 0.05 #1.
sigList <- inferences |> 
    filter(adjPval < alpha) #3.
kable(sigList) |>
      kable_styling(full_width = FALSE) |>
      scroll_box(height = "250px") #4.

## ----volcanoplot--------------------------------------------------------------
inferences |> 
  plotVolcano()

## ----heatmap------------------------------------------------------------------
sig <- sigList |> pull(feature) #1.


quants <- assay(qf,"proteins")[sig,] |> 
                     t() |> 
                     scale() |>
                     t() #2.
 
colnames(quants) <- qf$sampleId #3.
annotations <- columnAnnotation(condition = qf$condition) #3.

set.seed(1234) ## annotation colours are randomly generated by default
Heatmap(
  quants,
  name = "log2 intensity",
  top_annotation = annotations
  )  #4.

## ----define_alpha-------------------------------------------------------------
alpha <- 0.05

## ----real_logFC---------------------------------------------------------------
realLogFC <- log2(0.74/0.25)

## ----collect_results_species--------------------------------------------------
inferences <- 
  msqrobCollect(qf[["proteins"]], L)  |> 
  mutate(species =  rowData(qf[["proteins"]])$species)

## ----assess_logFC-------------------------------------------------------------
logFC <- inferences |> 
  filter(!is.na(logFC)) |>
  ggplot(aes(x = species, y = logFC, color = species)) + #1.
  geom_boxplot() + #2.
  theme_bw() + 
  scale_color_manual( 
    values = c("grey20", "firebrick"), #3. 
    name = "",
    labels = c("yeast", "ups")
    ) + 
  geom_hline(yintercept = 0, color="grey20") + # 4. 
  geom_hline(yintercept = realLogFC, color="firebrick") #5. 
logFC

## ----assess_TP_FP_FDP---------------------------------------------------------
inferences |>
    filter(adjPval < alpha) |>
    summarise("TP" = sum(species == "ups"),
              "FP" = sum(species != "ups"),
              "FDP" = mean(species != "ups"))

## ----FDR_TPR_functions--------------------------------------------------------
computeFDP <- function(pval, tp) {
    ord <- order(pval)
    fdp <- cumsum(!tp[ord]) / 1:length(tp)
    fdp[order(ord)]
}
computeTPR <- function(pval, tp, nTP = NULL) {
    if (is.null(nTP)) nTP <- sum(tp)
    ord <- order(pval)
    tpr <- cumsum(tp[ord]) / nTP
    tpr[order(ord)]
}

## ----calculate_performance----------------------------------------------------
performance <- inferences |> group_by(contrast) |> 
    na.exclude() |>
    mutate(tpr = computeTPR(pval, species == "ups"),
           fdp = computeFDP(pval, species == "ups")) |>
    arrange(fdp)

## ----calculate_workpoints-----------------------------------------------------
workPoints <- performance |> 
  group_by(contrast) |>
    filter(adjPval < 0.05) |>
    slice_max(pval)

## ----plot_tpr_fdp-------------------------------------------------------------
ggplot(performance) +
    aes(
        y = fdp,
        x = tpr,
    ) +
    geom_line() +
    geom_point(data = workPoints, size = 3) +
    geom_hline(yintercept = 0.05, linetype = 2) +
    coord_flip(ylim = c(0, 0.2)) +
    theme_minimal()

## ----sessionInfo--------------------------------------------------------------
sessionInfo()

