## ----setup, include=TRUE------------------------------------------------------
knitr::opts_chunk$set(echo = TRUE,dev="CairoPNG")
library(knitr)
library(tidyr)
library(ggplot2)
library(gridExtra)
library(magick)
library(limma)

## ----CONSTANd-----------------------------------------------------------------
#install.packages("BiocManager")
#BiocManager::install("CONSTANd")
library(CONSTANd)

## ----echo=FALSE, fig.align='center',fig.wide=TRUE, out.width='50%', fig.scap="short",fig.cap='This MA plot violates all assumptions: there is more than one cloud, there are more upregulated entities, and the bias changes strongly in the region with small magnitudes (A-values).'----
par(mfrow=c(1,1))
include_graphics('bad_MA.png')

## -----------------------------------------------------------------------------
leish <- read.csv("../inst/extdata/GSE95353_SL_SEQ_LOGSTAT_counttable.txt" , sep='\t')

## -----------------------------------------------------------------------------
rownames(leish) <- leish$gene
leish <- leish[, c("SL.LOG1", "SL.LOG2", "SL.LOG3", "SL.LOG4", 
                   "SL.STAT1", "SL.STAT2", "SL.STAT3", "SL.STAT4")]
conditions <- c(rep("LOG", 4), rep("STAT", 4))

## -----------------------------------------------------------------------------
# first set NA to 0 for calculating the row medians
leish[is.na(leish)] <- 0
leish <- leish[apply(leish, 1, median, na.rm = FALSE) > 0,]
leish.norm <- CONSTANd(leish)$normalized_data

## ----fig.align='center',fig.wide=TRUE,fig.cap="LOG samples (blue) are separated more clearly by PC1 from the STAT samples (red) in the Leishmania data set after CONSTANd normalization."----
leish[is.na(leish)] <- 0
# scale raw data
leish.pc.raw <- prcomp(x=t(leish), scale. = TRUE)
leish.norm[is.na(leish.norm)] <- 0
# CONSTANd already scaled the data
leish.pc.norm <- prcomp(x=t(leish.norm))
par(mfrow=c(1,2))
colors = ifelse(conditions == 'LOG', 'blue', 'red')
plot(leish.pc.raw$x[,'PC1'], leish.pc.raw$x[,'PC2'], 
     xlab='PC1', ylab='PC2', main='raw counts', col=colors)
plot(leish.pc.norm$x[,'PC1'], leish.pc.norm$x[,'PC2'], 
     xlab='PC1', ylab='PC2', main='normalized counts', col=colors)

## ----echo=FALSE, fig.align='center',fig.wide=TRUE, out.width='50%', fig.scap="short",fig.cap='Organs data set experimental setup: 8 mouse organ samples spread across 4 TMT 8-plex tndem-MS runs, one for each mouse. Image source: Bailey, Derek J., et al. "Intelligent data acquisition blends targeted and discovery methods." Journal of proteome research 13.4 (2014): 2152-2161. https://doi.org/10.1021/pr401278j.'----
par(mfrow=c(1,1))
include_graphics('organ_experiment_design.png')

## -----------------------------------------------------------------------------
LOAD_ORIGINAL = FALSE
# load extra functions to clean the original data and study design files
source('../inst/script/functions.R')
if (LOAD_ORIGINAL) {  # load original from web
    BR1 <- read.csv('Organs_PSMs_1.txt', sep='\t')
    BR2 <- read.csv('Organs_PSMs_2.txt', sep='\t')
    BR3 <- read.csv('Organs_PSMs_3.txt', sep='\t')
    BR4 <- read.csv('Organs_PSMs_4.txt', sep='\t')
    organs <- list(BR1, BR2, BR3, BR4)
    organs <- clean_organs(organs)
} else {  # load cleaned file that comes with this package
    organs <- readRDS('../inst/extdata/organs_cleaned.RDS')
}
# construct the study design from the QCQuan DoE file and arrange it properly
study.design <- read.csv("../inst/extdata/Organs_DoE.tsv", sep='\t', header = FALSE)
study.design <- rearrange_organs_design(study.design)

## -----------------------------------------------------------------------------
# the data has been summarized to peptide level
head(rownames(organs[[1]]))
# only the quantification columns were kept, which here happen to be all of them
quanCols <- colnames(organs[[1]])
quanCols
organs <- lapply(organs, function(x) x[,quanCols])
# make unique quantification column names
for (i in seq_along(organs)) { 
  colnames(organs[[i]]) <- paste0('BR', i, '_', colnames(organs[[i]])) }

## -----------------------------------------------------------------------------
organs.norm.obj <- lapply(organs, function(x) CONSTANd(x))
organs.norm <- lapply(organs.norm.obj, function(x) x$normalized_data)

## -----------------------------------------------------------------------------
merge_list_of_dataframes <- function(list.dfs) {
    return(Reduce(function(x, y) {
        tmp <- merge(x, y, by = 0, all = FALSE)
        rownames(tmp) <- tmp$Row.names
        tmp$Row.names <- NULL
        return(tmp)
    }, list.dfs))}
organs.df <- merge_list_of_dataframes(organs)
organs.norm.df <- merge_list_of_dataframes(organs.norm)

## -----------------------------------------------------------------------------
# PCA can't deal with NA values: impute as zero (this makes sense in a multiplex)
organs.df[is.na(organs.df)] <- 0
organs.norm.df[is.na(organs.norm.df)] <- 0
# scale raw data
organs.pc <- prcomp(x=t(organs.df), scale. = TRUE)
# CONSTANd already scaled the data
organs.norm.pc <- prcomp(x=t(organs.norm.df))

## -----------------------------------------------------------------------------
# prepare plot color mappings
organs.pcdf <- as.data.frame(organs.pc$x)
organs.pcdf <- merge(organs.pcdf, study.design, by=0)
rownames(organs.pcdf) <- organs.pcdf$Row.names
organs.norm.pcdf <- as.data.frame(organs.norm.pc$x)
organs.norm.pcdf <- merge(organs.norm.pcdf, study.design, by=0)
rownames(organs.norm.pcdf) <- organs.norm.pcdf$Row.names

## ----echo=FALSE,fig.wide=TRUE,fig.cap="Before normalization, the samples in the Organs data set are all scattered across the PCA plot. After normalization, they are correctly grouped according to biological condition, even though all 4 samples in each group have been measured in a different MS run. The big separation in PC1 suggests that the muscle tissue is very different from the tissues of other types of organs!"----
p1 <- ggplot(organs.pcdf, aes(x=PC1, y=PC2, color=condition)) + ggtitle("raw intensities") + geom_point() + theme_bw() + theme(legend.position = "none") + theme(plot.title = element_text(hjust=0.5))
p2 <- ggplot(organs.norm.pcdf, aes(x=PC1, y=PC2, color=condition)) + ggtitle("normalized intensities") + geom_point() + theme_bw() + theme(legend.position = "right") + theme(plot.title = element_text(hjust=0.5))
shared_legend <- extract_legend(p2)
grid.arrange(arrangeGrob(p1, p2 + theme(legend.position = "none"), ncol = 2), 
             shared_legend, widths=c(5,1))

## -----------------------------------------------------------------------------
MAplot <- function(x,y,use.order=FALSE, R=NULL, cex=1.6, showavg=TRUE) {
  # make an MA plot of y vs. x that shows the rolling average,
  M <- log2(y/x)
  xlab = 'A'
  if (!is.null(R)) {r <- R; xlab = "A (re-scaled)"} else r <- 1
  A <- (log2(y/r)+log2(x/r))/2
  if (use.order) {
    orig.order <- order(A)
    A <- orig.order
    M <- M[orig.order]
    xlab = "original rank of feature magnitude within IPS"
  }
  # select only finite values
  use <- is.finite(M)
  A <- A[use]
  M <- M[use]
  # plot
  print(var(M))
  plot(A, M, xlab=xlab, cex.lab=cex, cex.axis=cex)
  # rolling average
  if (showavg) { lines(lowess(M~A), col='red', lwd=5) }
}

## -----------------------------------------------------------------------------
# Example for cerebrum and kidney conditions in run (IPS) 1 in the Organs data set.
cerebrum1.colnames <- rownames(study.design %>% dplyr::filter(run=='BR1', condition=='cerebrum'))
kidney1.colnames <- rownames(study.design %>% dplyr::filter(run=='BR1', condition=='kidney'))
cerebrum1.raw <- organs.df[,cerebrum1.colnames]
kidney1.raw <- organs.df[,kidney1.colnames]
# rowMeans in case of multiple columns
if (!is.null(dim(cerebrum1.raw))) cerebrum1.raw <- rowMeans(cerebrum1.raw)  
if (!is.null(dim(kidney1.raw))) kidney1.raw <- rowMeans(kidney1.raw)
MAplot(cerebrum1.raw, kidney1.raw)

## -----------------------------------------------------------------------------
cerebrum1.norm <- organs.norm.df[,cerebrum1.colnames]
kidney1.norm <- organs.norm.df[,kidney1.colnames]
# rowMeans in case of multiple columns
if (!is.null(dim(cerebrum1.norm))) cerebrum1.norm <- rowMeans(cerebrum1.norm)  
if (!is.null(dim(kidney1.norm))) kidney1.norm <- rowMeans(kidney1.norm)
BR1.R <- organs.norm.obj[[1]]$R[rownames(organs.norm.df)]
MAplot(cerebrum1.norm, kidney1.norm, R=BR1.R)

## -----------------------------------------------------------------------------
get_design_matrix <- function(referenceCondition, study.design) {
    # ANOVA-like design matrix for use in moderated_ttest, indicating group 
    # (condition) membership of each entry in all_samples.
    otherConditions = setdiff(unique(study.design$condition), referenceCondition)
    all_samples = rownames(study.design)
    N_samples = length(all_samples) 
    N_conditions = 1+length(otherConditions)
    design = matrix(rep(0,N_samples*N_conditions), c(N_samples, N_conditions))
    design[, 1] <- 1  # reference gets 1 everywhere
    rownames(design) <- all_samples
    colnames(design) <- c(referenceCondition, otherConditions)
    for (i in 1:N_samples) {  # for each channel in each condition, put a "1" in the design matrix.
        design[as.character(rownames(study.design)[i]), as.character(study.design$condition[i])] <- 1 }
    return(design)
}
moderated_ttest <- function(dat, design_matrix, scale) {
  # inspired by http://www.biostat.jhsph.edu/~kkammers/software/eupa/R_guide.html
  design_matrix <- design_matrix[match(colnames(dat), rownames(design_matrix)),]  # fix column order
  reference_condition <- colnames(design_matrix)[colSums(design_matrix) == nrow(design_matrix)]
  nfeatures <- dim(dat)[1]
  
  fit <- limma::eBayes(lmFit(dat, design_matrix))
  
  reference_averages <- fit$coefficients[,reference_condition]
  logFC <- fit$coefficients
  logFC <- log2((logFC+reference_averages)/reference_averages)
  # correct the reference
  logFC[,reference_condition] <- logFC[,reference_condition] - 1
  # moderated q-value corresponding to the moderated t-statistic
  if(nfeatures>1) { 
    q.mod <- apply(X = fit$p.value, MARGIN = 2, FUN = p.adjust, method='BH') 
  } else q.mod <- fit$p.value
  return(list(logFC=logFC, qval=q.mod))
}

design_matrix <- get_design_matrix('cerebrum', study.design)
result <- moderated_ttest(organs.norm.df, design_matrix)

## -----------------------------------------------------------------------------
sessionInfo()