## ----install, eval=FALSE------------------------------------------------------
# if (!requireNamespace('BiocManager', quietly=TRUE))
#     install.packages('BiocManager')
# 
# BiocManager::install('AWAggregator')
# BiocManager::install('AWAggregatorData')

## ----load package-------------------------------------------------------------
library(AWAggregator)
library(AWAggregatorData)

## ----aggregate PSMs from FragPipe---------------------------------------------
# Load the pre-trained random forest model that does not include the average CV 
# as a feature, which indicates the average CV in percentage for processed PSM 
# reporter ion intensities across different replicate groups. It is recommended 
# to load the pre-trained model with average CV when replicates are available; 
# otherwise, use the model without the average CV
data(sample.PSM.FP)
regr <- loadQuantInaccuracyModel(useAvgCV=FALSE)
# Load sample names (Sample 1 ~ Sample 9)
samples <- colnames(sample.PSM.FP)[grep('Sample', colnames(sample.PSM.FP))]
groups <- samples
df <- getPSMAttributes(
    PSM=sample.PSM.FP,
    # TMT tag (229.1629) and carbamidomethylation (57.0214) are applied as 
    # fixed post-translational modifications (PTMs)
    fixedPTMs=c('229.1629', '57.0214'),
    colOfReporterIonInt=samples,
    groups=groups,
    setProgressBar=TRUE
)
aggregated_results <- aggregateByAttributes(
    PSM=df,
    colOfReporterIonInt=samples,
    ranger=regr,
    ratioCalc=FALSE
)

## ----aggregate PSMs from Proteome Discoverer----------------------------------
# Load the pre-trained random forest model that does not include the average CV 
# as a feature, which indicates the average CV in percentage for processed PSM 
# reporter ion intensities across different replicate groups. It is recommended 
# to load the pre-trained model with average CV when replicates are available; 
# otherwise, use the model without the average CV
data(sample.PSM.PD)
data(sample.prot.PD)
regr <- loadQuantInaccuracyModel(useAvgCV=FALSE)
# Load sample names (Sample 1 ~ Sample 9)
samples <- colnames(sample.PSM.PD)[grep('Sample', colnames(sample.PSM.PD))]
groups <- samples
df <- convertPDFormat(
    PSM=sample.PSM.PD,
    protein=sample.prot.PD,
    colOfReporterIonInt=samples
)
df <- getPSMAttributes(
    PSM=df,
    # TMT tag and carbamidomethylation are applied as static PTMs
    fixedPTMs=c('TMT6plex', 'Carbamidomethyl'),
    colOfReporterIonInt=samples,
    groups=groups,
    setProgressBar=TRUE
)
aggregated_results <- aggregateByAttributes(
    PSM=df,
    colOfReporterIonInt=samples,
    ranger=regr,
    ratioCalc=FALSE
)

## ----load spike-in datasets---------------------------------------------------
library(ExperimentHub)
eh <- ExperimentHub()
benchmarkSet1 <- eh[['EH9637']] # Benchmark Set 1
benchmarkSet2 <- eh[['EH9638']] # Benchmark Set 2
benchmarkSet3 <- eh[['EH9639']] # Benchmark Set 3

## ----calculate X and y for benchmark.set.1------------------------------------
library(stringr)

# Load sample names (Sample 'H1+E1_1' ~ Sample 'H1+E6_3')
samples <- colnames(benchmarkSet1)[
    grep('H1[+]E[0-9]+_[1-4]', colnames(benchmarkSet1))
]
groups <- str_match(samples, 'H1[+]E[0-9]+')[, 1]
PSM1 <- getPSMAttributes(
    PSM=benchmarkSet1,
    # TMT tag (229.1629) and carbamidomethylation (57.0214) are applied as 
    # fixed PTMs
    fixedPTM=c('229.1629', '57.0214'),
    colOfReporterIonInt=samples,
    groups=groups
)
PSM1 <- getAvgScaledErrorOfLog2FC(
    PSM=PSM1,
    colOfReporterIonInt=samples,
    groups=groups,
    # The actual protein fold change may be deviated from the intended values 
    # after TMT labelling as the original work indicates when H1+Y6 is 
    # involved, and therefore, H1+Y6 is not used in the calculation of Average 
    # of Scaled Error of log2FC
    expectedRelativeAbundance=list(`H1+E1`=1, `H1+E2`=2, `H1+E6`=NA),
    speciesAtConstLevel='HUMAN'
)

## ----calculate X and y for benchmark.set.2------------------------------------
library(dplyr)

# Load sample names (Sample 'H1+Y1_1' ~ Sample 'H1+Y10_3')
samples <- colnames(benchmarkSet2)[
    grep('H1[+]Y[0-9]+_[1-3]', colnames(benchmarkSet2))
]
groups <- str_match(samples, 'H1[+]Y[0-9]+')[, 1]

# Process each replicate separately using lapply()
# lapply() loops over all unique replicate IDs in benchmarkSet2.
# 'X' is the current replicate ID.
tmp <- lapply(unique(benchmarkSet2$Replicate), FUN=function(X){
    # Select PSMs from the current replicate X
    df <- benchmarkSet2[benchmarkSet2$Replicate == X, ]
    df <- getPSMAttributes(
        PSM=df,
        fixedPTM=c('229.1629', '57.0214'),
        colOfReporterIonInt=samples,
        groups=groups,
        setProgressBar=FALSE
    )
    df <- getAvgScaledErrorOfLog2FC(
        PSM=df,
        colOfReporterIonInt=samples,
        groups=groups,
        expectedRelativeAbundance=list(`H1+Y1`=1, `H1+Y4`=4, `H1+Y10`=10),
        speciesAtConstLevel='HUMAN'
    )
    # Return the processed PSMs from the current replicate
    return(df)
})
# Combine results from all replicates into one dataframe
PSM2 <- bind_rows(tmp)

## ----calculate X and y for benchmark.set.3------------------------------------
# Load sample names (Sample 'H1+Y1_1' ~ Sample 'H1+Y10_2')
samples <- colnames(benchmarkSet3)[
    grep('H1[+]Y[0-9]+_[1-2]', colnames(benchmarkSet3))
]
groups <- str_match(samples, 'H1[+]Y[0-9]+')[, 1]
PSM3 <- getPSMAttributes(
    PSM=benchmarkSet3,
    fixedPTM=c('304.2071', '125.0476'),
    colOfReporterIonInt=samples,
    groups=groups,
    # The signals for yeast PSMs in group H1+Y0 is completely from noise, so 
    # they are not used for calculating Average CV
    groupsExcludedFromCV='H1+Y0'
)
PSM3 <- getAvgScaledErrorOfLog2FC(
    PSM=PSM3,
    colOfReporterIonInt=samples,
    groups=groups,
    expectedRelativeAbundance=list(
        `H1+Y0`=0, `H1+Y1`=1, `H1+Y5`=5, `H1+Y10`=10
    ),
    speciesAtConstLevel='HUMAN'
)

## ----merge spike-in datasets--------------------------------------------------
set.seed(1000)
PSM <- mergeTrainingSets(
    PSMList=list(
        `Benchmark Set 1`=PSM1,
        `Benchmark Set 2`=PSM2,
        `Benchmark Set 3`=PSM3
    ),
    numPSMs=min(nrow(PSM1), nrow(PSM2), nrow(PSM3))
)

## ----train new model with average CV------------------------------------------
regr <- fitQuantInaccuracyModel(PSM, useAvgCV=TRUE, seed=3979)

## ----session info-------------------------------------------------------------
sessionInfo()

