## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----warning=FALSE, message=FALSE, eval=FALSE---------------------------------
# if (!requireNamespace("BiocManager", quietly = TRUE)) {
#   install.packages("BiocManager")
# }
# BiocManager::install("CPSM")

## ----warning=FALSE, message=FALSE---------------------------------------------
library(CPSM)
library(SummarizedExperiment)
set.seed(7) # set seed

#load data (from the package)
data(Example_TCGA_LGG_FPKM_data, package = "CPSM")

# view data
str(Example_TCGA_LGG_FPKM_data[1:10])

## ----warning=FALSE, message=FALSE---------------------------------------------
# Step1 - Specify the file path to your data
#file_path <- "path/to/your/TCGA-LGG_FPKM_data_with_clin_data.txt"

# Step2 -  load/read data
#data <- read.table(file = file_path, header = TRUE, sep = "\t",stringsAsFactors = FALSE, check.names = FALSE)

# Step3 -  View/Inspect the first few rows
#head(data[1:30)


## -----------------------------------------------------------------------------
data(Example_TCGA_LGG_FPKM_data, package = "CPSM")
combined_df <- cbind(
  as.data.frame(colData(Example_TCGA_LGG_FPKM_data))
  [, -ncol(colData(Example_TCGA_LGG_FPKM_data))],
  t(as.data.frame(assay(
    Example_TCGA_LGG_FPKM_data,
    "expression"
  )))
)

# View top rows and first 30 columns of data
print(str(Example_TCGA_LGG_FPKM_data[1:30]),2)

#------------------------ OUTPUTS ---------------------#
# Access the output of function
New_data <- data_process_f(combined_df, col_num = 20, surv_time = "OS.time")

# View/Inspect the first few rows of output data after data pre-processing
str(New_data[1:10])

## -----------------------------------------------------------------------------
#load data
data(New_data, package = "CPSM")

# View top rows and first 30 columns of data
print(head(New_data[1:30]),3)

# Call/Run the function  for "New_data"
result <- tr_test_f(data = New_data, fraction = 0.9)

#------------------------  OUTPUTS ---------------------#
# Access  output from the function :train and test data
train_FPKM <- result$train_data

# View top rows and first 30 columns of data
print(head(train_FPKM[1:30]),2)

# Access output - test data
test_FPKM <- result$test_data

# View top rows and first 30 columns of data
print(head(test_FPKM[1:30]),3)

## -----------------------------------------------------------------------------
# Step 3 - Data Normalization 

#load train and test data from package
data(train_FPKM, package = "CPSM")

# View top rows and first 50 columns of data
print(head(train_FPKM[1:30]),3)

data(test_FPKM, package = "CPSM")

# View top rows and first 50 columns of data
print(head(test_FPKM[1:30]),3)

# Call function to Normalize the training and test data sets
Result_N_data <- train_test_normalization_f(
  train_data = train_FPKM,
  test_data = test_FPKM,
  col_num = 21
)

#------------------------  OUTPUTS ---------------------#
# Access the Normalized train and test data
Train_Clin <- Result_N_data$Train_Clin
Test_Clin <- Result_N_data$Test_Clin
Train_Norm_data <- Result_N_data$Train_Norm_data
Test_Norm_data <- Result_N_data$Test_Norm_data

# view output - train clinincal data 
str(Train_Clin[1:10])

# view output - Train normalized data
str(Train_Norm_data[1:10])

# view output - test clinincal data
str(Test_Clin[1:10])

# view output - Test normalized data
str(Test_Norm_data[1:10])


## ----warning=FALSE, message=FALSE, fig.width=7, fig.height=4------------------
# Step 4 - Lasso PI Score

#load data - Normalized train data
data(Train_Norm_data, package = "CPSM")

# View top rows and first 30 columns of data
print(str(Train_Norm_data[1:30]),2)

#load data - Normalized test data
data(Test_Norm_data, package = "CPSM")

# View top rows and first 30 columns of data
print(str(Test_Norm_data[1:30]),2)

# Call/Run function to select features and generate PI score using LASSO-Regression
Result_PI <- Lasso_PI_scores_f(
  train_data = Train_Norm_data,
  test_data = Test_Norm_data,
  nfolds = 5,
  n_repeats = 50,      # number of repeated CV runs for stability
  alpha = 1,           # 1 = LASSO, 0 < alpha < 1 = Elastic Net
  col_num = 21,
  freq_threshold = 0.6,
  surv_time = "OS_month",
  surv_event = "OS"
)

#------------------------  OUTPUTS ---------------------#
# Access output - Feature stability across repeated CV runs
Stability_table <- Result_PI$Stability_table

# view top features with highest selection frequency
head(Stability_table[order(-Stability_table$Selection_Frequency), ])

# Access output - selected features (with beta coefficient value)) by LASSO
Train_Lasso_key_variables <- Result_PI$Train_Lasso_key_variables

#view top features (from selected set) with beta coeff values
print(head(Train_Lasso_key_variables))


# Access output - Train set with PI values
Train_PI_data <- Result_PI$Train_PI_data

# view Train set with PI values
str(Train_PI_data)

# Access output - Test set with PI values
Test_PI_data <- Result_PI$Test_PI_data

# view Test set with PI values
str(Test_PI_data)

# view plot from LASSO-Lasso regression lambda plot
plot(Result_PI$cvfit)


## ----warning=FALSE, message=FALSE---------------------------------------------
# Step 4b Univariate  Survival Significant Feature Selection.

# load normalized train  data
data(Train_Norm_data, package = "CPSM")

# View top rows and first 30 columns of data
print(head(Train_Norm_data[1:30]),3)

# load normalized test  data
data(Test_Norm_data, package = "CPSM")

# View top rows and first 30 columns of data
print(head(Test_Norm_data[1:30]),3)

# Call/Run function to select features using Univariate COX method
Result_Uni <- Univariate_sig_features_f(
  train_data = Train_Norm_data,
  test_data = Test_Norm_data,
  col_num = 21,
  surv_time = "OS_month",
  surv_event = "OS"
)

#------------------------  OUTPUTS ---------------------#

# Access output - A table of univariate significant genes 
Univariate_Suv_Sig_G_L <- Result_Uni$Univariate_Survival_Significant_genes_List

# view top features (from selected set) using Univariate Cox regression
print(head(Univariate_Suv_Sig_G_L))
dim(Univariate_Suv_Sig_G_L)

# Access output - Train data with only sig features
Train_Uni_sig_data <- Result_Uni$Train_Uni_sig_data

# view Train data with only sig features
if (ncol(Train_Uni_sig_data) > 10) {
  message("Displaying first 10 columns only")
  str(Train_Uni_sig_data[, 1:10])
} else {
  str(Train_Uni_sig_data)
}
# Access output - Test data with only sig features
Test_Uni_sig_data <- Result_Uni$Test_Uni_sig_data

# view Test data with only sig features
if (ncol(Test_Uni_sig_data) > 10) {
  message("Displaying first 10 columns only")
  str(Test_Uni_sig_data[, 1:10])
} else {
  str(Test_Uni_sig_data)
}
# Access output - 
Uni_Sur_Sig_clin_List <- Result_Uni$Univariate_Survival_Significant_clin_List

# view A table of univariate significant clinical feature
print(head(Uni_Sur_Sig_clin_List))

# Access output - ZPH test results (Proportional Hazards assumption diagnostics)
ZPH_test <- Result_Uni$ZPH_Genes

# View ZPH (diagnostics test) results for first 10 genes
print(head(ZPH_test))

#  access a summarized table of PH test results if implemented
 PH_Summary <- Result_Uni$PH_Summary_Genes
 print(head(PH_Summary))

# Filter features that do NOT violate the PH assumption
valid_genes <- PH_Summary$Feature[PH_Summary$PH_Violation == FALSE]

# View the list of genes satisfying the PH assumption
print(head(valid_genes))


## ----warning=FALSE, message=FALSE, error = TRUE-------------------------------
try({

# load data for model building/development

# Load Training data
data(Train_Clin, package = "CPSM")

# View top rows of data
print(str(Train_Clin),2)

# Load Test data
data(Test_Clin, package = "CPSM")

# View top rows of data
print(str(Test_Clin),2)

# Load a list of selected features
data(Key_Clin_feature_list, package = "CPSM")

print(Key_Clin_feature_list)

# Call/Run function to develop MTLR model
Result_Model_Type1 <- MTLR_pred_model_f(
  train_clin_data = Train_Clin,
  test_clin_data = Test_Clin,
  Model_type = 1,
  train_features_data = Train_Clin,
  test_features_data = Test_Clin,
  Clin_Feature_List = Key_Clin_feature_list,
  surv_time = "OS_month",
  surv_event = "OS",
  nfolds = 5
)

#------------------------  OUTPUTS ---------------------#

# Access output - Predicted Survival probabilty across different time points
survCurves_data <- Result_Model_Type1$survCurves_data

# View Predicted Survival probabilty for test samples 
str(survCurves_data)

# Access output - Predicted mean and median survival of test data
mean_median_survival_tim_d <- Result_Model_Type1$mean_median_survival_time_data

# View Predicted mean and median survival time
str(mean_median_survival_tim_d)

# Access output -  all results for test data
survival_result_based_on_MTLR <- Result_Model_Type1$survival_result_based_on_MTLR

# View results on diff folds of Training dataset
CV_Fold_Metrics <- Result_Model_Type1$Fold_Metrics
str(CV_Fold_Metrics)

# View Summary
fold_summary_df <- Result_Model_Type1$fold_summary_df
str(fold_summary_df)

# Access output - Final Evaluation parameters of results
Error_mat_for_Model <- Result_Model_Type1$Error_mat_for_Model

# View Final Evaluation parameters of results on Test dataset
test_results <- Error_mat_for_Model["Test_set", ]
head(test_results)
})

## ----warning=FALSE, message=FALSE, error = TRUE-------------------------------
try({

# Load training data with clinical features
data(Train_Clin, package = "CPSM")

# View top rows of data
print(str(Train_Clin))

# Load test data with clinical features
data(Test_Clin, package = "CPSM")

# View top rows of data
print(str(Train_Clin))

# load training data with PI score
data(Train_PI_data, package = "CPSM")

# View top rows of data
print(head(Train_PI_data),3)

# Load test data with PI score
data(Test_PI_data, package = "CPSM")

# View top rows of data
print(head(Test_PI_data,3))

# Load a list of feature (PI )
data(Key_PI_list, package = "CPSM")

#view list of features to build model
print(str(Key_PI_list))

# Call/Run function to develop MTLR prediction model based on PI score
Result_Model_Type2 <- MTLR_pred_model_f(
  train_clin_data = Train_Clin,
  test_clin_data = Test_Clin,
  Model_type = 2,
  train_features_data = Train_PI_data,
  test_features_data = Test_PI_data,
  Clin_Feature_List = Key_PI_list,
  surv_time = "OS_month",
  surv_event = "OS",
  nfolds = 5
)

#------------------------  OUTPUTS ---------------------#

# Access output - Predicted Survival probabilty across different time points
survCurves_data <- Result_Model_Type2$survCurves_data

# View Predicted Survival probabilty for test samples 
str(survCurves_data)

# Access output - Predicted mean and median survival of test data
mean_median_surviv_tim_da <- Result_Model_Type2$mean_median_survival_time_data

# View Predicted mean and median survival of test data
str(mean_median_surviv_tim_da)

# Access output - all results for test data
survival_result_b_on_MTLR <- Result_Model_Type2$survival_result_based_on_MTLR

# View results on diff folds of training data
CV_Fold_Metrics <- Result_Model_Type2$Fold_Metrics
str(CV_Fold_Metrics)

# Access and View summary
fold_summary_df <- Result_Model_Type2$fold_summary_df
str(fold_summary_df)

# Access output - Final Evaluation results on Test data
Error_mat_for_Model <- Result_Model_Type1$Error_mat_for_Model
test_results <- Error_mat_for_Model["Test_set", ]
head(test_results)

})

## ----warning=FALSE, message=FALSE, error = TRUE-------------------------------
try({

# Load training data with clinical feature
data(Train_Clin, package = "CPSM")

# View top rows of data
print(str(Train_Clin),2)

# Load test data with clinical feature
data(Test_Clin, package = "CPSM")

# View top rows of data
print(str(Test_Clin),2)

# Load training data with PI score value
data(Train_PI_data, package = "CPSM")

# View top rows of data
print(head(Train_PI_data),3)

# Load test data with PI score value
data(Test_PI_data, package = "CPSM")

# View top rows of data
print(head(Test_PI_data),3)

# Load a list of feature containing Clinical feature along with PI
data(Key_Clin_features_with_PI_list, package = "CPSM")

# View top rows of feature list based on which model will build
print(head(Key_Clin_features_with_PI_list))

# Call/Run function to develop MTLR prediction model based on PI score with Clinical features
Result_Model_Type3 <- MTLR_pred_model_f(
  train_clin_data = Train_Clin,
  test_clin_data = Test_Clin,
  Model_type = 3,
  train_features_data = Train_PI_data,
  test_features_data = Test_PI_data,
  Clin_Feature_List = Key_Clin_features_with_PI_list,
  surv_time = "OS_month",
  surv_event = "OS",
  nfolds = 5
)

#------------------------  OUTPUTS ---------------------#

# Access output - Predicted Survival probabilty across different time points
survCurves_data <- Result_Model_Type3$survCurves_data

# View Predicted Survival probabilty for test samples
str(survCurves_data)

# Access output - Predicted mean and median survival of test data
mean_median_surv_tim_da <- Result_Model_Type3$mean_median_survival_time_data

# View Predicted mean and median survival of test data
str(mean_median_surv_tim_da) 

# Access output - all results for test data
survival_result_b_on_MTLR <- Result_Model_Type3$survival_result_based_on_MTLR

# View results on diff folds of Training data
CV_Fold_Metrics <- Result_Model_Type3$Fold_Metrics
str(CV_Fold_Metrics)
  

fold_summary_df <- Result_Model_Type3$fold_summary_df
str(fold_summary_df)

# Access output - Final Evaluation parameters of results
Error_mat_for_Model <- Result_Model_Type1$Error_mat_for_Model
test_results <- Error_mat_for_Model["Test_set", ]
head(test_results)

})

## ----warning=FALSE, message=FALSE, error = TRUE-------------------------------
try({

# Load training data with clinical features
data(Train_Clin, package = "CPSM")

# View top rows of data
print(head(Train_Clin,3))

# Load test data with clinical features
data(Test_Clin, package = "CPSM")

# View top rows of data
print(head(Test_Clin,3))

# Load normalized training data 
data(Train_Uni_sig_data, package = "CPSM")

# View top rows of data with first 10 columns
# view Train data with only sig features
if (ncol(Train_Uni_sig_data) > 10) {
  message("Displaying first 10 columns only")
  str(Train_Uni_sig_data[, 1:10])
} else {
  str(Train_Uni_sig_data)
}

# Load normalized test data
data(Test_Uni_sig_data, package = "CPSM")

# view Test data with only sig features
if (ncol(Test_Uni_sig_data) > 10) {
  message("Displaying first 10 columns only")
  str(Test_Uni_sig_data[, 1:10])
} else {
  str(Test_Uni_sig_data)
}
# Load a list of clinical feature along with top feature from univariate survival analysis
data(Key_univariate_features_with_Clin_list, package = "CPSM")

# View list of feature based on which model will built
print(head(Key_univariate_features_with_Clin_list))

# Call/Run function to develop MTLR model based on Clinical and selected univariate features 
Result_Model_Type5 <- MTLR_pred_model_f(
  train_clin_data = Train_Clin,
  test_clin_data = Test_Clin,
  Model_type = 4,
  train_features_data = Train_Uni_sig_data,
  test_features_data = Test_Uni_sig_data,
  Clin_Feature_List = Key_univariate_features_with_Clin_list,
  surv_time = "OS_month",
  surv_event = "OS",
  nfolds = 5
)

#------------------------  OUTPUTS ---------------------#

# Access output - Predicted Survival probabilty across different time points
survCurves_data <- Result_Model_Type5$survCurves_data

# View Predicted Survival probabilty for test samples
str(survCurves_data)

# Access output - Predicted mean and median survival of test data
mean_median_surv_tim_da <- Result_Model_Type5$mean_median_survival_time_data

# View Predicted mean and median survival of test data
str(mean_median_surv_tim_da)

# Access output - all results for test data
survival_result_b_on_MTLR <- Result_Model_Type5$survival_result_based_on_MTLR


# View results on diff folds of Training dataset
CV_Fold_Metrics <- Result_Model_Type5$Fold_Metrics
str(CV_Fold_Metrics)


fold_summary_df <- Result_Model_Type5$fold_summary_df
str(fold_summary_df)

# Access output - Final Evaluation parameters on test dataset
Error_mat_for_Model <- Result_Model_Type1$Error_mat_for_Model
test_results <- Error_mat_for_Model["Test_set", ]
head(test_results)
})

## ----warning=FALSE, message=FALSE, error = TRUE , fig.width=7, fig.height=4----
try({
# Create Survival curves/plots for individual patients

# Load predicted survival probability (at different time points ) data of test samples to plot survival curve plot
data(survCurves_data, package = "CPSM")

# View top rows of data
print(head(survCurves_data,3))

# Call/Run functions to plot survival curve plot
plots <- surv_curve_plots_f(
  Surv_curve_data = survCurves_data,
  selected_sample = "TCGA-TQ-A7RQ-01",
  font_size = 12, 
  line_size = 0.5, 
  all_line_col = "grey70", 
  highlight_col = "red"
)

#------------------------  OUTPUTS ---------------------#
# Access output - print survival plot with all patients in test data
print(plots$all_patients_plot)

# Access output - print survival plot with highlighting the survival curve of selected patient
print(plots$highlighted_patient_plot)

})

## ----warning=FALSE, message=FALSE, error = TRUE , fig.width=7, fig.height=4----
try({

# Load data of predicted mean/median survival time for test samples
data(mean_median_survival_time_data, package = "CPSM")


# View top rows of data 
print(head(mean_median_survival_time_data),3)

custom_colors <- c(
  "TRUE.Mean"   = "cyan",
  "TRUE.Median" = "pink",
  "FALSE.Mean"  = "gray90",
  "FALSE.Median"= "gray70"
)

# Call/Run functions to plot barplot 
plots_2 <- mean_median_surv_barplot_f(
  surv_mean_med_data =
    mean_median_survival_time_data,
  selected_sample = "TCGA-TQ-A7RQ-01",
  font_size = 8,
  font_color = "black",
  bar_colors = custom_colors

)

#------------------------  OUTPUTS ---------------------#
# Access output - Print barplots representing predicted mean/median survival time of patients
print(plots_2$mean_med_all_pat)

# Access output - Print barplots representing predicted mean/median survival time of patients with highlighting selected patient
print(plots_2$highlighted_selected_pat)

})

## ----warning=FALSE, message=FALSE, error=TRUE---------------------------------
try({
# Load example data from CPSM package

# Load training data with selected features (e.g. PI values)
data(Train_PI_data, package = "CPSM")

# View top rows of data 
print(head(Train_PI_data),3)

# Load test data with selected features (e.g. PI values)
data(Test_PI_data, package = "CPSM")

# View top rows of data 
print(head(Train_PI_data),3)

# Load feature list
data(Key_PI_list, package = "CPSM")

# View feature list
print(str(Key_PI_list))

# Call/Run function to Predict survival-based risk groups for test samples
Results_Risk_group_Prediction <- predict_survival_risk_group_f(
  selected_train_data = Train_PI_data,
  selected_test_data = Test_PI_data,
  Feature_List = Key_PI_list
)

#------------------------  OUTPUTS ---------------------#

# Access output - Performance of the best model on Training and Test data
Best_model_Prediction_results<- Results_Risk_group_Prediction$misclassification_results 

# Results on test data - Evaluation performance parameters
test_results1 <- Best_model_Prediction_results[, grep("^Test_", colnames(Best_model_Prediction_results))]

# View results on Test dataset - Evaluation performance parameters
head(test_results1)

# View Prediction results of the best model on Test set
Test_results <-  Results_Risk_group_Prediction$Test_results #Prediction resulst on Test data
print(head(Test_results))

#Prediction resulst on each fold of training data
Fold_results <- Results_Risk_group_Prediction$CV_Fold_Results 


#Prediction resulst on each fold of training data 
Fold_metrics <- do.call(rbind, lapply(Fold_results, function(f) { 
data.frame( Fold = f$Fold, 
Sensitivity = as.numeric(f$Sensitivity), 
Specificity = as.numeric(f$Specificity), 
Accuracy = as.numeric(f$Accuracy), 
Precision = as.numeric(f$Precision),
Recall = as.numeric(f$Recall),
F1 = as.numeric(f$F1) ) })) 

print(Fold_metrics)

})

## ----fig.height=6, fig.width=8, warning=FALSE, message=FALSE------------------
# Load example data

# Load predicted risk-group results for training samples
data(Train_results, package = "CPSM")

# View top rows of data 
print(head(Train_results),3)

# Load predicted risk-group results for test samples
data(Test_results, package = "CPSM")

# View top rows of data 
print(head(Test_results),3)

# Load predicted survival probabiliy data (at multiple time points) for test samples
data(survCurves_data, package = "CPSM")

# View top rows of data 
print(head(survCurves_data),3)

# Select a test sample to visualize
sample_id <- "TCGA-TQ-A7RQ-01"  

# Generate KM overlay plot 
KM_plot <- km_overlay_plot_f( Train_results = Train_results, 
	Test_results = Test_results, 
	survcurve_te_data = survCurves_data, 
	selected_sample = sample_id,
	font_size         = 12,                 #font size 
  	train_palette     = c("firebrick", "blue"), # custom risk group colors
  	test_curve_col    = "darkgreen",           # highlight test sample in darkgreen
  	test_curve_size   = 1.2,                 # thicker test sample curve
  	test_curve_lty    = "dotdash",           # dashed-dot test curve
  	annotation_col    = "black"            # annotation text in black 
)

#------------------------  OUTPUTS ---------------------#

# View KM plot representing the comparison of survival curve of selected test sample vs survival curves of risk-groups of training data 
KM_plot


## ----warning=FALSE, message=FALSE, error = TRUE, fig.width=7, fig.height=6----
try({

# Load Normalozed Training data with  along with Survival info selected features (based on which user want to develop nomogram) and survival info
data(Train_Data_Nomogram_input, package = "CPSM")

# View top rows of data 
print(head(Train_Data_Nomogram_input[1:30]),3)

# Load a list of selected features (based on which user want to develop nomogram)
data(feature_list_for_Nomogram, package = "CPSM")

# View feature list to build nomogram
print(str(feature_list_for_Nomogram))

# Call/run function to generate nomogram
Result_Nomogram <- Nomogram_generate_f(
  data = Train_Data_Nomogram_input,
  Feature_List = feature_list_for_Nomogram,
  surv_time = "OS_month",
  surv_event = "OS",
  font_size = 0.8,
  axis_cex = 0.5,
  tcl_len = 0.5,
  label_margin = 0.5,
  col_grid = gray(c(0.85, 0.95))
)

#------------------------  OUTPUTS ---------------------#

# Access output - C-index value and Nomogram plot
C_index_mat <- Result_Nomogram$C_index_mat

#display C-index
print(C_index_mat)


})

## -----------------------------------------------------------------------------
sessionInfo()