## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----install, eval = FALSE----------------------------------------------------
# if (!"BiocManager" %in% rownames(installed.packages()))
#     install.packages("BiocManager", repos = "https://cran.r-project.org")

## ----install-Bioconductor, eval = FALSE---------------------------------------
# BiocManager::install("SurfR")

## ----install-github, eval = FALSE---------------------------------------------
# # install.packages("devtools")
# devtools::install_github("auroramaurizio/SurfR", build_vignettes = TRUE)

## ----gene2protein, eval=FALSE-------------------------------------------------
# library(SurfR)
# 
# GeneNames <- c("CIITA", "EPCAM", "CD24", "CDCP1", "LYVE1")
# SurfaceProteins_df <- Gene2SProtein(GeneNames,
#                                     input_type = "gene_name",
#                                     output_tsv = FALSE,
#                                     Surfy_version = "new")
# #The output dataframe contains several information retrieved from Surfy.
# colnames(SurfaceProteins_df)
# #These are the 5 surface protein coding genes detected by SurfR.
# SurfaceProteins_df$UniProt.gene

## ----simulate zhang, eval=FALSE-----------------------------------------------
# library(SPsimSeq)
# 
# data("zhang.data.sub")
# zhang.counts <- zhang.data.sub$counts
# MYCN.status  <- zhang.data.sub$MYCN.status
# 
# # Simulation of bulk RNA data
# sim.data.bulk <- SPsimSeq(n.sim = 1,
#                           s.data = zhang.counts,
#                           group = MYCN.status,
#                           n.genes = 1000,
#                           batch.config = 1,
#                           group.config = c(0.5, 0.5),
#                           tot.samples = ncol(zhang.counts),
#                           pDE = 0.2,
#                           lfc.thrld = 0.5,
#                           result.format = "list",
#                           return.details = TRUE)
# 
# sim.data.bulk1 <- sim.data.bulk$sim.data.list[[1]]
# 
# countMatrix <- sim.data.bulk1$counts
# row.names(countMatrix) <- row.names(zhang.counts)
# metadata <- sim.data.bulk1$colData
# metadata$Group <- as.factor(metadata$Group)

## ----DGE zhang, eval=FALSE----------------------------------------------------
# library(SurfR)
# 
# df_zhang <- DGE(expression = countMatrix,
#                 metadata = metadata,
#                 Nreplica = 50,
#                 design = "~Group",
#                 condition = "Group",
#                 alpha = 0.05,
#                 TEST = "1", CTRL =  "0",
#                 output_tsv = FALSE)
# 
# head(df_zhang)

## ----G2P zhang, eval=FALSE----------------------------------------------------
# # remove NA values
# df_zhang <- df_zhang[!is.na(df_zhang$padj), ]
# 
# # all fdr
# fdr_GeneID <- df_zhang[df_zhang$padj < 0.05, "GeneID"]
# SP <- Gene2SProtein(genes = fdr_GeneID, input_type = "gene_name")
# 
# # upregulated fdr
# fdrUP_GeneID <- df_zhang[df_zhang$padj < 0.05 & df_zhang$log2FoldChange > 0,
#                          "GeneID"]
# SPup <- Gene2SProtein(genes = fdrUP_GeneID, input_type = "gene_name")
# 
# # dowregulated fdr
# fdrDW_GeneID <- df_zhang[df_zhang$padj < 0.05 & df_zhang$log2FoldChange < 0,
#                          "GeneID"]
# SPdw <- Gene2SProtein(genes = fdrDW_GeneID, input_type = "gene_name")

## ----GEO input meta_counts, eval=FALSE----------------------------------------
# library(SurfR)
# library(stringr)
# 
# # Download metadata from GEO
# mGSE121810 <- GEOmetadata(GSE = "GSE121810")
# 
# # create new metadata column in order to remove unwanted special characters
# unwanted_character <- " "
# fx <- function(x) {
#   str_split(string = x, pattern = unwanted_character)[[1]][1]
# }
# mGSE121810$condition <- sapply(mGSE121810$therapy, fx)
# mGSE121810$condition <- as.factor(mGSE121810$condition)
# 
# # Preview metadata
# head(mGSE121810)
# 
# # only select 3 samples per condition to save time
# na_samples <- c("GSM3447013", "GSM3447018", "GSM3447019")
# a_samples <- c("GSM3447023", "GSM3447024", "GSM3447026")
# mGSE121810 <- mGSE121810[c(na_samples, a_samples), ]
# 
# # Download count matrix from ArchS4
# cGSE121810 <- DownloadArchS4(mGSE121810$GSM,
#                              species = "human",
#                              print_tsv = FALSE,
#                              filename = NULL)
# 
# # Preview count matrix
# head(cGSE121810[, ])

## ----GEO dge, eval=FALSE------------------------------------------------------
# # Perform DGE
# df_GEO <- DGE(expression = cGSE121810,
#               metadata = mGSE121810,
#               Nreplica = 3,
#               design = "~condition",
#               condition = "condition",
#               alpha = 0.05,
#               TEST = "neoadjuvant", CTRL =  "adjuvant",
#               output_tsv = FALSE)
# 
# # remove NA values
# df_GEO <- df_GEO[!is.na(df_GEO$padj), ]
# 
# head(df_GEO)

## ----GEO SP, eval=FALSE-------------------------------------------------------
# # Detect SP amoung differentially expressed genes
# fdr_GeneID <- df_GEO[df_GEO$padj < 0.1, "GeneID"]
# 
# SP <- Gene2SProtein(genes = fdr_GeneID, input_type = "gene_name")
# 
# fdrUP_GeneID <- df_GEO[df_GEO$padj < 0.1 & df_GEO$log2FoldChange > 0, "GeneID"]
# SPup <- Gene2SProtein(genes = fdrUP_GeneID, input_type = "gene_name")
# 
# fdrDW_GeneID <- df_GEO[df_GEO$padj < 0.1 & df_GEO$log2FoldChange < 0, "GeneID"]
# SPdw <- Gene2SProtein(genes = fdrDW_GeneID, input_type = "gene_name")
# 

## ----TCGA install, eval=FALSE-------------------------------------------------
# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# BiocManager::install("BioinformaticsFMRP/TCGAbiolinksGUI.data")
# BiocManager::install("BioinformaticsFMRP/TCGAbiolinks")

## ----TCGA THYM Download, warning=FALSE, message = FALSE, eval=FALSE-----------
# # Download TCGA data
# # To save time only download 3 Tumor samples and 3 normal samples.
# # If barcodes are not specified TCGA_download function will download all the
# # available samples in the TCGA study
# barcodes2Download <- c("TCGA-X7-A8D6-11A-22R-A42C-07",
#                        "TCGA-X7-A8D7-11A-11R-A42C-07",
#                        "TCGA-XM-A8RB-01A-11R-A42C-07",
#                        "TCGA-X7-A8M0-01A-11R-A42C-07")
# TCGA.THYM <- TCGA_download(project = "TCGA-THYM", barcodes = barcodes2Download)
# 
# cTCGA.THYM <- TCGA.THYM[[1]]
# mTCGA.THYM <- TCGA.THYM[[2]]
# table(mTCGA.THYM$shortLetterCode)
# 
# mTCGA.THYM$shortLetterCode <- as.factor(mTCGA.THYM$shortLetterCode)

## ----TCGA DGE, eval=FALSE-----------------------------------------------------
# df_TCGA <- DGE(expression = cTCGA.THYM,
#                metadata = mTCGA.THYM,
#                Nreplica = 2,
#                design = "~shortLetterCode",
#                condition = "shortLetterCode",
#                alpha = 0.05,
#                TEST = "TP", CTRL =  "NT",
#                output_tsv = FALSE)
# 
# head(df_TCGA)

## ----TCGA SP, eval=FALSE------------------------------------------------------
# # remove NA values
# df_TCGA <- df_TCGA[!is.na(df_TCGA$padj), ]
# 
# fdr_GeneID <- df_TCGA[df_TCGA$padj < 0.05,
#                       "GeneID"]
# 
# SP <- Gene2SProtein(genes = fdr_GeneID, input_type = "gene_name")
# 
# fdrUP_GeneID <- df_TCGA[df_TCGA$padj < 0.05 & df_TCGA$log2FoldChange > 0,
#                         "GeneID"]
# SPup <- Gene2SProtein(genes = fdrUP_GeneID, input_type = "gene_name")
# 
# fdrDW_GeneID <- df_TCGA[df_TCGA$padj < 0.05 & df_TCGA$log2FoldChange < 0,
#                         "GeneID"]
# SPdw <- Gene2SProtein(genes = fdrDW_GeneID, input_type = "gene_name")

## ----TCGA BRCA Download, warning=FALSE, message = FALSE, eval=FALSE-----------
# # Download TCGA data
# # To save time only download 3 Tumor samples and 3 normal samples.
# # If barcodes are not specified TCGA_download function will download all the
# # available samples in the TCGA study
# barcodes2Download <- c("TCGA-BH-A1FU-11A-23R-A14D-07",
#                        "TCGA-BH-A1FC-11A-32R-A13Q-07",
#                        "TCGA-BH-A0DO-11A-22R-A12D-07",
#                        "TCGA-B6-A0RH-01A-21R-A115-07",
#                        "TCGA-BH-A1FU-01A-11R-A14D-07",
#                        "TCGA-A1-A0SE-01A-11R-A084-07")
# TCGA.BRCA <- TCGA_download(project = "TCGA-BRCA", barcodes = barcodes2Download)

## ----meta TCGA_input, eval=FALSE----------------------------------------------
# cTCGA.BRCA <- TCGA.BRCA[[1]]
# mTCGA.BRCA <- TCGA.BRCA[[2]]
# table(mTCGA.BRCA$shortLetterCode)
# 

## ----meta GEO_input metadata, eval = FALSE------------------------------------
# mGSE58135 <- GEOmetadata("GSE58135")
# mGSE58135 <- mGSE58135[mGSE58135$tissue != "Breast Cancer Cell Line", ]
# mGSE58135$condition <- "NT"
# mGSE58135$condition[mGSE58135$tissue %in% c("ER+ Breast Cancer Primary Tumor",
#                                             "Triple Negative Breast Cancer Primary Tumor")] <- "TP"
# 
# # only select 3 samples per condition to save time
# TP_samples <- c("GSM1401694", "GSM1401717", "GSM1401729")
# NT_samples <- c("GSM1401799", "GSM1401813", "GSM1401814")
# mGSE58135 <- mGSE58135[c(TP_samples, NT_samples), ]

## ----meta GEO_input counts, eval = FALSE--------------------------------------
# cGSE58135 <- DownloadArchS4(mGSE58135$GSM, species = "human")
# cGSE58135 <- cGSE58135[, row.names(mGSE58135)]
# 
# table(mGSE58135$condition)
# 

## ----meta DGE TCGA, eval=FALSE------------------------------------------------
# # TCGA DGE
# df.TCGA <- DGE(expression = cTCGA.BRCA,
#                metadata = mTCGA.BRCA,
#                Nreplica = 3,
#                design = "~shortLetterCode",
#                condition = "shortLetterCode",
#                alpha = 0.05,
#                TEST = "TP", CTRL =  "NT",
#                output_tsv = FALSE)
# head(df.TCGA)

## ----meta DGE GEO, eval = FALSE-----------------------------------------------
# # GSE58135 DGE
# df.GSE58135 <- DGE(expression = cGSE58135,
#                    metadata = mGSE58135,
#                    Nreplica = 3,
#                    design = "~condition",
#                    condition = "condition",
#                    alpha = 0.05,
#                    TEST = "TP", CTRL =  "NT",
#                    output_tsv = FALSE)
# head(df.GSE58135)

## ----meta fishercomb, eval=FALSE----------------------------------------------
# 
# L_fishercomb <- metaRNAseq(ind_deg = list(TCGA.BRCA =  df.TCGA, GEO.GSE58135 = df.GSE58135),
#                            test_statistic = "fishercomb",
#                            BHth = 0.05,
#                            adjpval.t = 0.05)

## ----meta invnorm, eval=FALSE-------------------------------------------------
# L_invnorm <- metaRNAseq(ind_deg = list(TCGA.BRCA =  df.TCGA, GEO.GSE58135 = df.GSE58135),
#                         test_statistic = "invnorm",
#                         BHth = 0.05,
#                         adjpval.t = 0.05,
#                         nrep = c(102, 56))

## ----metacombine, eval=FALSE--------------------------------------------------
# metacomb <- combine_fisher_invnorm(ind_deg = list(TCGA.BRCA =  df.TCGA,
#                                                   GEO.GSE58135 = df.GSE58135),
#                                    invnorm = L_invnorm,
#                                    fishercomb = L_fishercomb,
#                                    adjpval = 0.05)
# 
# 
# metacomb_GeneID <- metacomb[metacomb$signFC != 0, "GeneID"]
# SP <- Gene2SProtein(genes = metacomb_GeneID, input_type = "gene_name")

## ----metacombine up, eval=FALSE-----------------------------------------------
# metacombUP_GeneID <- metacomb[metacomb$signFC == 1, "GeneID"]
# SPup <- Gene2SProtein(genes = metacombUP_GeneID, input_type = "gene_name")

## ----metacombine dw, eval=FALSE-----------------------------------------------
# metacombDW_GeneID <- metacomb[metacomb$signFC == -1, "GeneID"]
# SPdw <- Gene2SProtein(genes = metacombDW_GeneID, input_type = "gene_name")
# 

## ----enrich, eval=FALSE-------------------------------------------------------
# 
# dfList <- list(GEO = df.GSE58135,
#                TCGA = df.TCGA)
# 
# # Enrichment analysis
# Enrich <- Enrichment(dfList,
#                      enrich.databases = c("GO_Biological_Process_2021",
#                                           "KEGG_2021_Human"),
#                      p_adj = 0.05, logFC = 1)
# 
# head(Enrich$GEO$fdr_up$GO_Biological_Process_2021)

## ----fig.width=7, fig.height=2.5, eval=FALSE----------------------------------
# library(ggplot2)
# 
# # barplot of the top 5 upregulated pathways
# Enrichment_barplot(Enrich$GEO,
#                    enrich.databases <- c("GO_Biological_Process_2021",
#                                          "KEGG_2021_Human"),
#                    p_adj = 0.05,
#                    num_term = 5,
#                    cond = "UP")
# 
# # barplot of the top 5 downregulated pathways
# Enrichment_barplot(Enrich$GEO,
#                    enrich.databases <- c("GO_Biological_Process_2021",
#                                          "KEGG_2021_Human"),
#                    p_adj = 0.05,
#                    num_term = 5,
#                    cond = "DOWN")

## ----anno spid, eval=FALSE----------------------------------------------------
# annotated <- Annotate_SPID(df.GSE58135, "WikiPathway_2021_Human")
# head(annotated, 10)

## ----fig.height= 4.5, fig.width=7, eval = FALSE-------------------------------
# # upregulated genes in GEO dataset
# fdrUP_GeneID <- df.GSE58135[df.GSE58135$padj < 0.05 & df.GSE58135$log2FoldChange > 0, "GeneID"]
# # corresponding Surface Proteins
# SPup <- Gene2SProtein(genes = fdrUP_GeneID, input_type = "gene_name")
# # Barplot of Almen classification
# Splot(SPup,
#       group.by = "Membranome.Almen.main-class",
#       main = "Almen class")

## ----fig.height= 6, fig.width=7, eval = FALSE---------------------------------
# 
# S_list <- list(SP_1 = c("EPCAM", "CD24",  "DLK1",  "CDCP1", "LYVE1"),
#                SP_2 = c("DLK1", "EPCAM", "EGFR", "UPK1A", "UPK2"))
# 
# SVenn(S_list,
#       cols.use = c("green", "blue"),
#       opacity = 0.5,
#       output_intersectionFile = FALSE)
# 

## ----fig.height= 6, fig.width=7, eval = FALSE---------------------------------
# 
# SurfR::plotPCA(matrix = edgeR::cpm(cGSE58135), metadata = mGSE58135,
#                dims = c(1, 2),
#                color.by = "condition", shape.by = "condition",
#                label = FALSE, main = "PCA GSE58135")

## -----------------------------------------------------------------------------
sessionInfo()