## ----setup, echo=FALSE, messages=FALSE, warnings=FALSE------------------------
suppressPackageStartupMessages({
    library(signatureSearchData)
})
# knitr::opts_knit$set(root.dir = "~/insync/project/longevityTools_eDRUG/")

## ----install, eval=FALSE------------------------------------------------------
# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install("signatureSearchData")

## ----load, eval=FALSE---------------------------------------------------------
# library(signatureSearchData)

## ----eh_explore_ssd1, eval=TRUE, warning=FALSE, message=FALSE-----------------
library(ExperimentHub)
eh <- ExperimentHub()
ssd <- query(eh, c("signatureSearchData"))
ssd

## ----eh_explore_ssd2, eval=TRUE, warning=FALSE, message=FALSE-----------------
ssd$title

## ----eh_explore_ssd3, eval=TRUE-----------------------------------------------
as.list(ssd)[10]

## ----lincs, eval=FALSE--------------------------------------------------------
# library(ExperimentHub); library(rhdf5)
# eh <- ExperimentHub()
# query(eh, c("signatureSearchData", "lincs"))
# lincs_path <- eh[['EH3226']]
# rhdf5::h5ls(lincs_path)

## ----filter_meta42, eval=FALSE------------------------------------------------
# meta42 <- readr::read_tsv("./data/GSE92742_Broad_LINCS_sig_info.txt")
# dose <- meta42$pert_idose[7]
# ## filter rows by 'pert_type' as compound, 10uM concentration, and 24h treatment time
# meta42_filter <- sig_filter(meta42, pert_type="trt_cp", dose=dose, time="24 h") # 45956 X 14

## ----extract_modz, eval=FALSE-------------------------------------------------
# library(signatureSearch)
# gctx <- "./data/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx"
# gctx2h5(gctx, cid=meta42_filter$sig_id, new_cid=meta42_filter$pert_cell_factor,
#         h5file="./data/lincs.h5", by_ncol=5000, overwrite=TRUE)
# library(HDF5Array)
# se <- SummarizedExperiment(HDF5Array("./data/lincs.h5", name="assay"))
# rownames(se) <- HDF5Array("./data/lincs.h5", name="rownames")
# colnames(se) <- HDF5Array("./data/lincs.h5", name="colnames")

## ----lincs_degs, eval=FALSE---------------------------------------------------
# library(signatureSearch)
# # Get up and down 150 DEGs
# degs <- getDEGSig(cmp="vorinostat", cell="SKB", refdb="lincs", Nup=150, Ndown=150)
# 
# # Get DEGs by setting cutoffs
# degs2 <- getDEGSig(cmp="vorinostat", cell="SKB", refdb="lincs", higher=2, lower=-2)

## ----lincs_degs_db, eval=FALSE------------------------------------------------
# # gCMAP method
# gep <- getSig("vorinostat", "SKB", refdb="lincs")
# qsig_gcmap <- qSig(query = gep, gess_method = "gCMAP", refdb = "lincs")
# gcmap_res <- gess_gcmap(qsig_gcmap, higher=2, lower=-2)
# # Fisher method
# qsig_fisher <- qSig(query = degs, gess_method = "Fisher", refdb = "lincs")
# fisher_res <- gess_fisher(qSig=qsig_fisher, higher=2, lower=-2)

## ----lincs_expr, eval=FALSE---------------------------------------------------
# library(ExperimentHub)
# eh <- ExperimentHub()
# query(eh, c("signatureSearchData", "lincs_expr"))
# lincs_expr_path <- eh[['EH3227']]

## ----filter_expr, eval=FALSE--------------------------------------------------
# inst42 <- readr::read_tsv("./data/GSE92742_Broad_LINCS_inst_info.txt")
# inst42_filter <- inst_filter(inst42, pert_type="trt_cp", dose=10, dose_unit="um",
#                              time=24, time_unit="h") # 169,795 X 13

## ----extract_expr, eval=FALSE-------------------------------------------------
# # It takes some time
# library(signatureSearch)
# meanExpr2h5(gctx="./data/GSE92742_Broad_LINCS_Level3_INF_mlr12k_n1319138x12328.gctx",
#             inst=inst42_filter, h5file="./data/lincs_expr.h5") # 12328 X 38824

## ----lincs2, eval=FALSE-------------------------------------------------------
# library(ExperimentHub); library(rhdf5)
# eh <- ExperimentHub()
# query(eh, c("signatureSearchData", "lincs2"))
# lincs2_path <- eh[['EH7297']]
# rhdf5::h5ls(lincs2_path)

## ----lincs2_hdf5, eval=FALSE--------------------------------------------------
# siginfo_beta <- fread("https://s3.amazonaws.com/macchiato.clue.io/builds/LINCS2020/siginfo_beta.txt")
# exemplar <- siginfo_beta %>% filter(pert_type=="trt_cp" & is_exemplar_sig == 1)
# new_cid <- paste(exemplar$pert_id, exemplar$cell_iname, rep("trt_cp", length(exemplar$cmap_name)), sep="__")
# gctx2h5("level5_beta_trt_cp_n720216x12328.gctx", cid=exemplar$sig_id, new_cid=new_cid,
#         h5file="lincs2.h5", by_ncol=5000, overwrite=TRUE)
# DBpath <- "lincs2.h5"
# sedb <- SummarizedExperiment(HDF5Array(DBpath, name="assay"))
# rownames(sedb) <- HDF5Array(DBpath, name="rownames")
# colnames(sedb) <- HDF5Array(DBpath, name="colnames")

## ----cmap_rank, eval=FALSE----------------------------------------------------
# library(ExperimentHub)
# eh <- ExperimentHub()
# query(eh, c("signatureSearchData", "cmap_rank"))
# cmap_rank_path <- eh[["EH3225"]]
# se <- SummarizedExperiment(HDF5Array(cmap_rank_path, name="assay"))
# rownames(se) <- HDF5Array(cmap_rank_path, name="rownames")
# colnames(se) <- HDF5Array(cmap_rank_path, name="colnames")

## ----filter_rankm, eval=FALSE-------------------------------------------------
# path <- system.file("extdata", "cmap_instances_02.txt", package="signatureSearchData")
# cmap_inst <- read.delim(path, check.names=FALSE)
# inst_id <- cmap_inst$instance_id[!duplicated(paste(cmap_inst$cmap_name, cmap_inst$cell2, sep="_"))]
# rankM <- read.delim("./data/rankMatrix.txt", check.names=FALSE, row.names=1) # 22283 X 6100
# rankM_sub <- rankM[, as.character(inst_id)]
# colnames(rankM_sub) <- unique(paste(cmap_inst$cmap_name, cmap_inst$cell2, "trt_cp", sep="__"))

## ----affyid_annot, eval=FALSE, message=FALSE----------------------------------
# library(hgu133a.db)
# myAnnot <- data.frame(ACCNUM=sapply(contents(hgu133aACCNUM), paste, collapse=", "),
#                       SYMBOL=sapply(contents(hgu133aSYMBOL), paste, collapse=", "),
#                       UNIGENE=sapply(contents(hgu133aUNIGENE), paste, collapse=", "),
#                       ENTREZID=sapply(contents(hgu133aENTREZID), paste, collapse=", "),
#                       ENSEMBL=sapply(contents(hgu133aENSEMBL), paste, collapse=", "),
#                       DESC=sapply(contents(hgu133aGENENAME), paste, collapse=", "))
# saveRDS(myAnnot, "./data/myAnnot.rds")

## ----mr_prob, eval=FALSE------------------------------------------------------
# rankM_sub_gene <- probe2gene(rankM_sub, myAnnot)

## ----cmap_rank2h5, eval=FALSE-------------------------------------------------
# matrix2h5(rankM_sub_gene, "./data/cmap_rank.h5", overwrite=TRUE) # 12403 X 3587
# rhdf5::h5ls("./data/cmap_rank.h5")
# ## Read in cmap_rank.h5 as SummarizedExperiment object
# se <- SummarizedExperiment(HDF5Array("./data/cmap_rank.h5", name="assay"))
# rownames(se) <- HDF5Array("./data/cmap_rank.h5", name="rownames")
# colnames(se) <- HDF5Array("./data/cmap_rank.h5", name="colnames")

## ----cmap_expr, eval=FALSE----------------------------------------------------
# library(ExperimentHub)
# eh <- ExperimentHub()
# query(eh, c("signatureSearchData", "cmap_expr"))
# cmap_expr_path <- eh[["EH3224"]]

## ----work_envir, eval=FALSE---------------------------------------------------
# dir.create("data"); dir.create("data/CEL"); dir.create("results")

## ----download_cmap, eval=FALSE------------------------------------------------
# getCmapCEL(rerun=FALSE)

## ----get_cel_type, eval=FALSE-------------------------------------------------
# library(affxparser)
# celfiles <- list.files("./data/CEL", pattern=".CEL$")
# chiptype <- sapply(celfiles, function(x) affxparser::readCelHeader(paste0("data/CEL/", x))$chiptype)
# saveRDS(chiptype, "./data/chiptype.rds")

## ----normalize_chips, eval=FALSE----------------------------------------------
# chiptype <- readRDS("./data/chiptype.rds")
# chiptype_list <- split(names(chiptype), as.character(chiptype))
# normalizeCel(chiptype_list, batchsize=200, rerun=FALSE)

## ----comb_chip_type_data, eval=FALSE------------------------------------------
# chiptype_dir <- unique(chiptype)
# combineResults(chiptype_dir, rerun=FALSE)
# mas5df <- combineNormRes(chiptype_dir, norm_method="MAS5")

## ----prof2gene, eval=FALSE----------------------------------------------------
# myAnnot <- readRDS("./data/myAnnot.rds")
# mas5df <- probe2gene(mas5df, myAnnot)
# saveRDS(mas5df,"./data/mas5df.rds")

## ----rma2cmap_expr, eval=FALSE------------------------------------------------
# mas5df <- readRDS("./data/mas5df.rds") # dim: 12403 x 7056
# path <- system.file("extdata", "cmap_instances_02.txt", package="signatureSearchData")
# cmap_inst <- read.delim(path, check.names=FALSE)
# cmap_drug_cell_expr <- meanExpr(mas5df, cmap_inst) # dim: 12403 X 3587
# saveRDS(cmap_drug_cell_expr, "./data/cmap_drug_cell_expr.rds")

## ----gen_cmap_expr, eval=FALSE------------------------------------------------
# cmap_drug_cell_expr <- readRDS("./data/cmap_drug_cell_expr.rds")
# ## match colnames to '(drug)__(cell)__(factor)' format
# colnames(cmap_drug_cell_expr) <- gsub("(^.*)_(.*$)", "\\1__\\2__trt_cp",
#                                       colnames(cmap_drug_cell_expr))
# matrix2h5(cmap_drug_cell_expr, "./data/cmap_expr.h5", overwrite=TRUE)
# h5ls("./data/cmap_expr.h5")

## ----cmap, eval=FALSE---------------------------------------------------------
# library(ExperimentHub)
# eh <- ExperimentHub()
# query(eh, c("signatureSearchData", "cmap"))
# cmap_path <- eh[["EH3223"]]

## ----cel_file_list, eval=FALSE------------------------------------------------
# path <- system.file("extdata", "cmap_instances_02.txt", package="signatureSearchData")
# cmap_inst <- read.delim(path, check.names=FALSE)
# comp_list <- sampleList(cmap_inst, myby="CMP_CELL")

## ----deg_limma, eval=FALSE----------------------------------------------------
# mas5df <- readRDS("./data/mas5df.rds")
# degList <- runLimma(df=log2(mas5df), comp_list, fdr=0.10, foldchange=1, verbose=TRUE)
# saveRDS(degList, "./results/degList.rds") # saves entire degList

## ----se, eval=FALSE-----------------------------------------------------------
# degList <- readRDS("./results/degList.rds")
# logMA <- degList$logFC
# ## match colnames of logMA to '(drug)__(cell)__(factor)' format
# colnames(logMA) <- gsub("(^.*)_(.*$)", "\\1__\\2__trt_cp", colnames(logMA))
# fdr <- degList$FDR
# colnames(fdr) <- gsub("(^.*)_(.*$)", "\\1__\\2__trt_cp", colnames(fdr))
# matrix2h5(logMA, "./data/cmap.h5", name="assay", overwrite=TRUE) # 12403 X 3478
# matrix2h5(fdr, "./data/cmap.h5", name="padj", overwrite=FALSE)
# rhdf5::h5ls("./data/cmap.h5")

## ----cmap_degs, eval=FALSE----------------------------------------------------
# library(signatureSearch)
# # Get up and down 150 DEGs
# degs <- getDEGSig(cmp="vorinostat", cell="PC3", refdb="cmap", Nup=150, Ndown=150)
# 
# # Get DEGs by setting cutoffs
# degs2 <- getDEGSig(cmp="vorinostat", cell="PC3", refdb="cmap",
#                  higher=1, lower=-1, padj=0.05)

## ----cmap_degs_db, eval=FALSE-------------------------------------------------
# # gCMAP method
# gep <- getSig("vorinostat", "PC3", refdb="cmap")
# qsig_gcmap <- qSig(query = gep, gess_method = "gCMAP", refdb = "cmap")
# gcmap_res <- gess_gcmap(qsig_gcmap, higher=1, lower=-1, padj=0.05)
# # Fisher method
# qsig_fisher <- qSig(query = degs, gess_method = "Fisher", refdb = "cmap")
# fisher_res <- gess_fisher(qSig=qsig_fisher, higher=1, lower=-1, padj=0.05)

## ----sessionInfo--------------------------------------------------------------
sessionInfo()

