## ----chunk_settings, eval = TRUE, echo = FALSE, warning = FALSE, message = FALSE----
libv <- c("recountmethylation", "basilisk", "reticulate", "HDF5Array", "ggplot2")
sapply(libv, library, character.only = TRUE)
knitr::opts_chunk$set(eval = FALSE, echo = TRUE, warning = FALSE, message = FALSE)

## ----setup, echo = F----------------------------------------------------------
# si.dpath <- system.file("extdata", "sitest", package = "recountmethylation")
# # get object paths
# bval.fpath <- file.path(si.dpath, "bval_100.csv")
# fhtable.fpath <- file.path(si.dpath, "bval_100_fh10.csv")
# md.fpath <- file.path(si.dpath, "mdf_sitest.rda")
# # load objects
# bval <- read.csv(bval.fpath)
# fhtable <- read.csv(fhtable.fpath)
# md <- get(load(md.fpath))

## ----eval = FALSE-------------------------------------------------------------
# recountmethylation:::setup_sienv(env.name = "dnam_si_vignette")

## ----eval = FALSE-------------------------------------------------------------
# gr.fname <- "gr-noob_h5se_hm450k-epic-merge_0-0-3"
# gr <- HDF5Array::loadHDF5SummarizedExperiment(gr.fname)
# md <- as.data.frame(colData(gr)) # get sample metadata

## -----------------------------------------------------------------------------
# # identify samples from metadata
# gseid <- "GSE67393"
# gsmv <- md[md$gse == gseid,]$gsm # get study samples
# gsmv <- gsmv[sample(length(gsmv), 10)]

## -----------------------------------------------------------------------------
# # get random samples by group label
# set.seed(1)
# mdf <- md[md$blood.subgroup=="PBMC",]
# gsmv <- c(gsmv, mdf[sample(nrow(mdf), 20),]$gsm)
# mdf <- md[md$blood.subgroup=="whole_blood",]
# gsmv <- c(gsmv, mdf[sample(nrow(mdf), 20),]$gsm)

## ----eval = FALSE-------------------------------------------------------------
# # norm bvals for probe subset
# num.cg <- 100
# grf <- gr[,gr$gsm %in% gsmv]; dim(grf)
# bval <- getBeta(grf[sample(nrow(grf), num.cg),])
# bval <- t(bval) # get transpose
# rownames(bval) <- gsub("\\..*", "", rownames(bval)) # format rownames

## -----------------------------------------------------------------------------
# # save bval table
# bval.fpath <- paste0("bval_",num.cg,".csv")
# write.csv(bval, file = bval.fpath)

## ----eval = FALSE-------------------------------------------------------------
# # get example table and labels
# num.dim <- 10 # target reduced dimensions
# fhtable.fpath <- "bval_100_fh10.csv"
# recountmethylation:::get_fh(csv_savepath = fhtable.fpath,
#                             csv_openpath = bval.fpath,
#                             ndim = num.dim)

## -----------------------------------------------------------------------------
# # set file paths
# si.fname.str <- "new_search_index"
# si.fpath <- file.path(si.dpath, paste0(si.fname.str, ".pickle"))
# sidict.fpath <- file.path(si.dpath, paste0(si.fname.str, "_dict.pickle"))
# # make the new search index
# recountmethylation:::make_si(fh_csv_fpath = fhtable.fpath,
#                              si_fname = si.fpath,
#                              si_dict_fname = sidict.fpath)

## -----------------------------------------------------------------------------
# query.gsmv <- c("GSM1506297", "GSM1646161", "GSM1646175", "GSM1649753", "GSM1649758", "GSM1649762")

## -----------------------------------------------------------------------------
# lkval <- c(1, 5, 10, 20) # vector of query k values

## -----------------------------------------------------------------------------
# si_fpath <- system.file("extdata", "sitest", package = "recountmethylation")
# dfnn <- recountmethylation:::query_si(sample_idv = query.gsmv,
#                                       fh_csv_fpath = fhtable.fpath,
#                                       si_fname = "new_search_index",
#                                       si_fpath = si_fpath,
#                                       lkval = lkval)

## -----------------------------------------------------------------------------
# dim(dfnn)

## -----------------------------------------------------------------------------
# colnames(dfnn)

## -----------------------------------------------------------------------------
# unlist(strsplit(dfnn[1,"k=1"], ";"))

## -----------------------------------------------------------------------------
# unlist(strsplit(dfnn[1,"k=5"], ";"))

## -----------------------------------------------------------------------------
# unlist(strsplit(dfnn[1,"k=10"], ";"))

## -----------------------------------------------------------------------------
# gsmvi <- unlist(strsplit(dfnn[1, "k=5"], ";"))
# blood.typev <- md[gsmvi,]$blood.subgroup

## -----------------------------------------------------------------------------
# dist.wb <- unlist(lapply(seq(nrow(dfnn)), function(ii){
#   gsmvi <- unlist(strsplit(dfnn[ii,"k=20"], ";"))
#   length(which(md[gsmvi,]$blood.subgroup=="whole_blood"))
# }))

## -----------------------------------------------------------------------------
# dfp <- data.frame(num.samples = dist.wb)
# dfp$perc.samples <- 100*dfp$num.samples/20
# dfp$type <- "whole_blood"
# ggplot(dfp, aes(x = type, y = perc.samples)) +
#   geom_violin() + geom_boxplot(width = 0.2) + theme_bw() +
#   ylab("Percent of neighbors") + theme(axis.title.x = element_blank()) +
#   scale_y_continuous(labels = scales::percent_format(scale = 1))

## -----------------------------------------------------------------------------
# # function to get samples by label
# get_dfgrp <- function(md, ugroupv = c("whole_blood", "PBMC", "other/NOS")){
#   do.call(rbind, lapply(c("whole_blood", "PBMC", "other/NOS"), function(ugroupi){
#     num.grp <- length(which(md[ugroupv,]$blood.subgroup==ugroupi))
#     data.frame(num.samples = num.grp, type = ugroupi)
#   }))
# }

## -----------------------------------------------------------------------------
# # get samples by label across queries
# dfp <- do.call(rbind, lapply(seq(nrow(dfnn)), function(ii){
#   get_dfgrp(md = md, unlist(strsplit(dfnn[ii,"k=20"], ";")))
# }))

## -----------------------------------------------------------------------------
# # format dfp variables for plots
# dfp$perc.samples <- 100*dfp$num.samples/20
# # reorder on medians
# medianv <- unlist(lapply(c("whole_blood", "PBMC", "other/NOS"), function(groupi){
#   median(dfp[dfp$type==groupi,]$perc.samples)}))
# # define legend groups
# dfp$`Sample\ntype` <- factor(dfp$type, levels = c("whole_blood", "PBMC", "other/NOS")[order(medianv)])

## -----------------------------------------------------------------------------
# # make new composite plot
# ggplot(dfp, aes(x = `Sample\ntype`, y = perc.samples, fill = `Sample\ntype`)) +
#   geom_violin() + geom_boxplot(width = 0.2) + theme_bw() +
#   ylab("Percent of neighbors") + theme(axis.title.x = element_blank()) +
#   scale_y_continuous(labels = scales::percent_format(scale = 1))

## ----eval = TRUE--------------------------------------------------------------
utils::sessionInfo()

