## ----knitr_setup, include = FALSE---------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    fig.height = 4,
    fig.width = 4
)
options(rmarkdown.html_vignette.check_title = FALSE)

## ----load_libraries, message=FALSE--------------------------------------------
library(VariantAnnotation)
library(GenomicRanges)
library(BSgenome.Hsapiens.UCSC.hg19)
library(SEMPLR)

## ----load_data----------------------------------------------------------------
SEMC

## ----display_data-------------------------------------------------------------
SEMC

## ----view_sem_metadata--------------------------------------------------------
semData(SEMC)

## ----subset_sem_collection----------------------------------------------------
getSEMs(SEMC, semId = "JUN")

## ----inputs_scoreBinding------------------------------------------------------
# create a GRanges object
gr <- GenomicRanges::GRanges(
    seqnames = "chr12",
    ranges = 94136009
)

## ----scoreBinding-------------------------------------------------------------
# calculate binding propensity
sb <- scoreBinding(x = gr, sem = SEMC, genome = Hsapiens)
sb

## ----scores_accessor----------------------------------------------------------
scores(sb)

## ----plot_sem-----------------------------------------------------------------
# subset JUN score
jun_score <- scores(sb)[SEM == "JUN" & rc == "fwd"]
jun_score

# plot the JUN motif with the scored sequence
plotSEM(SEMC,
    motif = "JUN",
    motifSeq = jun_score$seq
)

## ----simulation_functions-----------------------------------------------------
# create random sequences weighted by ppm probabilities
simulatePPMSeqs <- function(ppm, nSeqs) {
    ppm_t <- t(ppm)
    position_samples <- lapply(
        seq_len(nrow(ppm_t)),
        function(i) {
            sample(colnames(ppm_t),
                size = nSeqs,
                replace = TRUE,
                prob = ppm_t[i, ]
            )
        }
    )

    rand_pwm_seq_mtx <- position_samples |>
        unlist() |>
        matrix(ncol = nrow(ppm_t), nrow = nSeqs)
    rand_pwm_seqs <- apply(
        rand_pwm_seq_mtx, 1,
        function(x) paste0(x, collapse = "")
    )

    return(rand_pwm_seqs)
}


# generate DNA sequences
simulateRandSeqs <- function(seqLength, nSeqs = 1) {
    bps <- c("A", "C", "G", "T")
    seqs <- sample(bps,
        replace = TRUE,
        size = seqLength
    ) |>
        stringi::stri_c(collapse = "") |>
        replicate(n = nSeqs)
    return(seqs)
}

## ----build_enrich_scenario----------------------------------------------------
ppm <- convertSEMsToPPMs(getSEMs(SEMC, "JUN"))[[1]]

# simulate sequences from the JUN PPM
sim_seqs <- simulatePPMSeqs(ppm = ppm, nSeqs = 200)
# add flanks to the simulated sequences to make them 3x the length of the motif
sim_seqs <- lapply(
    sim_seqs,
    function(x) {
        paste0(
            simulateRandSeqs(seqLength = ncol(ppm)), x,
            simulateRandSeqs(seqLength = ncol(ppm))
        )
    }
) |>
    unlist()

# generate random DNA sequences
rand_seqs <- simulateRandSeqs(seqLength = ncol(ppm) * 3, nSeqs = 800)

## ----scoring_for_enrich_test--------------------------------------------------
# combine all sequences into a single vector
all_seqs <- c(sim_seqs, rand_seqs)

sb <- scoreBinding(
    x = all_seqs,
    sem = SEMC, genome = BSgenome.Hsapiens.UCSC.hg19::Hsapiens
)
sb

## ----enrichSEMs---------------------------------------------------------------
e <- enrichSEMs(sb, sem = SEMC, seqs = all_seqs)

# order the results by adjusted pvalue
head(e[order(padj, decreasing = FALSE)])

## ----plotEnrich---------------------------------------------------------------
plotEnrich(e,
    sem = SEMC,
    threshold = 0.05, method = "WPCC",
    pvalRange = c(0, 50),
    heatmapCols = c("lightgrey", "dodgerblue2")
)

## ----promoter_enrichment------------------------------------------------------
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(BSgenome.Hsapiens.UCSC.hg38)
library(org.Hs.eg.db)

txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
orgdb <- org.Hs.eg.db

genes_oi <- c("ENSG00000139618", "ENSG00000157764")
background_genes <- c("ENSG00000111640", "ENSG00000075624", "ENSG00000165704")

result <- enrichmentSets(
    foreground_ids = genes_oi,
    background_ids = background_genes,
    txdb = txdb,
    orgdb = orgdb,
    id_type = "ENSEMBL"
)
result

## ----make_vranges-------------------------------------------------------------
vr <- VRanges(
    seqnames = c("chr12", "chr19"),
    ranges = c(94136009, 10640062),
    ref = c("G", "T"), alt = c("C", "A"),
    id = c("variant1", "variant2")
)

## ----score_variants-----------------------------------------------------------
sempl_results <- scoreVariants(
    x = vr,
    sem = SEMC,
    genome = BSgenome.Hsapiens.UCSC.hg38::Hsapiens,
    varId = "id"
)

sempl_results

## ----SEMScores_accessors, eval = FALSE----------------------------------------
# # access the variants slot
# getRanges(sempl_results)
# 
# # access the semData slot
# semData(sempl_results)
# 
# # access the scores slot
# scores(sempl_results)

## ----plotSemMotifs, fig.height=4, fig.width=4---------------------------------
plotSEMMotifs(
    s = sempl_results,
    variant = "variant2",
    label = "transcription_factor"
)

## ----plotSEMVariants----------------------------------------------------------
plotSEMVariants(sempl_results, sem = "HLF")

## ----load_custom_sems---------------------------------------------------------
# find .sem files
sem_folder <- system.file("extdata", "SEMs", package = "SEMPLR")
sem_files <- list.files(sem_folder, full.names = TRUE)

# load metadata
sempl_metadata_file <- system.file("extdata", "sempl_metadata.csv",
    package = "SEMPLR"
)
sempl_metadata <- read.csv(sempl_metadata_file)

## ----format_meta_data---------------------------------------------------------
ix <- lapply(
    sem_files,
    function(x) which(sempl_metadata$SEM == basename(x))
) |>
    unlist()
sem_ids <- sempl_metadata$transcription_factor[ix]

## ----loadSEMCollection--------------------------------------------------------
sc <- loadSEMCollection(
    semFiles = sem_files,
    semMetaData = sempl_metadata,
    semMetaKey = "transcription_factor",
    semIds = sem_ids
)
sc

## ----session_info-------------------------------------------------------------
devtools::session_info()

