## ----setup, include=FALSE-----------------------------------------------------
library(BiocStyle)

## ----eval=FALSE---------------------------------------------------------------
# if (!require("BiocManager", quietly=TRUE))
#     install.packages("BiocManager")
# BiocManager::install("igblastr")

## ----message=FALSE------------------------------------------------------------
library(igblastr)

## -----------------------------------------------------------------------------
if (!has_igblast())
    install_igblast()

## -----------------------------------------------------------------------------
igblast_info()

## -----------------------------------------------------------------------------
list_germline_dbs(builtin.only=TRUE)

## -----------------------------------------------------------------------------
head(list_IMGT_releases())

## -----------------------------------------------------------------------------
list_IMGT_organisms("202614-2")

## ----message=FALSE------------------------------------------------------------
install_IMGT_germline_db("202614-2", organism="Homo sapiens", overwrite=TRUE)

## -----------------------------------------------------------------------------
list_germline_dbs()

## -----------------------------------------------------------------------------
use_germline_db("IMGT-202614-2.Homo_sapiens.IGH+IGK+IGL")

## -----------------------------------------------------------------------------
list_germline_dbs()

## -----------------------------------------------------------------------------
list_c_region_dbs()

## -----------------------------------------------------------------------------
use_c_region_db("_IMGT.human.IGH+IGK+IGL.202412")

## -----------------------------------------------------------------------------
query <- system.file(package="igblastr", "extdata",
                     "BCR", "1279067_1_Paired_sequences.fasta.gz")

## -----------------------------------------------------------------------------
json <- system.file(package="igblastr", "extdata",
                    "BCR", "1279067_1_Paired_All.json")
query_metadata <- jsonlite::fromJSON(json)
query_metadata

## -----------------------------------------------------------------------------
use_germline_db()
use_c_region_db()

## -----------------------------------------------------------------------------
AIRR_df <- igblastn(query, num_alignments_V=1)

## -----------------------------------------------------------------------------
AIRR_df

## ----message=FALSE------------------------------------------------------------
library(ggplot2)
library(dplyr)
library(scales)
library(ggseqlogo)

## -----------------------------------------------------------------------------
AIRR_df |>
    ggplot(aes(locus, 100 - v_identity)) +
    theme_bw(base_size=14) +
    geom_point(position = position_jitter(width = 0.3), alpha = 0.1) +
    geom_boxplot(color = "blue", fill = NA, outliers = FALSE, alpha = 0.3) +
    ggtitle("Distribution of V percent mutation by locus") +
    xlab(NULL)

## -----------------------------------------------------------------------------
plot_gene_dist <- function(AIRR_df, loc) {
    df_v_gene <- AIRR_df |>
        filter(locus == loc) |>
        mutate(v_gene = allele2gene(v_call)) |> # drop allele info
        group_by(v_gene) |>
        summarize(n = n(), .groups = "drop") |>
        mutate(frac = n / sum(n))
    df_v_gene |>
        ggplot(aes(frac, v_gene)) +
        theme_bw(base_size=13) +
        geom_col() +
        scale_x_continuous('Percent of sequences', labels = scales::percent) +
        ylab("Germline gene") +
        ggtitle(paste0(loc, "V gene prevalence"))
}

## ----fig.height=8-------------------------------------------------------------
plot_gene_dist(AIRR_df, "IGH")

## ----fig.height=5.9-----------------------------------------------------------
plot_gene_dist(AIRR_df, "IGK")

## ----fig.height=5.3-----------------------------------------------------------
plot_gene_dist(AIRR_df, "IGL")

## ----fig.height=4.5-----------------------------------------------------------
AIRR_df$cdr3_aa_length <- nchar(AIRR_df$cdr3_aa)

AIRR_df |>
    group_by(locus, cdr3_aa_length) |>
    summarize(n = n(), .groups = "drop") |>
    ggplot(aes(cdr3_aa_length, n)) +
    theme_bw(base_size=14) +
    facet_wrap(~locus) +
    geom_col() +
    ggtitle("Histograms of CDR3 length by locus")

## -----------------------------------------------------------------------------
AIRR_df |>
    filter(locus == "IGK", cdr3_aa_length == 9) |>
    pull(cdr3_aa) |>
    ggseqlogo(method = "probability") +
    theme_bw(base_size=14) +
    ggtitle("Logo plot of kappa chain CDR3 sequences that are 9 AA long")

## -----------------------------------------------------------------------------
igblastn_help()

## ----eval=FALSE---------------------------------------------------------------
# V_alleles <- c("IGHV3-23*01", "IGHV3-23*04")
# igblastn(query, germline_db_V_seqidlist=V_alleles)

## ----eval=FALSE---------------------------------------------------------------
# igblastn(query, germline_db_V_seqidlist=file("path/to/my_V_gene_alleles.txt"))

## -----------------------------------------------------------------------------
filename <- "SRR11341217.fasta.gz"
query <- system.file(package="igblastr", "extdata", "TCR", filename)

## ----message=FALSE------------------------------------------------------------
db_name <- install_IMGT_germline_db("202614-2", organism="Homo sapiens",
                                    tcr.db=TRUE, overwrite=TRUE)
use_germline_db(db_name)
list_germline_dbs()

## -----------------------------------------------------------------------------
AIRR_df <- igblastn(query)
AIRR_df

## -----------------------------------------------------------------------------
sessionInfo()

