## ----echo=FALSE, results="hide", message=FALSE--------------------------------
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE, tidy = FALSE)
library(BiocStyle)
set.seed(42)

## -----------------------------------------------------------------------------
suppressMessages(library(immApex))
suppressMessages(library(ggplot2))
suppressMessages(library(viridis))
suppressMessages(library(dplyr))
suppressMessages(library(igraph))
suppressMessages(library(tidygraph))
suppressMessages(library(ggraph))

## -----------------------------------------------------------------------------
TRBV_aa <- getIMGT(species = "human",
                   chain = "TRB",
                   region = "v",
                   sequence.type = "aa")
has_IMGT <- !is.null(TRBV_aa)

# Display first sequence as an example
if (has_IMGT) TRBV_aa[[1]][1]

## -----------------------------------------------------------------------------
data("immapex_example.data")
Adaptive_example <- formatGenes(immapex_example.data[["Adaptive"]],
                                region = "v",
                                technology = "Adaptive", 
                                simplify.format = TRUE) 

head(Adaptive_example[,c("aminoAcid","vGeneName", "v_IMGT", "v_IMGT.check")])

## ----eval=has_IMGT------------------------------------------------------------
Adaptive_example <- inferCDR(Adaptive_example,
                             chain = "TRB",
                             reference = TRBV_aa,
                             technology = "Adaptive",
                             sequence.type = "aa",
                             sequences = c("CDR1", "CDR2"))

Adaptive_example[200:210,c("CDR1_IMGT", "CDR2_IMGT")]

## -----------------------------------------------------------------------------
sequences <- generateSequences(prefix.motif = "CAS",
                               suffix.motif = "YF",
                               number.of.sequences = 200,
                               min.length = 8,
                               max.length = 16)
sequences <- unique(sequences)
head(sequences)

## -----------------------------------------------------------------------------
nucleotide.sequences <- generateSequences(number.of.sequences = 200,
                                          min.length = 8,
                                          max.length = 16, 
                                          sequence.dictionary = c("A", "C", "T", "G"))
head(nucleotide.sequences)

## -----------------------------------------------------------------------------
mutated.sequences <- mutateSequences(sequences, 
                                     number.of.sequences = 1,
                                     position.start = 3,                                  
                                     position.end = 8)
head(sequences)
head(mutated.sequences)

## -----------------------------------------------------------------------------
freq.matrix <- calculateFrequency(sequences, 
                                  max.length = 20)
head(freq.matrix[, 1:10])

## -----------------------------------------------------------------------------
shannon.entropy <- calculateEntropy(sequences, 
                                    method = "shannon")
head(shannon.entropy)

## -----------------------------------------------------------------------------
# Calculate the mean of Atchley factors at each position
atchley.profile <- calculateProperty(sequences, 
                                     property.set = "atchleyFactors", 
                                     summary.fun = "mean")
head(atchley.profile[, 1:6])

## -----------------------------------------------------------------------------
# Using the built-in AIRR data
data_airr <- immapex_example.data[["AIRR"]]

# Calculate paired V-J gene usage as percentages
vj_usage <- calculateGeneUsage(data_airr, 
                               loci = c("v_call", "j_call"),
                               summary = "percent")
vj_usage[1:5, 1:5]

## -----------------------------------------------------------------------------
motif.counts <- calculateMotif(sequences, 
                               motif.lengths = 3, 
                               min.depth = 5)
head(motif.counts)

## -----------------------------------------------------------------------------
ppm.matrix <- probabilityMatrix(sequences)
head(ppm.matrix)

## -----------------------------------------------------------------------------
set.seed(42)
# Generate a sample background frequency
back.freq <- sample(1:1000, 20)
names(back.freq) <- amino.acids
back.freq <- back.freq/sum(back.freq)

pwm.matrix <- probabilityMatrix(sequences,
                                max.length = 20,
                                convert.PWM = TRUE,
                                background.frequencies = back.freq)
head(pwm.matrix)

## -----------------------------------------------------------------------------
adj.matrix <- adjacencyMatrix(sequences, 
                              normalize = FALSE)
adj.matrix[1:10, 1:10]

## ----basic-network------------------------------------------------------------
# Building an edge list with default Levenshtein distance
g1 <- buildNetwork(
  data.frame(sequences = sequences),
  seq_col = "sequences",
  threshold = 2  # Maximum 2 edits (insertions, deletions, or substitutions)
)

head(g1)

## ----bcr-network--------------------------------------------------------------
# Load example BCR data (assuming you have this)
# data(example_bcr_data)

# Build network with recommended settings
g_bcr <- buildNetwork(
  data.frame(sequences = sequences),
  seq_col = "sequences",
  threshold = 0.15,             # 15% dissimilarity threshold
  dist_type = "levenshtein",    # Standard edit distance
  normalize = "maxlen",         # Normalize by maximum length
  ids = paste0("seq", 1:length(sequences))
)

# Forming network
graph <- graph_from_edgelist(as.matrix(g_bcr[, 1:2]))
E(graph)$weight <- g_bcr[, 3]

# Remove isolated nodes for clearer visualization
graph <- delete_vertices(graph, which(degree(graph) == 0))

# Convert to tidygraph for use with ggraph
g_tidy <- as_tbl_graph(graph)

# Plot the network
ggraph(g_tidy, layout = "fr") + 
  geom_edge_link(aes(width = 1 - weight), color = "black", alpha = 0.5) + 
  geom_node_point(aes(size = degree(g_tidy, mode = "all")),
                  fill = "steelblue", color = "black", shape = 21) +
  theme_void() + 
  scale_edge_width(range = c(0.1, 0.5)) + 
  labs(
    title = "Sequence Network",
    subtitle = "Levenshtein distance ≤ 15% (normalized by max length)",
    size = "Degree"
  ) +
  theme(legend.position = "right")

## ----vgene-filtering, eval=FALSE----------------------------------------------
# 
# g_clones <- buildNetwork(
#   input.data = immapex_example.data[["AIRR"]],
#   seq_col = "junction_aa",
#   v_col = "v_call",           # Column with V gene assignments
#   threshold = 0.10,           # 10% dissimilarity
#   dist_type = "levenshtein",
#   normalize = "maxlen",
#   filter.v = TRUE,            # Only connect sequences with same V gene
#   filter.j = FALSE            # Optionally also filter by J gene
# )
# 
# # Build graph and find clonotype clusters
# graph <- graph_from_edgelist(as.matrix(g_clones[, 1:2]))
# E(graph)$weight <- g_clones[, 3]
# 
# # Find communities (putative clonotypes)
# communities <- cluster_louvain(graph)
# 
# cat("Number of clonotypes identified:", length(communities), "\n")
# cat("Sizes of top 5 clonotypes:\n")
# print(sort(table(membership(communities)), decreasing = TRUE)[1:5])

## -----------------------------------------------------------------------------
# Generate one-hot encoding using the main function
enc_onehot <- sequenceEncoder(sequences, mode = "onehot", verbose = FALSE)

# You can achieve the same result with the alias
enc_onehot_alias <- onehotEncoder(sequences, verbose = FALSE)

# View the first few columns of the flattened matrix output
head(enc_onehot$flattened[, 1:10])

## -----------------------------------------------------------------------------
# Generate property-based encoding using FASGAI factors
enc_prop <- sequenceEncoder(sequences,
                            mode = "property",
                            property.set = "FASGAI",
                            verbose = FALSE)

# The propertyEncoder() alias is a convenient shortcut
enc_prop_alias <- propertyEncoder(sequences, property.set = "FASGAI", verbose = FALSE)

# View the first few columns of the flattened property matrix
head(enc_prop$flattened[, 1:6])

## -----------------------------------------------------------------------------
# Generate a geometric embedding for each sequence
enc_geo <- sequenceEncoder(sequences,
                           mode = "geometric",
                           method = "BLOSUM62",
                           theta = pi / 3,
                           verbose = FALSE)

# The alias provides a direct path to this functionality
enc_geo_alias <- geometricEncoder(sequences, 
                                  method = "BLOSUM62", 
                                  theta = pi / 3, 
                                  verbose = FALSE)

# The output is a single summary matrix where each row is a sequence
head(enc_geo$summary)

## -----------------------------------------------------------------------------
token.matrix <- tokenizeSequences(input.sequences =  c(sequences, mutated.sequences), 
                                  add.startstop = TRUE,
                                  start.token = "!",
                                  stop.token = "^", 
                                  convert.to.matrix = TRUE)
head(token.matrix[,1:18])

## -----------------------------------------------------------------------------
property.matrix <- propertyEncoder(input.sequences =  c(sequences, mutated.sequences), 
                                   property.set = "FASGAI")

property.sequences <- sequenceDecoder(property.matrix[[1]],
                                      mode = "property",
                                      property.set  = "FASGAI",
                                      call.threshold = 1)
head(sequences)
head(property.sequences)

## ----tidy=FALSE---------------------------------------------------------------
sequence.matrix <- onehotEncoder(input.sequences =  c(sequences, mutated.sequences))

OHE.sequences <- sequenceDecoder(sequence.matrix,
                                 mode= "onehot")

head(OHE.sequences)

## -----------------------------------------------------------------------------
# Step 1a: Generate two distinct classes of sequences
class1.sequences <- generateSequences(prefix.motif = "CAS",
                                      min.length = 3,
                                      number.of.sequences = 250)

class2.sequences <- generateSequences(prefix.motif = "CSG",
                                      min.length = 3,
                                      number.of.sequences = 250)

# Combine sequences and create labels
all.sequences <- c(class1.sequences, class2.sequences)
labels <- as.factor(c(rep("Class1", 250), rep("Class2", 250)))

# Step 1b: Use propertyEncoder to create a feature matrix from Atchley factors
feature.matrix <- propertyEncoder(all.sequences, 
                                  property.set = "atchleyFactors",
                                  verbose = FALSE)

# Combine the flattened feature matrix and labels into the final data frame
training.data <- data.frame(feature.matrix$flattened, labels)

## -----------------------------------------------------------------------------
suppressMessages(library(randomForest))

# Train the Random Forest model
set.seed(42) # for reproducibility
rf.model <- randomForest(labels ~ ., 
                         data = training.data, 
                         ntree = 100, 
                         importance = TRUE) # Set importance=TRUE to calculate scores

# Print the confusion matrix to see model performance
print(rf.model)

## -----------------------------------------------------------------------------
# Extract feature importance data from the model
importance.data <- as.data.frame(importance(rf.model))
importance.data$Feature <- rownames(importance.data)

# Get the top 15 most important features
top_features <- importance.data[order(importance.data$MeanDecreaseGini, decreasing = TRUE), ][1:15,]

# Plot using ggplot2
ggplot(top_features, aes(x = MeanDecreaseGini, y = reorder(Feature, MeanDecreaseGini))) +
  geom_col(aes(fill = MeanDecreaseGini)) +
  scale_fill_viridis_c() +
  labs(title = "Top 15 Most Important Features",
       x = "Mean Decrease in Gini Impurity",
       y = "Feature (Property and Position)") +
  theme_classic() +
  theme(legend.position = "none")

## -----------------------------------------------------------------------------
# Generate three distinct groups of sequences
groupA <- generateSequences(prefix.motif = "CA", 
                            number.of.sequences = 100, 
                            min.length = 2)
groupB <- generateSequences(prefix.motif = "QR", 
                            number.of.sequences = 100, 
                            min.length = 2)
groupC <- generateSequences(prefix.motif = "YY", 
                            number.of.sequences = 100, 
                            min.length = 2)

# Combine them and create a vector of original group IDs for later visualization
mixed.sequences <- c(groupA, groupB, groupC)
original.groups <- as.factor(rep(c("Group A", "Group B", "Group C"), each = 100))

# Use geometricEncoder to create embeddings
geometric.embeddings <- geometricEncoder(mixed.sequences, 
                                         method = "BLOSUM62", 
                                         verbose = FALSE)

## -----------------------------------------------------------------------------
# Perform PCA on the embedding matrix
pca.result <- prcomp(geometric.embeddings$summary, center = TRUE, scale. = TRUE)

# Prepare data for plotting with ggplot2
pca.data <- data.frame(PC1 = pca.result$x[,1],
                       PC2 = pca.result$x[,2],
                       Group = original.groups)

# Plot PC1 vs PC2 using ggplot2 and viridis
ggplot(pca.data, aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 3, alpha = 0.8) +
  scale_color_viridis(discrete = TRUE) +
  labs(title = "PCA of Geometric Sequence Embeddings",
       x = "Principal Component 1",
       y = "Principal Component 2") +
  theme_classic()

## -----------------------------------------------------------------------------
set.seed(42)
# Generate three distinct groups of sequences to simulate clonal families
family1 <- unique(generateSequences(prefix.motif = "CASS", 
                             suffix.motif = "YF", 
                             min.length = 6,
                             number.of.sequences = 60))
family2 <- unique(generateSequences(prefix.motif = "CARS", 
                             suffix.motif = "GF",
                             min.length = 6,
                             number.of.sequences = 60))
family3 <- unique(generateSequences(prefix.motif = "CSVA", 
                             suffix.motif = "HF", 
                             min.length = 6,
                             number.of.sequences = 60))

# Combine into a single data frame
all_sequences_df <- data.frame(
  sequence = c(family1, family2, family3),
  original_family = c(rep("Family 1", 42), rep("Family 2", 40), rep("Family 3", 46))
)

## -----------------------------------------------------------------------------
# Build the edge list: connect sequences with an edit distance of 3 or less
edge_list <- buildNetwork(all_sequences_df,
                          seq_col = "sequence",
                          threshold = 3)

# Replace numerical edge list with the sequences
edge_list_sequences <- as.matrix(data.frame(
  from = all_sequences_df$sequence[as.numeric(edge_list$from)],
  to = all_sequences_df$sequence[as.numeric(edge_list$to)],
  dist = edge_list$dist
))

# Create a graph object from the edge list and node data
# This graph now contains all our sequences as nodes
sequence_graph <- graph_from_data_frame(d = edge_list_sequences, 
                                        vertices = all_sequences_df, 
                                        directed = FALSE)
E(sequence_graph)$weight <- edge_list_sequences[,3]

## -----------------------------------------------------------------------------
# Perform community detection using the Walktrap algorithm
communities <- cluster_walktrap(sequence_graph)

# Add the community membership as an attribute to the graph nodes
V(sequence_graph)$community <- communities$membership

## -----------------------------------------------------------------------------
# Convert to tidygraph for use with ggraph
g_tidy <- as_tbl_graph(sequence_graph)

# Plot the network, coloring nodes by their detected community
ggraph(g_tidy, layout = "fr") + 
  geom_edge_link(aes(alpha = weight), show.legend = FALSE) +
  geom_node_point(aes(color = as.factor(community)), size = 4) +
  scale_color_viridis(discrete = TRUE, option = "D") +
  labs(title = "Sequence Network with Detected Communities",
       color = "Detected\nCommunity") +
  theme_void()

## -----------------------------------------------------------------------------
sessionInfo()

