## Code to implement a simple RNN and an LSTM-RNN

In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import random

# Data synthesis and encoding
#
# The four nucleotide symbols that synthetic sequences are drawn from.
DNA_ALPHABET = list("ACGT")

def random_dna_sequence(length):
    """Return a random DNA string made of ``length`` symbols from DNA_ALPHABET.

    Symbols are sampled independently with replacement using the global
    ``random`` module state.
    """
    bases = random.choices(DNA_ALPHABET, k=length)
    return ''.join(bases)

def inject_motif(seq, motif):
    """Overwrite a randomly placed window of ``seq`` with ``motif``.

    The motif replaces ``len(motif)`` consecutive characters starting at a
    position drawn uniformly from all positions where it fits entirely, so
    the returned string has the same length as ``seq``.

    Args:
        seq: The sequence string to modify.
        motif: The substring to write into ``seq``.

    Returns:
        A new string equal to ``seq`` except for the window holding ``motif``.

    Raises:
        ValueError: If ``motif`` is longer than ``seq``.
    """
    max_start = len(seq) - len(motif)
    if max_start < 0:
        # Fail with an explicit message instead of randint's "empty range" error.
        raise ValueError(
            f"motif length ({len(motif)}) exceeds sequence length ({len(seq)})"
        )
    pos = random.randint(0, max_start)  # randint is inclusive on both ends
    return seq[:pos] + motif + seq[pos + len(motif):]

# One sequence encodes to [seq_len, 4]; a batch of N sequences stacks to [N, seq_len, 4]
def one_hot_encode(seq):
    """One-hot encode a DNA string as a float32 array of shape (len(seq), 4).

    Columns follow the order A, C, G, T. Any other character raises KeyError.
    """
    base_to_column = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    n_bases = len(seq)
    # Explicit integer dtype keeps the index array valid for an empty sequence.
    columns = np.array([base_to_column[base] for base in seq], dtype=np.intp)
    encoded = np.zeros((n_bases, 4), dtype=np.float32)
    encoded[np.arange(n_bases), columns] = 1.0
    return encoded

class SyntheticDNADataset(Dataset):
    """Synthetic binary-classification dataset of random DNA sequences.

    ``n_samples // 2`` sequences have ``motif`` written at a random position
    (label 1); the remaining sequences are left fully random (label 0). The
    examples are shuffled once at construction time.

    Note: label-0 sequences are not checked for the motif, so one may contain
    it by chance.

    Args:
        n_samples: Total number of sequences to generate.
        seq_length: Length of every sequence.
        motif: Substring written into the label-1 sequences.
    """

    def __init__(self, n_samples=1000, seq_length=101, motif="ACGTACGT"):
        n_pos = n_samples // 2     # positive samples with motif
        n_neg = n_samples - n_pos  # negatives

        examples = [
            (inject_motif(random_dna_sequence(seq_length), motif), 1)
            for _ in range(n_pos)
        ]
        examples.extend(
            (random_dna_sequence(seq_length), 0) for _ in range(n_neg)
        )
        # Shuffle so positives and negatives are interleaved.
        random.shuffle(examples)

        if examples:
            self.sequences, self.labels = zip(*examples)
        else:
            # zip(*[]) yields nothing to unpack; keep an empty dataset usable.
            self.sequences, self.labels = (), ()

    def __len__(self):
        """Return the number of sequences in the dataset."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(one_hot, label)`` for the example at ``idx``.

        ``one_hot`` is a float32 tensor of shape (seq_length, 4) and ``label``
        is a scalar integer tensor (0 or 1).
        """
        arr = one_hot_encode(self.sequences[idx])  # shape (seq_length, 4)
        return torch.from_numpy(arr), torch.tensor(self.labels[idx])

# Toy tensors for checking shapes. Input layout: (batch, seq_len, input_size)
batch_size, seq_length = 2, 15
input_size, num_classes = 4, 2
x = torch.randn(batch_size, seq_length, input_size)

# One random class index per batch element: (batch_size,)
labels = torch.randint(low=0, high=num_classes, size=(batch_size,))
print("dummyx", x.shape, x)
print("dummylabels", labels.shape, labels)

# Usage: build a dataset, wrap it in a loader, and inspect one batch
if __name__ == "__main__":
    # NOTE: this runs at module level, so it rebinds the global seq_length to 101.
    seq_length = 101
    dataset = SyntheticDNADataset(n_samples=10000, seq_length=seq_length, motif="ACGTACGT")
    dataloader = DataLoader(dataset, batch_size=200, shuffle=True)
    # Fetch only the first batch as an example.
    batch_x, batch_y = next(iter(dataloader))
    print("x", batch_x.shape)       # [batch, seq_length, 4]
    print("labels", batch_y.shape)  # [batch]
    print(batch_y)                  # labels (0 or 1)

dummyx torch.Size([2, 15, 4]) tensor([[[ 1.3746,  0.5188,  1.8659,  0.2995],
         [-0.1038, -0.6746, -0.1201,  0.2251],
         [ 0.9828, -0.4180, -1.1069, -0.5472],
         [ 1.0965,  0.3547, -0.7665, -0.4049],
         [ 0.3913, -0.3107, -0.7166, -0.6710],
         [ 0.6349, -0.5899,  0.9158,  2.0765],
         [-0.2090, -2.3032, -0.0098, -0.0551],
         [-1.4048, -0.5299,  0.1647,  0.5387],
         [ 1.2034, -0.0573,  0.5451, -1.4991],
         [-1.2191,  0.0976,  0.4868,  2.5067],
         [ 0.4775, -0.6181,  0.2944, -0.7072],
         [-0.2616, -0.3636, -0.9343,  1.3962],
         [-1.5264, -0.3483,  0.5365,  1.3829],
         [-0.3080, -0.3532,  0.9558,  0.6312],
         [-0.5590,  2.0711, -0.7533, -0.9984]],

        [[-1.1099,  0.1900, -1.2948, -1.5999],
         [ 1.7439, -0.1186, -0.9803, -1.4479],
         [ 0.4723, -1.0497,  1.1786,  0.1597],
         [-0.1113,  1.9036,  1.2823,  0.2452],
         [-1.3914,  0.1212,  0.1325,  1.1390],
         [ 1.3410, -0.2842, 

In [None]:
# Calculate accuracy
def accuracy(model, dataloader, device='cpu', threshold=0.5):
    """Return the fraction of examples in ``dataloader`` classified correctly.

    The predicted class is the argmax of the model's logits along dim 1.
    The model is switched to eval mode and is left in eval mode on return,
    so callers that continue training must call ``model.train()`` again.

    Args:
        model: Callable module mapping a batch of inputs to per-class logits.
        dataloader: Iterable yielding ``(batch_x, batch_y)`` tensor pairs.
        device: Device the batches are moved to; the model itself is not moved.
        threshold: Accepted for backward compatibility; not used, because
            predictions come from argmax rather than a probability cut-off.

    Returns:
        Accuracy in [0, 1] as a float, or 0.0 if ``dataloader`` yields no
        examples.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_x, batch_y in dataloader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            logits = model(batch_x)
            preds = torch.argmax(logits, dim=1)
            correct += (preds == batch_y).sum().item()
            total += batch_y.size(0)
    if total == 0:
        # Avoid ZeroDivisionError on an empty loader.
        return 0.0
    return correct / total


In [None]:
import torch
import torch.nn as nn

# Experiment configuration
n_samples  = 5000   # number of sequences
seq_length = 30     # sequence length

motif      = "ACGCGGT"
batch_size = 40
epochs     = 100
lr         = 0.01

# Create the dataset: half of the sequences carry the motif, half do not
dataset = SyntheticDNADataset(n_samples=n_samples, seq_length=seq_length, motif=motif)

# Assign 80% to training / 20% to testing
train_size = int(0.8 * n_samples)
test_size  = n_samples - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
# Create two dataloaders, one each for the train and test sets
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=batch_size)

# RNN parameters
input_size  = 4   # 1-hot DNA/RNA sequence
hidden_size = 18  # RNN hidden units
num_classes = 2   # binary classification

# data
#    x     [batch, seq_length, input_size]
#    labels[batch]
#
# outputs
#    rnn_out[batch, seq_length, hidden_size]
#    last_out[batch, hidden_size]
#    logits[batch,num_classes] = fc(last_out)
#
# then compare
#     labels[batch]
#     preds[batch] = torch.argmax(logits, dim=1)
#
# RNN Model
class SimpleRNNClassifier(nn.Module):
    """Sequence classifier: an ``nn.RNN`` followed by a linear layer.

    Only the RNN output at the final timestep is passed to the classifier.

    Args:
        input_size: Number of features per timestep.
        hidden_size: Number of RNN hidden units.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, input_size) to (batch, num_classes) logits."""
        # per_step: (batch, seq_len, hidden_size); the final hidden state is not needed
        per_step, _ = self.rnn(x)
        final_step = per_step[:, -1, :]  # (batch, hidden_size)
        logits = self.fc(final_step)
        return logits


# Build the RNN classifier, its loss, and its optimizer
model = SimpleRNNClassifier(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)  # use the configured learning rate

# Training loop
for epoch in range(epochs):
    # accuracy() switches the model to eval mode, so re-enable training each epoch.
    model.train()
    cumm_loss = 0.0
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        logits = model(batch_x)
        loss = criterion(logits, batch_y)
        loss.backward()
        optimizer.step()

        # The loss is a batch mean; weight it by batch size so the epoch
        # average below stays correct when the last batch is smaller.
        cumm_loss += loss.item() * batch_x.size(0)
    train_acc = accuracy(model, train_loader)
    test_acc = accuracy(model, test_loader)
    print(f"Epoch {epoch+1}/{epochs} | Loss: {cumm_loss / train_size:.4f} | Train Acc: {train_acc:.3f} | Test Acc: {test_acc:.3f}")

print("Training complete.")



Epoch 1/100 | Loss: 0.6969 | Train Acc: 0.514 | Test Acc: 0.494
Epoch 2/100 | Loss: 0.6939 | Train Acc: 0.522 | Test Acc: 0.494
Epoch 3/100 | Loss: 0.6940 | Train Acc: 0.509 | Test Acc: 0.491
Epoch 4/100 | Loss: 0.6940 | Train Acc: 0.506 | Test Acc: 0.506
Epoch 5/100 | Loss: 0.6939 | Train Acc: 0.531 | Test Acc: 0.501
Epoch 6/100 | Loss: 0.6935 | Train Acc: 0.513 | Test Acc: 0.502
Epoch 7/100 | Loss: 0.6934 | Train Acc: 0.523 | Test Acc: 0.489
Epoch 8/100 | Loss: 0.6929 | Train Acc: 0.532 | Test Acc: 0.501
Epoch 9/100 | Loss: 0.6927 | Train Acc: 0.529 | Test Acc: 0.497
Epoch 10/100 | Loss: 0.6932 | Train Acc: 0.528 | Test Acc: 0.490
Epoch 11/100 | Loss: 0.6924 | Train Acc: 0.522 | Test Acc: 0.499
Epoch 12/100 | Loss: 0.6934 | Train Acc: 0.524 | Test Acc: 0.504
Epoch 13/100 | Loss: 0.6932 | Train Acc: 0.503 | Test Acc: 0.496
Epoch 14/100 | Loss: 0.6936 | Train Acc: 0.506 | Test Acc: 0.519
Epoch 15/100 | Loss: 0.6933 | Train Acc: 0.517 | Test Acc: 0.510
Epoch 16/100 | Loss: 0.6935 | Trai

In [None]:

# LSTM Model
class SimpleLSTMClassifier(nn.Module):
    """Sequence classifier: an ``nn.LSTM`` followed by a linear layer.

    Only the LSTM output at the final timestep is passed to the classifier.

    Args:
        input_size: Number of features per timestep.
        hidden_size: Number of LSTM hidden units.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, input_size) to (batch, num_classes) logits."""
        # per_step: (batch, seq_len, hidden_size); the (h_n, c_n) state pair is not needed
        per_step, _ = self.lstm(x)
        final_step = per_step[:, -1, :]  # output at the last timestep
        return self.fc(final_step)

# Build the LSTM classifier, its loss, and its optimizer
model = SimpleLSTMClassifier(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)  # use the configured learning rate

# Training loop
for epoch in range(epochs):
    # accuracy() switches the model to eval mode, so re-enable training each epoch.
    model.train()
    cumm_loss = 0.0
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        logits = model(batch_x)
        loss = criterion(logits, batch_y)
        loss.backward()
        optimizer.step()

        # The loss is a batch mean; weight it by batch size so the epoch
        # average below stays correct when the last batch is smaller.
        cumm_loss += loss.item() * batch_x.size(0)
    train_acc = accuracy(model, train_loader)
    test_acc = accuracy(model, test_loader)
    print(f"Epoch {epoch+1}/{epochs} | Loss: {cumm_loss / train_size:.4f} | Train Acc: {train_acc:.3f} | Test Acc: {test_acc:.3f}")

print("Training complete.")


Epoch 1/100 | Loss: 0.6949 | Train Acc: 0.517 | Test Acc: 0.512
Epoch 2/100 | Loss: 0.6932 | Train Acc: 0.524 | Test Acc: 0.492
Epoch 3/100 | Loss: 0.6933 | Train Acc: 0.526 | Test Acc: 0.515
Epoch 4/100 | Loss: 0.6929 | Train Acc: 0.524 | Test Acc: 0.499
Epoch 5/100 | Loss: 0.6925 | Train Acc: 0.502 | Test Acc: 0.492
Epoch 6/100 | Loss: 0.6923 | Train Acc: 0.526 | Test Acc: 0.491
Epoch 7/100 | Loss: 0.6918 | Train Acc: 0.516 | Test Acc: 0.518
Epoch 8/100 | Loss: 0.6935 | Train Acc: 0.516 | Test Acc: 0.501
Epoch 9/100 | Loss: 0.6917 | Train Acc: 0.513 | Test Acc: 0.510
Epoch 10/100 | Loss: 0.6927 | Train Acc: 0.528 | Test Acc: 0.516
Epoch 11/100 | Loss: 0.6922 | Train Acc: 0.535 | Test Acc: 0.498
Epoch 12/100 | Loss: 0.6921 | Train Acc: 0.534 | Test Acc: 0.504
Epoch 13/100 | Loss: 0.6916 | Train Acc: 0.540 | Test Acc: 0.507
Epoch 14/100 | Loss: 0.6912 | Train Acc: 0.534 | Test Acc: 0.508
Epoch 15/100 | Loss: 0.6911 | Train Acc: 0.537 | Test Acc: 0.519
Epoch 16/100 | Loss: 0.6905 | Trai