Alphabet generator

Implementing an alphabet generator using an RNN. We will first build the model using PyTorch, then reimplement it using NumPy.

1. Data preparation

1.1 Import libraries

Show Code
import torch
import torch.nn as nn

import numpy as np
import random
import matplotlib.pyplot as plt


# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
Using device: cpu

1.2 Define data

Show Code
inputs = np.array([
      ["A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z"],
      ["Z","Y","X","W","V","U","T","S","R","Q","P","O","N","M","L","K","J","I","H","G","F","E","D","C","B","A"],
      ["B","D","F","H","J","L","N","P","R","T","V","X","Z","A","C","E","G","I","K","M","O","Q","S","U","W","Y"],
      ["M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z","A","B","C","D","E","F","G","H","I","J","K","L"],
      ["H","G","F","E","D","C","B","A","L","K","J","I","P","O","N","M","U","T","S","R","Q","X","W","V","Z","Y"]
  ])

expected = np.array([
     ["B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z","A"],
     ["A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z"],
     ["C","E","G","I","K","M","O","Q","S","U","W","Y","A","B","D","F","H","J","L","N","P","R","T","V","X","Z"], 
     ["N","O","P","Q","R","S","T","U","V","W","X","Y","Z","A","B","C","D","E","F","G","H","I","J","K","L","M"],
     ["I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z","A","B","C","D","E","F","G","H"]
 ])
Show Code
# define input and target tensors
import string

# 1. Create index mapping
char_to_ix = {char: i for i, char in enumerate(string.ascii_uppercase)}

# 2. Convert numpy array of chars to numpy array of integers
# using np.vectorize to apply the map to the whole array at once
input_indices = np.vectorize(char_to_ix.get)(inputs)
target_indices = np.vectorize(char_to_ix.get)(expected)

# 3. Convert to PyTorch LongTensor (Long is required for indices)
input_tensor = torch.from_numpy(input_indices).long().to(device)
target_tensor = torch.from_numpy(target_indices).long().to(device)

print("Input tensor shape:", input_tensor.shape)
print("Target tensor shape:", target_tensor.shape)
Input tensor shape: torch.Size([5, 26])
Target tensor shape: torch.Size([5, 26])

The shape (5, 26) comes from our input data (see the quick check below):

  • We have 5 sequences (rows) in the inputs and expected arrays.
  • Each sequence contains 26 characters (columns), one for each letter of the alphabet.
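
A quick sanity check of those shapes (a minimal sketch, assuming the inputs array and tensors defined above):

# 5 sequences, each 26 letters long, one integer index per letter
assert input_tensor.shape == (len(inputs), len(string.ascii_uppercase))
assert target_tensor.shape == input_tensor.shape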

2. Define RNN Model Using PyTorch

Show Code
class AlphabetRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super(AlphabetRNN, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x, hidden):
        embedded = self.embedding(x)
        rnn_out, hidden = self.rnn(embedded, hidden)
        output = self.fc(rnn_out)
        return output, hidden
    
    def init_hidden(self, batch_size):
        return torch.zeros(self.n_layers, batch_size, self.hidden_size).to(device)
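
To make the tensor shapes concrete, here is a short walk-through using a throwaway instance (a sketch with the same sizes used later: hidden_size=128, a batch of 5 sequences of length 26; the real model is created in section 3):

# Hypothetical instance just to inspect shapes
demo = AlphabetRNN(input_size=26, hidden_size=128, output_size=26).to(device)
demo_hidden = demo.init_hidden(batch_size=5)          # (1, 5, 128)
demo_out, demo_hidden = demo(input_tensor, demo_hidden)
print(demo_out.shape)     # torch.Size([5, 26, 26]) -> one 26-way score per position
print(demo_hidden.shape)  # torch.Size([1, 5, 128])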
Show Code
#Define training step function
def train_step(model, criterion, optimizer, input_tensor, target_tensor):
    model.train()
    batch_size = input_tensor.size(0)
    hidden = model.init_hidden(batch_size)
    
    optimizer.zero_grad()
    
    output, hidden = model(input_tensor, hidden)
    
    # CrossEntropyLoss expects logits of shape (N, C) and class indices of shape (N,),
    # so flatten (batch, seq_len, n_classes) -> (batch*seq_len, n_classes) and the targets to match
    loss = criterion(output.reshape(-1, output.size(-1)), target_tensor.reshape(-1))
    loss.backward()
    
    optimizer.step()
    
    return loss.item()

3. Training Process

Show Code
# define hyperparameters
n_characters = 26
hidden_size = 128
n_layers = 1
learning_rate = 0.002
epochs = 100


# Model, loss, optimizer
model = AlphabetRNN(n_characters, hidden_size, n_characters,
                n_layers=n_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

print(model)
AlphabetRNN(
  (embedding): Embedding(26, 128)
  (rnn): RNN(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=26, bias=True)
)
Show Code
all_losses = []
running_loss = 0.0


model.train()
for it in range(1, epochs + 1):
    
    loss = train_step(
        model=model,
        optimizer=optimizer,
        criterion=criterion,
        input_tensor=input_tensor,
        target_tensor=target_tensor,
    )
    running_loss += loss
    
    all_losses.append(loss)
    if it % 10 == 0:
        print(f"Epoch {it}/{epochs}, Loss: {loss:.4f}")
        
print("Training finished.")
    
Epoch 10/100, Loss: 1.7973
Epoch 20/100, Loss: 0.8749
Epoch 30/100, Loss: 0.3988
Epoch 40/100, Loss: 0.1792
Epoch 50/100, Loss: 0.0642
Epoch 60/100, Loss: 0.0268
Epoch 70/100, Loss: 0.0161
Epoch 80/100, Loss: 0.0114
Epoch 90/100, Loss: 0.0089
Epoch 100/100, Loss: 0.0074
Training finished.
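
matplotlib was imported at the start but not used yet; a minimal sketch to visualize the recorded losses (assumes all_losses from the loop above):

plt.figure(figsize=(8, 4))
plt.plot(all_losses)                 # one loss value per epoch
plt.xlabel("Epoch")
plt.ylabel("Cross-entropy loss")
plt.title("Training loss")
plt.show()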
To see what each layer receives and produces, we can register forward hooks that print basic tensor statistics.

Show Code
def forward_hook(name):
    def hook(module, inputs, outputs):
        print(f"\n๐Ÿ”น {name}")

        def _tensor_stats(t: torch.Tensor) -> str:
            shape = tuple(t.shape)
            dtype = t.dtype
            if t.is_floating_point() or t.is_complex():
                return f"shape={shape} dtype={dtype} mean={t.mean().item():.4f}"
            # integer/bool tensors (e.g., embedding indices)
            t_min = t.min().item() if t.numel() else "n/a"
            t_max = t.max().item() if t.numel() else "n/a"
            return f"shape={shape} dtype={dtype} min={t_min} max={t_max}"

        # inputs is a tuple
        for i, x in enumerate(inputs):
            if torch.is_tensor(x):
                print(f"  inp[{i}] {_tensor_stats(x)}")

        # outputs can be tensor or tuple
        if torch.is_tensor(outputs):
            print(f"  out {_tensor_stats(outputs)}")
        elif isinstance(outputs, tuple):
            for i, o in enumerate(outputs):
                if torch.is_tensor(o):
                    print(f"  out[{i}] {_tensor_stats(o)}")
    return hook
Show Code
model.embedding.register_forward_hook(forward_hook("Embedding"))
model.rnn.register_forward_hook(forward_hook("RNN"))
model.fc.register_forward_hook(forward_hook("Linear"))

# pick one sequence: shape (1, 26)
x = input_tensor[0:1]
hidden = model.init_hidden(x.size(0))

with torch.no_grad():
    output, hidden = model(x, hidden)

🔹 Embedding
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[57], line 10
      7 hidden = model.init_hidden(x.size(0))
      9 with torch.no_grad():
---> 10     output, hidden = model(x, hidden)

File c:\Users\user\Documents\GitHub\simpe-AI\venv\Lib\site-packages\torch\nn\modules\module.py:1775, in Module._wrapped_call_impl(self, *args, **kwargs)
   1773     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1774 else:
-> 1775     return self._call_impl(*args, **kwargs)

File c:\Users\user\Documents\GitHub\simpe-AI\venv\Lib\site-packages\torch\nn\modules\module.py:1786, in Module._call_impl(self, *args, **kwargs)
   1781 # If we don't have any hooks, we want to skip the rest of the logic in
   1782 # this function, and just call forward.
   1783 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1784         or _global_backward_pre_hooks or _global_backward_hooks
   1785         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1786     return forward_call(*args, **kwargs)
   1788 result = None
   1789 called_always_called_hooks = set()

Cell In[26], line 12, in AlphabetRNN.forward(self, x, hidden)
     11 def forward(self, x, hidden):
---> 12     embedded = self.embedding(x)
     13     rnn_out, hidden = self.rnn(embedded, hidden)
     14     output = self.fc(rnn_out)

File c:\Users\user\Documents\GitHub\simpe-AI\venv\Lib\site-packages\torch\nn\modules\module.py:1775, in Module._wrapped_call_impl(self, *args, **kwargs)
   1773     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1774 else:
-> 1775     return self._call_impl(*args, **kwargs)

File c:\Users\user\Documents\GitHub\simpe-AI\venv\Lib\site-packages\torch\nn\modules\module.py:1881, in Module._call_impl(self, *args, **kwargs)
   1878     return inner()
   1880 try:
-> 1881     return inner()
   1882 except Exception:
   1883     # run always called hooks if they have not already been run
   1884     # For now only forward hooks have the always_call option but perhaps
   1885     # this functionality should be added to full backward hooks as well.
   1886     for hook_id, hook in _global_forward_hooks.items():

File c:\Users\user\Documents\GitHub\simpe-AI\venv\Lib\site-packages\torch\nn\modules\module.py:1842, in Module._call_impl.<locals>.inner()
   1840     hook_result = hook(self, args, kwargs, result)
   1841 else:
-> 1842     hook_result = hook(self, args, result)
   1844 if hook_result is not None:
   1845     result = hook_result

Cell In[49], line 8, in forward_hook.<locals>.hook(module, inputs, outputs)
      6 for i, x in enumerate(inputs):
      7     if torch.is_tensor(x):
----> 8         print(f"  inp[{i}] shape={tuple(x.shape)} mean={x.mean():.4f}")
     10 # outputs can be tensor or tuple
     11 if torch.is_tensor(outputs):

RuntimeError: mean(): could not infer output dtype. Input dtype must be either a floating point or complex dtype. Got: Long
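
The traceback above was produced by an earlier version of the hook (Cell In[49]) that called .mean() on the Long index tensor fed to the Embedding layer; the _tensor_stats helper shown earlier avoids this by checking the dtype first. When re-running the inspection it also helps to keep the handles returned by register_forward_hook so the hooks can be detached afterwards (a sketch, assuming the model and forward_hook defined above):

handles = [
    model.embedding.register_forward_hook(forward_hook("Embedding")),
    model.rnn.register_forward_hook(forward_hook("RNN")),
    model.fc.register_forward_hook(forward_hook("Linear")),
]

x = input_tensor[0:1]                 # one sequence, shape (1, 26)
hidden = model.init_hidden(x.size(0))
with torch.no_grad():
    output, hidden = model(x, hidden)

# Detach the hooks so later cells (e.g. generation) run without the prints
for h in handles:
    h.remove()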
Show Code
string.ascii_uppercase
'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
Show Code
import string
import torch

char_to_ix = {c: i for i, c in enumerate(string.ascii_uppercase)}
ix_to_char = {i: c for i, c in enumerate(string.ascii_uppercase)}


@torch.no_grad()
def generate_next_letters(model, seed, n_next=5, *, temperature=1.0, greedy=True):
    """
    seed: list[str] like ["A","B","C"] OR a string like "ABC"
    n_next: how many next letters to generate
    temperature: >1 more random, <1 more confident (only used if greedy=False)
    greedy: if True uses argmax; if False samples from softmax( logits / temperature )

    returns: (seed_letters, generated_letters)
    """
    model.eval()
    device_ = next(model.parameters()).device

    # Normalize seed to list of uppercase letters
    if isinstance(seed, str):
        seed_letters = [c for c in seed.upper() if c.strip() != ""]
    else:
        seed_letters = [str(c).upper() for c in seed]

    if len(seed_letters) == 0:
        raise ValueError("Seed must contain at least 1 letter.")

    # Map seed -> indices (batch=1)
    seed_ix = [char_to_ix[c] for c in seed_letters]
    x = torch.tensor(seed_ix, dtype=torch.long,
                     device=device_).unsqueeze(0)  # (1, seq_len)

    # Create hidden on the correct device (avoid relying on global `device`)
    hidden = torch.zeros(model.n_layers, 1, model.hidden_size, device=device_)

    # Feed the whole seed to update hidden state
    logits, hidden = model(x, hidden)  # logits: (1, seq_len, 26)

    generated = []
    last_ix = x[:, -1:]  # (1, 1) last token of seed

    for _ in range(n_next):
        step_logits, hidden = model(last_ix, hidden)   # (1, 1, 26)
        step_logits = step_logits[:, -1, :]            # (1, 26)

        if greedy:
            next_ix = step_logits.argmax(dim=-1)       # (1,)
        else:
            probs = torch.softmax(
                step_logits / max(temperature, 1e-8), dim=-1)  # (1, 26)
            next_ix = torch.multinomial(
                probs, num_samples=1).squeeze(-1)        # (1,)

        next_char = ix_to_char[int(next_ix.item())]
        generated.append(next_char)

        last_ix = next_ix.unsqueeze(0)  # (1, 1)

    return seed_letters, generated

# Example:
seed_letters, next5 = generate_next_letters(model, ["A", "B", "I"], n_next=5)
print("Seed:", seed_letters, "Next:", next5)

seed_letters, next5 = generate_next_letters(model, "JKL", n_next=5)
print("Seed:", seed_letters, "Next:", next5)
Seed: ['A', 'B', 'I'] Next: ['E', 'F', 'G', 'H', 'I']
Seed: ['J', 'K', 'L'] Next: ['B', 'M', 'N', 'O', 'P']
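
The same function can also sample from the softmax distribution instead of taking the argmax, using the temperature and greedy parameters from its signature (output will vary from run to run):

# Sampling instead of greedy decoding; higher temperature -> more random picks
seed_letters, next5 = generate_next_letters(model, "ABC", n_next=5, greedy=False, temperature=0.8)
print("Seed:", seed_letters, "Next:", next5)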

4. Rebuilding the Model Using NumPy

4.1 Encoding

Map each letter to an integer index, and build the reverse map from index back to letter.

Show Code
char_to_ix = {c: i for i, c in enumerate(string.ascii_uppercase)}
ix_to_char = {i: c for i, c in enumerate(string.ascii_uppercase)}
Show Code
def string_to_one_hot(s):
    """
    s: string of uppercase letters, e.g. "ABC"
    returns: NumPy array of shape (len(s), 26) with one-hot encoding
    """
    indices = [char_to_ix[c] for c in s]
    # NumPy only: one row per character, a single 1 in the column of its index
    one_hot = np.zeros((len(s), len(string.ascii_uppercase)), dtype=np.float32)
    one_hot[np.arange(len(s)), indices] = 1
    
    return one_hot

string_to_one_hot("ABC")
array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)
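
Going the other way (from one-hot rows back to letters) is just an argmax per row followed by the reverse map; a small sketch using the ix_to_char mapping above (the helper name is ours):

def one_hot_to_string(one_hot):
    # argmax over the 26 columns recovers each letter's index
    return "".join(ix_to_char[int(i)] for i in one_hot.argmax(axis=1))

one_hot_to_string(string_to_one_hot("ABC"))  # -> 'ABC'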

4.2 Model architecture

Show Code
class VanillaRNN:
    """Vanilla RNN rebuilt with NumPy only (see the sketch below)."""
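
A minimal sketch of a NumPy-only vanilla RNN, assuming one-hot inputs from string_to_one_hot and the classic update h_t = tanh(W_xh x_t + W_hh h_{t-1} + b_h), y_t = W_hy h_t + b_y. The class name, weight names, and initialization below are our assumptions, chosen to mirror the PyTorch model above:

class NumpyVanillaRNN:
    def __init__(self, input_size=26, hidden_size=128, output_size=26, seed=0):
        rng = np.random.default_rng(seed)
        scale = 0.01
        self.W_xh = rng.standard_normal((hidden_size, input_size)) * scale   # input -> hidden
        self.W_hh = rng.standard_normal((hidden_size, hidden_size)) * scale  # hidden -> hidden
        self.W_hy = rng.standard_normal((output_size, hidden_size)) * scale  # hidden -> output
        self.b_h = np.zeros(hidden_size)
        self.b_y = np.zeros(output_size)
        self.hidden_size = hidden_size

    def forward(self, x_seq, h=None):
        """
        x_seq: (seq_len, 26) one-hot array, e.g. string_to_one_hot("ABC")
        returns: logits of shape (seq_len, 26) and the final hidden state
        """
        if h is None:
            h = np.zeros(self.hidden_size)
        logits = []
        for x_t in x_seq:
            h = np.tanh(self.W_xh @ x_t + self.W_hh @ h + self.b_h)  # recurrent update
            logits.append(self.W_hy @ h + self.b_y)                  # per-step scores
        return np.stack(logits), h


# Example: run the untrained sketch on a seed string
rnn_np = NumpyVanillaRNN()
logits, h = rnn_np.forward(string_to_one_hot("ABC"))
print(logits.shape)  # (3, 26)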
