Arabic Text Generation with RNN

In this tutorial, we will explore how to generate Arabic text using a Recurrent Neural Network (RNN). We will use PyTorch to build and train our model. The dataset we will use is a collection of Arabic text.

Importing Libraries and Loading Data

Show Code
import numpy as np
import torch
import torch.nn as nn
Show Code
# Read the whole Arabic corpus into memory as one string.
with open('data/arabic_text.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
print(f'Length of text: {len(text)} characters')

# Vocabulary: every distinct character of the corpus, sorted so that the
# character order (and therefore every index derived from it) is reproducible.
chars = sorted(set(text))
print(f'Unique characters: {len(chars)}')
print(f'Sample characters: {chars[10:20]}')
Length of text: 3296 characters
Unique characters: 47
Sample characters: ['ئ', 'ا', 'ب', 'ة', 'ت', 'ث', 'ج', 'ح', 'خ', 'د']

The first step after importing the data and extracting the unique characters is to create a mapping of characters to integers and vice versa. We will create two dictionaries: char_to_int and int_to_char. The char_to_int dictionary will map each unique character to a unique integer, while the int_to_char dictionary will do the reverse mapping.

Show Code
# Index -> character table, used to turn predicted indices back into text.
int_to_char = dict(enumerate(chars))
# Character -> index table (the inverse of the one above), used to encode text.
char_to_int = {symbol: position for position, symbol in int_to_char.items()}
print(f'Character to Integer Mapping: {list(char_to_int.items())[10:20]}')
Character to Integer Mapping: [('ئ', 10), ('ا', 11), ('ب', 12), ('ة', 13), ('ت', 14), ('ث', 15), ('ج', 16), ('ح', 17), ('خ', 18), ('د', 19)]

Notice how each unique character in the text is assigned a unique integer. This mapping will be used to convert the text into a format that can be fed into the RNN model for training.

Show Code
# Build (input window, next character) training pairs with a sliding window
# that advances one character at a time.
seq_length = 100
if len(text) <= seq_length:
    # Without this check the script would fail later with an opaque IndexError.
    raise ValueError(
        f'Text has {len(text)} characters; more than seq_length={seq_length} '
        'are required to build at least one training sequence.'
    )

# Encode the corpus once. Every window below is then a plain slice of this
# list instead of re-encoding each character seq_length times.
encoded_text = [char_to_int[char] for char in text]

# dataX[i] holds the indices of text[i:i + seq_length];
# dataY[i] is the index of the character that immediately follows that window.
num_sequences = len(encoded_text) - seq_length
dataX = [encoded_text[i:i + seq_length] for i in range(num_sequences)]
dataY = encoded_text[seq_length:]

print(f'Total Sequences: {len(dataX)}')
print(f'Sample Input Sequence: {dataX[0]}')
print(f'Sample Output Character: {int_to_char[dataY[0]]}')
Total Sequences: 3196
Sample Input Sequence: [31, 40, 1, 11, 34, 12, 19, 11, 40, 13, 4, 1, 33, 11, 36, 14, 1, 11, 34, 7, 21, 26, 1, 18, 11, 34, 40, 13, 1, 38, 31, 11, 21, 30, 13, 4, 1, 38, 33, 11, 36, 14, 1, 11, 34, 28, 34, 35, 13, 1, 14, 29, 34, 38, 1, 38, 16, 37, 1, 11, 34, 30, 35, 21, 2, 0, 11, 34, 29, 34, 35, 1, 36, 38, 21, 1, 38, 11, 34, 16, 37, 34, 1, 28, 34, 11, 35, 4, 1, 31, 11, 27, 34, 12, 1, 11, 34, 29, 34, 35]
Sample Output Character:  
Show Code
# Pack the Python lists into tensors: one row of X per input window, and one
# target index in y per row. int64 is used because the indices are later fed to
# one_hot and used as CrossEntropyLoss class targets.
X = torch.tensor(dataX, dtype=torch.int64)
y = torch.tensor(dataY, dtype=torch.int64)

print(f'Input Tensor Shape: {X.size()}')
print(f'Output Tensor Shape: {y.size()}')
Input Tensor Shape: torch.Size([3196, 100])
Output Tensor Shape: torch.Size([3196])

Defining the RNN Model

We will define a simple RNN model using PyTorch.

Show Code
## Defining the RNN Model
class CharRNN(nn.Module):
    """Character-level RNN that scores the next character of a sequence.

    A single ``nn.RNN`` layer reads a batch of input vectors, and a linear
    layer maps the RNN output of the last timestep to one logit per class.

    Args:
        input_size: Size of each input vector (the vocabulary size when the
            input is one-hot encoded).
        hidden_size: Number of features in the RNN hidden state.
        output_size: Number of logits produced for each sequence.
        test: When True, ``forward`` prints the intermediate tensor shapes.
    """

    def __init__(self, input_size, hidden_size, output_size, test=False):
        super().__init__()  # Let nn.Module set up parameter/submodule tracking.
        self.hidden_size = hidden_size
        # batch_first=True --> tensors are laid out as (batch, seq_length, features)
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.test = test

    def forward(self, x, hidden=None):
        """Run the RNN over ``x`` and score the next character.

        Args:
            x: Float tensor of shape (batch, seq_length, input_size).
            hidden: Initial hidden state of shape (num_layers, batch,
                hidden_size). When None, ``nn.RNN`` starts from zeros.

        Returns:
            A tuple ``(logits, hidden)``: logits of shape (batch, output_size)
            computed from the last timestep only, and the final hidden state.
        """
        out, hidden = self.rnn(x, hidden)  # out: (batch, seq_length, hidden_size)
        # Only the last timestep has seen the whole sequence, so it alone is
        # used to predict the character that follows.
        last_step = out[:, -1, :]
        if self.test:
            print(f'RNN output shape (before fc): {out.shape}')
            print(f'Last timestep shape (before fc): {last_step.shape}')
        return self.fc(last_step), hidden

    def init_hidden(self, batch_size, device=None):
        """Return an all-zero initial hidden state.

        Args:
            batch_size: Number of sequences in the batch.
            device: Device to allocate the tensor on; None uses PyTorch's
                current default device.

        Returns:
            Zero tensor of shape (num_layers, batch_size, hidden_size) with the
            same dtype as the model weights.
        """
        # Allocate directly on the target device instead of creating the
        # tensor on the CPU and copying it afterwards.
        return torch.zeros(
            self.rnn.num_layers,
            batch_size,
            self.hidden_size,
            dtype=self.fc.weight.dtype,
            device=device,
        )


# Inputs are one-hot vectors over the vocabulary and the model emits one logit
# per vocabulary entry, so both sizes equal the number of unique characters.
input_size = output_size = len(chars)
hidden_size = 256
# test=True makes forward() print the intermediate tensor shapes.
model = CharRNN(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    test=True,
)
print(model)
CharRNN(
  (rnn): RNN(47, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=47, bias=True)
)
Show Code
# Smoke test: push one sequence through the untrained model to inspect shapes.
first_window = X[:1]  # slicing (rather than indexing) keeps a batch dimension of 1
sample_input = nn.functional.one_hot(first_window, num_classes=input_size).to(torch.float32)  # one-hot encode
print(f'Sample Input Shape: {sample_input.shape}')
hidden = model.init_hidden(batch_size=1, device='cpu')
output, hidden = model(sample_input, hidden)
print(f'Sample Output Shape after fc, the 256 got mapped to 47: {output.shape}, These are the logits; a scalar value for each character in the vocabulary.')
Sample Input Shape: torch.Size([1, 100, 47])
RNN output shape (before fc): torch.Size([1, 100, 256])
Last timestep shape (before fc): torch.Size([1, 256])
Sample Output Shape after fc, the 256 got mapped to 47: torch.Size([1, 47]), These are the logits; a scalar value for each character in the vocabulary.

What are logits?

They are raw unnormalized scores — a high number means the model thinks that character is more likely to come next. For example:

ا  →  3.21   ← highest, model predicts this character next
ب  →  1.05
ت  → -0.43
...
ي  → -2.11   ← lowest

At each timestep, the model takes two things — the current character vector (47) and the previous hidden state (256) — and produces a new hidden state (256).

timestep 1:   char_1 [47]  +  h0 [256]  →  h1 [256]
timestep 2:   char_2 [47]  +  h1 [256]  →  h2 [256]
timestep 3:   char_3 [47]  +  h2 [256]  →  h3 [256]
...
timestep 100: char_100[47] + h99 [256]  →  h100[256]

So the model transforms each input vector of size 47 into a hidden state of size 256, which is rich enough to capture complex patterns in the text. Because each hidden state summarizes all the characters seen so far, the model can learn which character is likely to follow a given context, enabling it to generate coherent text.

Training the Model

Show Code
# Model dimensions: characters are one-hot encoded on the way in and scored
# individually on the way out, so input and output both span the vocabulary.
input_size = len(chars)
hidden_size = 256
output_size = len(chars)

model = CharRNN(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


epochs = 100
batch_size = 64

num_samples = len(X)
if num_samples == 0:
    raise ValueError('No training sequences available; X is empty.')

for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    # Visit every sample, including the final (possibly smaller) batch.
    for start in range(0, num_samples, batch_size):
        X_batch = X[start:start + batch_size]
        Y_batch = y[start:start + batch_size]
        current_batch_size = X_batch.size(0)

        # Convert inputs to one-hot encoding
        X_batch_one_hot = nn.functional.one_hot(X_batch, num_classes=input_size).float()

        # Fresh zero hidden state for each batch, sized to the actual batch so
        # a short last batch does not cause a shape mismatch.
        hidden = model.init_hidden(current_batch_size, device=X_batch.device)

        # Forward pass
        outputs, hidden = model(X_batch_one_hot, hidden)

        # Compute loss
        loss = criterion(outputs, Y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # The criterion returns the batch mean; weight it by the batch size so
        # the epoch figure is a true per-sample average.
        total_loss += loss.item() * current_batch_size

    avg_loss = total_loss / num_samples
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}')
Epoch [1/100], Loss: 3.2428
Epoch [2/100], Loss: 3.1102
Epoch [3/100], Loss: 2.9580
Epoch [4/100], Loss: 2.7829
Epoch [5/100], Loss: 2.6635
Epoch [6/100], Loss: 2.5801
Epoch [7/100], Loss: 2.5208
Epoch [8/100], Loss: 2.4808
Epoch [9/100], Loss: 2.4332
Epoch [10/100], Loss: 2.3844
Epoch [11/100], Loss: 2.3411
Epoch [12/100], Loss: 2.2933
Epoch [13/100], Loss: 2.2481
Epoch [14/100], Loss: 2.2152
Epoch [15/100], Loss: 2.1748
Epoch [16/100], Loss: 2.1247
Epoch [17/100], Loss: 2.0934
Epoch [18/100], Loss: 2.0354
Epoch [19/100], Loss: 1.9731
Epoch [20/100], Loss: 1.9176
Epoch [21/100], Loss: 1.8771
Epoch [22/100], Loss: 1.8426
Epoch [23/100], Loss: 1.7679
Epoch [24/100], Loss: 1.7100
Epoch [25/100], Loss: 1.6673
Epoch [26/100], Loss: 1.6027
Epoch [27/100], Loss: 1.5730
Epoch [28/100], Loss: 1.5056
Epoch [29/100], Loss: 1.4663
Epoch [30/100], Loss: 1.3984
Epoch [31/100], Loss: 1.3092
Epoch [32/100], Loss: 1.2697
Epoch [33/100], Loss: 1.2105
Epoch [34/100], Loss: 1.1701
Epoch [35/100], Loss: 1.1481
Epoch [36/100], Loss: 1.1229
Epoch [37/100], Loss: 1.0642
Epoch [38/100], Loss: 0.9668
Epoch [39/100], Loss: 0.9169
Epoch [40/100], Loss: 0.8436
Epoch [41/100], Loss: 0.7723
Epoch [42/100], Loss: 0.7143
Epoch [43/100], Loss: 0.6618
Epoch [44/100], Loss: 0.6403
Epoch [45/100], Loss: 0.6210
Epoch [46/100], Loss: 0.5513
Epoch [47/100], Loss: 0.5120
Epoch [48/100], Loss: 0.4519
Epoch [49/100], Loss: 0.3936
Epoch [50/100], Loss: 0.3587
Epoch [51/100], Loss: 0.3240
Epoch [52/100], Loss: 0.2842
Epoch [53/100], Loss: 0.2711
Epoch [54/100], Loss: 0.2483
Epoch [55/100], Loss: 0.2326
Epoch [56/100], Loss: 0.2178
Epoch [57/100], Loss: 0.2032
Epoch [58/100], Loss: 0.1671
Epoch [59/100], Loss: 0.1473
Epoch [60/100], Loss: 0.1289
Epoch [61/100], Loss: 0.1103
Epoch [62/100], Loss: 0.0884
Epoch [63/100], Loss: 0.0664
Epoch [64/100], Loss: 0.0515
Epoch [65/100], Loss: 0.0397
Epoch [66/100], Loss: 0.0338
Epoch [67/100], Loss: 0.0301
Epoch [68/100], Loss: 0.0275
Epoch [69/100], Loss: 0.0254
Epoch [70/100], Loss: 0.0237
Epoch [71/100], Loss: 0.0222
Epoch [72/100], Loss: 0.0209
Epoch [73/100], Loss: 0.0197
Epoch [74/100], Loss: 0.0186
Epoch [75/100], Loss: 0.0176
Epoch [76/100], Loss: 0.0167
Epoch [77/100], Loss: 0.0159
Epoch [78/100], Loss: 0.0151
Epoch [79/100], Loss: 0.0144
Epoch [80/100], Loss: 0.0138
Epoch [81/100], Loss: 0.0132
Epoch [82/100], Loss: 0.0127
Epoch [83/100], Loss: 0.0122
Epoch [84/100], Loss: 0.0118
Epoch [85/100], Loss: 0.0114
Epoch [86/100], Loss: 0.0110
Epoch [87/100], Loss: 0.0104
Epoch [88/100], Loss: 0.0100
Epoch [89/100], Loss: 0.0096
Epoch [90/100], Loss: 0.0094
Epoch [91/100], Loss: 0.0093
Epoch [92/100], Loss: 0.0091
Epoch [93/100], Loss: 0.0089
Epoch [94/100], Loss: 0.0088
Epoch [95/100], Loss: 0.0086
Epoch [96/100], Loss: 0.0086
Epoch [97/100], Loss: 0.0082
Epoch [98/100], Loss: 0.0077
Epoch [99/100], Loss: 0.0074
Epoch [100/100], Loss: 0.0070

In the code below, we take the trained model and generate new text from a starting string. We predict the next 200 characters one by one, feeding each predicted character back into the model at each step.

Show Code
def predict(model, start_str, predict_len=200, temperature=0.8):
    """Generate text by sampling one character at a time from ``model``.

    The prompt is fed through the model once to build up the hidden state.
    After that, only the newly sampled character is fed back at each step.
    Encoding and decoding use the module-level ``char_to_int`` and
    ``int_to_char`` tables.

    Args:
        model: Trained model exposing ``init_hidden(batch_size, device)`` and
            returning ``(logits, hidden)`` when called with ``(x, hidden)``.
        start_str: Non-empty prompt; every character must be in the vocabulary.
        predict_len: Number of characters to generate after the prompt.
        temperature: Positive scaling factor for the logits. Values below 1
            sharpen the distribution, values above 1 flatten it.

    Returns:
        ``start_str`` followed by ``predict_len`` generated characters.

    Raises:
        ValueError: If ``start_str`` is empty or ``temperature`` is not positive.
        KeyError: If ``start_str`` contains a character outside the vocabulary.
    """
    if not start_str:
        raise ValueError('start_str must contain at least one character.')
    if temperature <= 0:
        raise ValueError('temperature must be positive.')

    model.eval()
    # Keep every tensor on the same device as the model's weights.
    device = next(model.parameters()).device
    num_classes = len(char_to_int)

    # Convert the starting string to indices and add a batch dimension.
    input_seq = [char_to_int[ch] for ch in start_str]
    input_tensor = torch.tensor(input_seq, dtype=torch.long, device=device).unsqueeze(0)  # [1, seq_len]

    hidden = model.init_hidden(batch_size=1, device=device)
    generated = [start_str]

    # Inference only: without no_grad the autograd graph would keep growing
    # through the hidden state carried from step to step.
    with torch.no_grad():
        for _ in range(predict_len):
            # One-hot encode the pending input: the whole prompt on the first
            # step, a single character afterwards.
            x = nn.functional.one_hot(input_tensor, num_classes=num_classes).float()

            # output: [1, vocab] logits for the character that comes next
            output, hidden = model(x, hidden)

            # Apply temperature, then sample from the resulting distribution.
            probs = torch.softmax(output / temperature, dim=1)
            next_char_idx = torch.multinomial(probs, num_samples=1).item()

            generated.append(int_to_char[next_char_idx])

            # The hidden state already summarizes everything read so far, so
            # only the new character is fed next. Re-feeding earlier
            # characters would make the model read them twice.
            input_tensor = torch.tensor([[next_char_idx]], dtype=torch.long, device=device)

    return ''.join(generated)


# Generate a continuation of an Arabic prompt using the default length and temperature.
generated_text = predict(model, start_str='رحلة ')
print(generated_text)
رحلة تسنة النما تياتا تخد مع دق بع ال وادة المميا تلطفي نل على ال بعدل شيم.
الظيامت قليمة ول ال شبد.
الواحنياوم القا، تالم تجعل بل أب لي تل للباررة ال بطجه، ومعدائن الطاس يلطفي نل فلى اللبانرة وللم، وال يا
Show Code
import pickle
import torch

# Persist only the learned weights (the state dict), not the whole model object.
torch.save(model.state_dict(), 'arabic_rnn.pth')

# Store the vocabulary tables as well, so the same character/index mapping can
# be restored later. NOTE: only unpickle this file from a trusted source.
vocab = {
    'chars': chars,
    'char_to_int': char_to_int,
    'int_to_char': int_to_char,
}
with open('vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)

Comments