Show Code
import numpy as np
import torch
import torch.nn as nn
In this tutorial, we will explore how to generate Arabic text using a Recurrent Neural Network (RNN). We will use PyTorch to build and train our model. The dataset we will use is a collection of Arabic text.
# Load the dataset: read the whole Arabic corpus into a single string.
with open('data/arabic_text.txt', mode='r', encoding='utf-8') as f:
    text = f.read()
print(f'Length of text: {len(text)} characters')

# Vocabulary: every distinct character that occurs in the text, sorted.
chars = sorted(set(text))
print(f'Unique characters: {len(chars)}')
print(f'Sample characters: {chars[10:20]}')
Length of text: 3296 characters
Unique characters: 47
Sample characters: ['ئ', 'ا', 'ب', 'ة', 'ت', 'ث', 'ج', 'ح', 'خ', 'د']
The first step after importing the data and extracting the unique characters is to create a mapping of characters to integers and vice versa. We will create two dictionaries: char_to_int and int_to_char, for example `char_to_int = {ch: i for i, ch in enumerate(chars)}` and `int_to_char = {i: ch for i, ch in enumerate(chars)}`. The char_to_int dictionary will map each unique character to a unique integer, while the int_to_char dictionary will do the reverse mapping.
Character to Integer Mapping: [('ئ', 10), ('ا', 11), ('ب', 12), ('ة', 13), ('ت', 14), ('ث', 15), ('ج', 16), ('ح', 17), ('خ', 18), ('د', 19)]
Notice how each unique character in the text is assigned a unique integer. This mapping will be used to convert the text into a format that can be fed into the RNN model for training.
# Build the training pairs with a sliding window over the text: each input is
# seq_length consecutive characters (as integer ids) and the target is the id
# of the single character that follows the window.
seq_length = 100
window_starts = range(len(text) - seq_length)
dataX = [
    [char_to_int[ch] for ch in text[start:start + seq_length]]
    for start in window_starts
]
dataY = [char_to_int[text[start + seq_length]] for start in window_starts]

print(f'Total Sequences: {len(dataX)}')
print(f'Sample Input Sequence: {dataX[0]}')
print(f'Sample Output Character: {int_to_char[dataY[0]]}')
Total Sequences: 3196
Sample Input Sequence: [31, 40, 1, 11, 34, 12, 19, 11, 40, 13, 4, 1, 33, 11, 36, 14, 1, 11, 34, 7, 21, 26, 1, 18, 11, 34, 40, 13, 1, 38, 31, 11, 21, 30, 13, 4, 1, 38, 33, 11, 36, 14, 1, 11, 34, 28, 34, 35, 13, 1, 14, 29, 34, 38, 1, 38, 16, 37, 1, 11, 34, 30, 35, 21, 2, 0, 11, 34, 29, 34, 35, 1, 36, 38, 21, 1, 38, 11, 34, 16, 37, 34, 1, 28, 34, 11, 35, 4, 1, 31, 11, 27, 34, 12, 1, 11, 34, 29, 34, 35]
Sample Output Character:
Input Tensor Shape: torch.Size([3196, 100])
Output Tensor Shape: torch.Size([3196])
We will define a simple RNN model using PyTorch.
## Defining the RNN Model
class CharRNN(nn.Module):
    """Character-level RNN that scores the next character of a sequence.

    A single ``nn.RNN`` layer (``batch_first=True``) reads the input sequence;
    the RNN output at the last timestep is passed through a linear layer to
    produce one logit per output class.

    Args:
        input_size: Number of features per timestep (the vocabulary size when
            the inputs are one-hot encoded characters).
        hidden_size: Size of the RNN hidden state.
        output_size: Number of logits produced by the final linear layer.
        test: When True, ``forward`` prints the intermediate tensor shapes.
    """

    def __init__(self, input_size, hidden_size, output_size, test=False):
        super().__init__()
        self.hidden_size = hidden_size
        # batch_first=True --> inputs/outputs are (batch, seq_length, features)
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.test = test

    def forward(self, x, hidden=None):
        """Run the RNN over ``x`` and return logits for the next character.

        Args:
            x: Float tensor of shape (batch, seq_length, input_size).
            hidden: Initial hidden state of shape (1, batch, hidden_size).
                When None, ``nn.RNN`` starts from an all-zero hidden state.

        Returns:
            A tuple ``(logits, hidden)``: logits of shape (batch, output_size)
            computed from the last timestep, and the final hidden state.
        """
        out, hidden = self.rnn(x, hidden)
        if self.test:
            print(f'RNN output shape (before fc): {out.shape}')
            print(f'Last timestep shape (before fc): {out[:, -1, :].shape}')
        # out is (batch, seq_length, hidden_size); only the last timestep is
        # fed to the linear layer, giving (batch, output_size).
        out = self.fc(out[:, -1, :])
        return out, hidden

    def init_hidden(self, batch_size, device='cpu'):
        """Return an all-zero initial hidden state of shape (layers, batch, hidden)."""
        # Allocate directly on the target device instead of creating the
        # tensor on the CPU and copying it afterwards.
        return torch.zeros(
            self.rnn.num_layers, batch_size, self.hidden_size, device=device
        )
# The model consumes one-hot characters and emits one logit per character,
# so the input and output sizes are both the vocabulary size.
input_size = output_size = len(chars)
hidden_size = 256
# test=True makes forward() print the intermediate tensor shapes.
model = CharRNN(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    test=True,
)
print(model)
CharRNN(
(rnn): RNN(47, 256, batch_first=True)
(fc): Linear(in_features=256, out_features=47, bias=True)
)
# Shape check: push the first training sequence through the model as a batch of one.
sample_indices = X[0].unsqueeze(dim=0)  # prepend the batch dimension
# One-hot encode the character ids and cast to float for the RNN.
sample_input = nn.functional.one_hot(sample_indices, num_classes=input_size).float()
print(f'Sample Input Shape: {sample_input.shape}')

hidden = model.init_hidden(batch_size=1, device='cpu')
output, hidden = model(sample_input, hidden)
print(f'Sample Output Shape after fc, the 256 got mapped to 47: {output.shape}, These are the logits; a scalar value for each character in the vocabulary.')
Sample Input Shape: torch.Size([1, 100, 47])
RNN output shape (before fc): torch.Size([1, 100, 256])
Last timestep shape (before fc): torch.Size([1, 256])
Sample Output Shape after fc, the 256 got mapped to 47: torch.Size([1, 47]), These are the logits; a scalar value for each character in the vocabulary.
What are logits?
They are raw unnormalized scores — a high number means the model thinks that character is more likely to come next. For example:
ا → 3.21 ← highest, model predicts this character next
ب → 1.05
ت → -0.43
...
ي → -2.11 ← lowest
At each timestep, the model takes two things — the current character vector (47) and the previous hidden state (256) — and produces a new hidden state (256).
timestep 1: char_1 [47] + h0 [256] → h1 [256]
timestep 2: char_2 [47] + h1 [256] → h2 [256]
timestep 3: char_3 [47] + h2 [256] → h3 [256]
...
timestep 100: char_100[47] + h99 [256] → h100[256]
So the model transforms each input vector of size 47 into a hidden state of size 256, which is rich enough to capture complex patterns in the text. Because each hidden state summarizes the characters seen so far, the model can learn which characters tend to follow a given context, enabling it to generate coherent text.
# Training setup: a fresh model (shape printing off), loss function and optimizer.
input_size = len(chars)
hidden_size = 256
output_size = len(chars)
model = CharRNN(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epochs = 100
batch_size = 64

for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    num_samples = 0
    # Visit every sample. Stopping at len(X) - batch_size would never train on
    # the tail of the data and would drop a whole batch whenever len(X) is an
    # exact multiple of batch_size.
    for i in range(0, len(X), batch_size):
        X_batch = X[i:i + batch_size]
        Y_batch = y[i:i + batch_size]
        current_batch_size = X_batch.size(0)  # the last batch may be smaller

        # Convert inputs to one-hot encoding
        X_batch_one_hot = nn.functional.one_hot(X_batch, num_classes=input_size).float()

        # Every batch starts from a zero hidden state sized to the actual batch
        hidden = model.init_hidden(current_batch_size, device='cpu')

        # Forward pass
        outputs, hidden = model(X_batch_one_hot, hidden)

        # Compute loss
        loss = criterion(outputs, Y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size so the epoch figure is a true
        # per-sample average even when the last batch is partial.
        total_loss += loss.item() * current_batch_size
        num_samples += current_batch_size

    avg_loss = total_loss / max(num_samples, 1)
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}')
# Output of the original run (the per-epoch log continues below):
# Epoch [1/100], Loss: 3.2428
Epoch [2/100], Loss: 3.1102
Epoch [3/100], Loss: 2.9580
Epoch [4/100], Loss: 2.7829
Epoch [5/100], Loss: 2.6635
Epoch [6/100], Loss: 2.5801
Epoch [7/100], Loss: 2.5208
Epoch [8/100], Loss: 2.4808
Epoch [9/100], Loss: 2.4332
Epoch [10/100], Loss: 2.3844
Epoch [11/100], Loss: 2.3411
Epoch [12/100], Loss: 2.2933
Epoch [13/100], Loss: 2.2481
Epoch [14/100], Loss: 2.2152
Epoch [15/100], Loss: 2.1748
Epoch [16/100], Loss: 2.1247
Epoch [17/100], Loss: 2.0934
Epoch [18/100], Loss: 2.0354
Epoch [19/100], Loss: 1.9731
Epoch [20/100], Loss: 1.9176
Epoch [21/100], Loss: 1.8771
Epoch [22/100], Loss: 1.8426
Epoch [23/100], Loss: 1.7679
Epoch [24/100], Loss: 1.7100
Epoch [25/100], Loss: 1.6673
Epoch [26/100], Loss: 1.6027
Epoch [27/100], Loss: 1.5730
Epoch [28/100], Loss: 1.5056
Epoch [29/100], Loss: 1.4663
Epoch [30/100], Loss: 1.3984
Epoch [31/100], Loss: 1.3092
Epoch [32/100], Loss: 1.2697
Epoch [33/100], Loss: 1.2105
Epoch [34/100], Loss: 1.1701
Epoch [35/100], Loss: 1.1481
Epoch [36/100], Loss: 1.1229
Epoch [37/100], Loss: 1.0642
Epoch [38/100], Loss: 0.9668
Epoch [39/100], Loss: 0.9169
Epoch [40/100], Loss: 0.8436
Epoch [41/100], Loss: 0.7723
Epoch [42/100], Loss: 0.7143
Epoch [43/100], Loss: 0.6618
Epoch [44/100], Loss: 0.6403
Epoch [45/100], Loss: 0.6210
Epoch [46/100], Loss: 0.5513
Epoch [47/100], Loss: 0.5120
Epoch [48/100], Loss: 0.4519
Epoch [49/100], Loss: 0.3936
Epoch [50/100], Loss: 0.3587
Epoch [51/100], Loss: 0.3240
Epoch [52/100], Loss: 0.2842
Epoch [53/100], Loss: 0.2711
Epoch [54/100], Loss: 0.2483
Epoch [55/100], Loss: 0.2326
Epoch [56/100], Loss: 0.2178
Epoch [57/100], Loss: 0.2032
Epoch [58/100], Loss: 0.1671
Epoch [59/100], Loss: 0.1473
Epoch [60/100], Loss: 0.1289
Epoch [61/100], Loss: 0.1103
Epoch [62/100], Loss: 0.0884
Epoch [63/100], Loss: 0.0664
Epoch [64/100], Loss: 0.0515
Epoch [65/100], Loss: 0.0397
Epoch [66/100], Loss: 0.0338
Epoch [67/100], Loss: 0.0301
Epoch [68/100], Loss: 0.0275
Epoch [69/100], Loss: 0.0254
Epoch [70/100], Loss: 0.0237
Epoch [71/100], Loss: 0.0222
Epoch [72/100], Loss: 0.0209
Epoch [73/100], Loss: 0.0197
Epoch [74/100], Loss: 0.0186
Epoch [75/100], Loss: 0.0176
Epoch [76/100], Loss: 0.0167
Epoch [77/100], Loss: 0.0159
Epoch [78/100], Loss: 0.0151
Epoch [79/100], Loss: 0.0144
Epoch [80/100], Loss: 0.0138
Epoch [81/100], Loss: 0.0132
Epoch [82/100], Loss: 0.0127
Epoch [83/100], Loss: 0.0122
Epoch [84/100], Loss: 0.0118
Epoch [85/100], Loss: 0.0114
Epoch [86/100], Loss: 0.0110
Epoch [87/100], Loss: 0.0104
Epoch [88/100], Loss: 0.0100
Epoch [89/100], Loss: 0.0096
Epoch [90/100], Loss: 0.0094
Epoch [91/100], Loss: 0.0093
Epoch [92/100], Loss: 0.0091
Epoch [93/100], Loss: 0.0089
Epoch [94/100], Loss: 0.0088
Epoch [95/100], Loss: 0.0086
Epoch [96/100], Loss: 0.0086
Epoch [97/100], Loss: 0.0082
Epoch [98/100], Loss: 0.0077
Epoch [99/100], Loss: 0.0074
Epoch [100/100], Loss: 0.0070
In the code below, we will take the trained model and generate new text based on a starting string. We will take the starting string, then predict the next 200 characters one by one, feeding the predicted character back into the model at each step.
def predict(model, start_str, predict_len=200, temperature=0.8):
    """Generate text by sampling characters from ``model`` one at a time.

    The model is first primed with ``start_str``; after that only the newly
    sampled character is fed back in, with the hidden state carrying the
    context. Re-feeding the whole window while also carrying the hidden state
    would make the model process the same characters again at every step.

    Relies on the module-level ``char_to_int``, ``int_to_char`` and
    ``input_size``.

    Args:
        model: Trained model exposing ``init_hidden(batch_size, device)`` and
            returning ``(logits, hidden)`` when called with ``(x, hidden)``.
        start_str: Non-empty seed text; every character must be a key of
            ``char_to_int``.
        predict_len: Number of characters to generate after the seed.
        temperature: Positive value the logits are divided by before the
            softmax; values below 1 sharpen the distribution.

    Returns:
        ``start_str`` followed by ``predict_len`` sampled characters.

    Raises:
        ValueError: If ``start_str`` is empty or ``temperature`` is not positive.
        KeyError: If ``start_str`` contains a character missing from ``char_to_int``.
    """
    if not start_str:
        raise ValueError('start_str must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    model.eval()
    # Run on whatever device the model lives on (CPU if it has no parameters).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Convert the starting string to indices: [1, seq_len] with a batch dimension.
    input_seq = [char_to_int[ch] for ch in start_str]
    input_tensor = torch.tensor(input_seq, device=device).unsqueeze(0)
    hidden = model.init_hidden(batch_size=1, device=device)
    pieces = [start_str]

    # Inference only: without no_grad the carried hidden state would keep the
    # autograd graph of every previous step alive.
    with torch.no_grad():
        for _ in range(predict_len):
            # One-hot encode the pending characters: [1, seq_len, input_size]
            x = nn.functional.one_hot(input_tensor, num_classes=input_size).float()

            # output holds the logits for the next character; hidden now
            # summarizes everything fed in so far.
            output, hidden = model(x, hidden)

            # Apply temperature, turn logits into probabilities, then sample.
            probs = torch.softmax(output / temperature, dim=1)
            next_char_idx = torch.multinomial(probs, num_samples=1).item()
            pieces.append(int_to_char[next_char_idx])

            # Only the new character is fed next; hidden carries the context.
            input_tensor = torch.tensor([[next_char_idx]], device=device)

    return ''.join(pieces)
# run it
print(predict(model, start_str='رحلة '))
رحلة تسنة النما تياتا تخد مع دق بع ال وادة المميا تلطفي نل على ال بعدل شيم.
الظيامت قليمة ول ال شبد.
الواحنياوم القا، تالم تجعل بل أب لي تل للباررة ال بطجه، ومعدائن الطاس يلطفي نل فلى اللبانرة وللم، وال يا
Comments