# hinglishease_trained_hinglish.py

# -*- coding: utf-8 -*-
"""HinglishEase_Trained_Hinglish.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1eeuoe09oX1zagzz4jbsJFap2TXr1Tz-z
"""

import os
import random
import re
import string
import time

import nltk.translate.bleu_score as bleu
import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import drive
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Mount Google Drive so the dataset paths below resolve inside Colab.
drive.mount("/content/drive")

eng_hin = pd.read_csv(
    "/content/drive/MyDrive/PRL/task/data/synthetic-dataset/train.csv"
)
# Drop incomplete rows *before* deriving `df`, so both frames hold the same
# complete rows and no NaN cell can reach the text preprocessing functions.
eng_hin.dropna(inplace=True)
eng_hin.head()
eng_hin.shape

# create a new dataframe holding only the english and hindi columns
df = pd.DataFrame()
df["english"] = eng_hin["English"]
df["hindi"] = eng_hin["Hindi"]
df.head()

import pickle

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load this pickle from a location you trust.
with open(
    "/content/drive/MyDrive/PRL/task/data/human-generated-dataset/train_human_generated.pkl",
    "rb",
) as f:
    human_generated = pickle.load(f)
    print("Human generated dataset size:", len(human_generated))

human_generated

exclude = set(string.punctuation)  # Set of all ASCII punctuation characters
# Translation table for str.translate that deletes every ASCII digit
remove_digits = str.maketrans("", "", string.digits)


def preprocess(text):
    """
    Normalise an English sentence and wrap it in sequence markers.

    The sentence is lower-cased, apostrophes and every character found in the
    module-level ``exclude`` set are removed, digits are deleted through the
    module-level ``remove_digits`` translation table, surrounding whitespace
    is trimmed and runs of spaces are collapsed to a single space.

    Args:
        text (str): The sentence to clean.

    Returns:
        str: The cleaned sentence enclosed in "<start>" and "<end>" markers.
    """
    lowered = text.lower()
    without_quotes = lowered.replace("'", "")
    kept_chars = [ch for ch in without_quotes if ch not in exclude]
    cleaned = "".join(kept_chars).translate(remove_digits).strip()
    # Only runs of the space character are collapsed (not tabs/newlines).
    cleaned = re.sub(" +", " ", cleaned)
    return "<start> " + cleaned + " <end>"


def preprocess_hin(text):
    """
    Normalise a Hindi sentence and wrap it in sequence markers.

    Unlike ``preprocess`` the text is not lower-cased. Apostrophes and every
    character found in the module-level ``exclude`` set are removed,
    Devanagari digits are deleted, surrounding whitespace is trimmed and runs
    of spaces are collapsed to a single space.

    Args:
        text (str): The sentence to clean.

    Returns:
        str: The cleaned sentence enclosed in "<start>" and "<end>" markers.
    """
    without_quotes = text.replace("'", "")
    kept_chars = [ch for ch in without_quotes if ch not in exclude]
    cleaned = "".join(kept_chars)
    # NOTE(review): only Devanagari digits are stripped here; ASCII digits
    # pass through unchanged -- confirm that is intended.
    cleaned = re.sub("[२३०८१५७९४६]", "", cleaned)
    cleaned = cleaned.strip()
    # Only runs of the space character are collapsed (not tabs/newlines).
    cleaned = re.sub(" +", " ", cleaned)
    return "<start> " + cleaned + " <end>"


# Read the raw text from `eng_hin` itself: its incomplete rows have been
# dropped, so the preprocessing functions never receive a NaN cell.
# The cleaned text is stored in new lower-case columns next to the raw ones.
eng_hin["english"] = eng_hin["English"].apply(preprocess)
eng_hin["hindi"] = eng_hin["Hindi"].apply(preprocess_hin)

eng_hin.head()


def tokenize(lang, maxlen=20):
    """
    Fit a tokenizer on the given sentences and return them as padded id rows.

    Args:
        lang (sequence of str): Sentences to fit the tokenizer on and encode.
        maxlen (int): Length every encoded sentence is padded or truncated
            to. Defaults to 20.

    Returns:
        tuple: A tuple containing:
            - tensor (np.ndarray): int32 array with one row of token ids per
              sentence, as returned by ``pad_sequences``.
            - lang_tokenizer (tf.keras.preprocessing.text.Tokenizer): The
              fitted tokenizer.

    Steps:
    1. Create a tokenizer with ``filters=""`` so no characters are stripped
       (markers such as "<start>" keep their angle brackets).
    2. Fit it on ``lang``.
    3. Convert every sentence to a list of token ids.
    4. Pad with zeros at the end ("post") up to ``maxlen``.

    NOTE(review): ``truncating`` is not passed to ``pad_sequences``, so the
    Keras default applies; that default is believed to be "pre", which would
    cut tokens from the *front* of sentences longer than ``maxlen`` --
    confirm this is intended.

    Example:
    >>> sentences = ["<start> hello world <end>", "<start> hello <end>"]
    >>> tensor, tokenizer = tokenize(sentences)
    >>> tensor.shape
    (2, 20)
    """
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters="")
    lang_tokenizer.fit_on_texts(lang)

    sequences = lang_tokenizer.texts_to_sequences(lang)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(
        sequences, padding="post", maxlen=maxlen, dtype="int32"
    )

    return tensor, lang_tokenizer


def load_dataset():
    """
    Tokenize the preprocessed columns of the module-level ``eng_hin`` frame.

    Returns:
        tuple: ``(input_tensor, target_tensor, inp_lang_tokenizer,
        targ_lang_tokenizer)`` where the input side comes from the
        "english" column and the target side from the "hindi" column.
    """
    english_ids, english_tokenizer = tokenize(eng_hin["english"].values)
    hindi_ids, hindi_tokenizer = tokenize(eng_hin["hindi"].values)
    encoded = (english_ids, hindi_ids)
    tokenizers = (english_tokenizer, hindi_tokenizer)
    return encoded + tokenizers


input_tensor, target_tensor, inp_lang, targ_lang = load_dataset()

max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

(
    input_tensor_train,
    input_tensor_val,
    target_tensor_train,
    target_tensor_val,
) = train_test_split(input_tensor, target_tensor, test_size=0.2)

print(
    len(input_tensor_train),
    len(target_tensor_train),
    len(input_tensor_val),
    len(target_tensor_val),
)

BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 32
N_BATCH = BUFFER_SIZE // BATCH_SIZE
embedding_dim = 256
units = 1024
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE

vocab_inp_size = len(inp_lang.word_index.keys())
vocab_tar_size = len(targ_lang.word_index.keys())

dataset = tf.data.Dataset.from_tensor_slices(
    (input_tensor_train, target_tensor_train)
).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

embeddings_index = dict()
f = open(
    "/content/drive/MyDrive/PRL/task/data/synthetic-dataset/glove.6B.300d.txt"
)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype="float32")
    embeddings_index[word] = coefs
f.close()

embedding_matrix = np.zeros((vocab_inp_size + 1, 300))
for word, i in inp_lang.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


class Encoder(tf.keras.Model):
    """
    Encoder model for sequence-to-sequence tasks.

    Embeds a batch of token ids with a frozen (``trainable=False``) embedding
    layer and runs the result through a single GRU that returns both the
    per-step outputs and the final state.

    Args:
        vocab_size (int): Number of rows in the embedding table.
        embedding_dim (int): The dimension of word embeddings.
        enc_units (int): The number of units in the GRU layer.
        batch_sz (int): Batch size used by ``initialize_hidden_state``.

    Attributes:
        batch_sz (int): The batch size for training.
        enc_units (int): The number of units in the GRU layer.
        embedding (tf.keras.layers.Embedding): The embedding layer, named
            "embedding_layer_encoder". This class does not load any weights
            into it; they have to be set from outside.
        gru (tf.keras.layers.GRU): The GRU layer for encoding sequences.

    Methods:
        call(x, hidden): Forward pass through the encoder.
        initialize_hidden_state(): Initializes the hidden state to zeros.

    Example:
    >>> encoder = Encoder(vocab_size=1000, embedding_dim=256, enc_units=512, batch_sz=64)
    >>> hidden_state = encoder.initialize_hidden_state()
    >>> outputs, state = encoder(input_sequence, hidden_state)
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            name="embedding_layer_encoder",
            trainable=False,
        )
        # Size the GRU from the constructor argument rather than a
        # module-level global, so the layer always matches the shape produced
        # by initialize_hidden_state().
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_activation="sigmoid",
            recurrent_initializer="glorot_uniform",
        )

    def call(self, x, hidden):
        """
        Forward pass through the encoder.

        Args:
            x (tf.Tensor): Batch of token-id sequences
                (presumably shaped (batch, seq_len) -- TODO confirm).
            hidden (tf.Tensor): Initial hidden state for the GRU.

        Returns:
            tf.Tensor: Encoder outputs for every time step.
            tf.Tensor: Final encoder state.
        """
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        """
        Initializes the hidden state to zeros.

        Returns:
            tf.Tensor: Zero tensor of shape (batch_sz, enc_units).
        """
        return tf.zeros((self.batch_sz, self.enc_units))


class Decoder(tf.keras.Model):
    """
    Decoder model for sequence-to-sequence tasks with attention mechanism.

    For each call it scores every encoder output against the given hidden
    state (additive attention through ``W1``, ``W2`` and ``V``), builds a
    context vector as the attention-weighted sum of the encoder outputs,
    concatenates it with the embedded input token and feeds the result to a
    GRU followed by a dense projection onto the vocabulary.

    Args:
        vocab_size (int): The size of the target vocabulary.
        embedding_dim (int): The dimension of word embeddings.
        dec_units (int): The number of units in the GRU layer.
        batch_sz (int): Batch size used by ``initialize_hidden_state``.

    Attributes:
        batch_sz (int): The batch size for training.
        dec_units (int): The number of units in the GRU layer.
        embedding (tf.keras.layers.Embedding): The embedding layer.
        gru (tf.keras.layers.GRU): The GRU layer for decoding sequences.
        fc (tf.keras.layers.Dense): Projects GRU output to vocabulary logits.
        W1 (tf.keras.layers.Dense): Attention projection of encoder outputs.
        W2 (tf.keras.layers.Dense): Attention projection of the hidden state.
        V (tf.keras.layers.Dense): Reduces the combined projection to a score.

    Methods:
        call(x, hidden, enc_output): Forward pass through the decoder.
        initialize_hidden_state(): Initializes the hidden state to zeros.

    Example:
    >>> decoder = Decoder(vocab_size=1000, embedding_dim=256, dec_units=512, batch_sz=64)
    >>> hidden_state = decoder.initialize_hidden_state()
    >>> output, state, attention = decoder(target_sequence, hidden_state, encoder_output)
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # Size the GRU from the constructor argument rather than a
        # module-level global, so it always agrees with the attention layers
        # and with initialize_hidden_state().
        self.gru = tf.keras.layers.GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_activation="sigmoid",
            recurrent_initializer="glorot_uniform",
        )
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.W1 = tf.keras.layers.Dense(self.dec_units)
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, x, hidden, enc_output):
        """
        Forward pass through the decoder.

        Args:
            x (tf.Tensor): Input token ids (presumably one time step per
                example, i.e. (batch, 1) -- TODO confirm).
            hidden (tf.Tensor): Hidden state used to compute attention.
            enc_output (tf.Tensor): Encoder output sequence.

        Returns:
            tf.Tensor: Vocabulary logits, flattened to 2-D.
            tf.Tensor: Decoder GRU state.
            tf.Tensor: Attention weights.
        """
        # Add a time axis so the hidden state broadcasts against every
        # encoder time step.
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        score = self.V(
            tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis))
        )

        # Normalise the scores along axis 1 (the encoder time axis).
        attention_weights = tf.nn.softmax(score, axis=1)

        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)

        x = self.embedding(x)

        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # NOTE(review): the GRU is called without initial_state, so `hidden`
        # influences the output only through the attention context -- confirm
        # this is intended.
        output, state = self.gru(x)

        output = tf.reshape(output, (-1, output.shape[2]))

        x = self.fc(output)

        return x, state, attention_weights

    def initialize_hidden_state(self):
        """
        Initializes the hidden state to zeros.

        Returns:
            tf.Tensor: Zero tensor of shape (batch_sz, dec_units).
        """
        return tf.zeros((self.batch_sz, self.dec_units))


# Reset Keras' global state before building the models.
tf.keras.backend.clear_session()

# The "+ 1" gives each embedding table a row for id 0, the value used for
# padding. The encoder width (300) matches the width of `embedding_matrix`.
encoder = Encoder(
    vocab_size=vocab_inp_size + 1,
    embedding_dim=300,
    enc_units=units,
    batch_sz=BATCH_SIZE,
)
decoder = Decoder(
    vocab_size=vocab_tar_size + 1,
    embedding_dim=embedding_dim,
    dec_units=units,
    batch_sz=BATCH_SIZE,
)

optimizer = tf.keras.optimizers.Adam()
# reduction="none" keeps one loss value per example so padding can be masked
# out afterwards.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction="none",
    from_logits=True,
)


def loss_function(real, pred):
    """
    Compute the padding-masked loss for one decoding step.

    Args:
        real (tf.Tensor): Target token ids; id 0 is treated as padding.
        pred (tf.Tensor): Predicted logits for those targets.

    Returns:
        tf.Tensor: Scalar mean of the masked per-example losses.

    The per-example losses come from the module-level ``loss_object``.
    Entries whose target id is 0 are multiplied by zero before averaging;
    the mean is still taken over all entries, masked ones included.

    Example:
    >>> real = tf.constant([1, 0])
    >>> logits = tf.random.normal((2, 5))
    >>> loss = loss_function(real, logits)
    """
    per_example = loss_object(real, pred)
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_example.dtype)
    return tf.reduce_mean(per_example * keep)


# Checkpoints are written as "<checkpoint_dir>/ckpt-<n>" style files; the
# optimizer and both models are tracked together.
checkpoint_dir = "./training_checkpoints"
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    encoder=encoder,
    decoder=decoder,
)


@tf.function
def train_step(inp, targ, enc_hidden):
    """
    Run one optimisation step on a single batch with teacher forcing.

    Args:
        inp (tf.Tensor): Batch of input token-id sequences.
        targ (tf.Tensor): Batch of target token-id sequences.
        enc_hidden (tf.Tensor): The initial encoder hidden state.

    Returns:
        tf.Tensor: The summed step loss divided by the target length.

    The encoder is run once, then the decoder is stepped through target
    positions 1..end. At every step the ground-truth token of that position
    is scored against the prediction and then fed in as the next decoder
    input. Gradients of the summed loss are applied to the trainable
    variables of both models through the module-level ``optimizer``.

    The decoder is seeded with ``BATCH_SIZE`` copies of the "<start>" id, so
    the batch must contain exactly ``BATCH_SIZE`` examples.
    """
    start_id = targ_lang.word_index["<start>"]
    target_len = targ.shape[1]
    total = 0

    with tf.GradientTape() as tape:
        enc_output, enc_state = encoder(inp, enc_hidden)
        # The frozen encoder embedding is overwritten with the pretrained
        # matrix on every call, after the encoder forward pass above.
        encoder.get_layer("embedding_layer_encoder").set_weights(
            [embedding_matrix]
        )
        dec_state = enc_state

        dec_input = tf.expand_dims([start_id] * BATCH_SIZE, 1)

        for step in range(1, target_len):
            predictions, dec_state, _ = decoder(
                dec_input, dec_state, enc_output
            )

            total += loss_function(targ[:, step], predictions)

            # Teacher forcing: the true token becomes the next input.
            dec_input = tf.expand_dims(targ[:, step], 1)

    batch_loss = total / int(target_len)

    trainable = encoder.trainable_variables + decoder.trainable_variables

    grads = tape.gradient(total, trainable)

    optimizer.apply_gradients(zip(grads, trainable))

    return batch_loss


EPOCHS = 100

for epoch in range(EPOCHS):
    start = time.time()

    # Each epoch starts with a zero encoder state and a fresh loss total.
    total_loss = 0
    enc_hidden = encoder.initialize_hidden_state()

    for batch, (inp, targ) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        # Progress line every 100th batch.
        if not batch % 100:
            print(
                f"Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}"
            )

    # Save a checkpoint after every second epoch (2nd, 4th, ...).
    if epoch % 2 == 1:
        checkpoint.save(file_prefix=checkpoint_prefix)

    print(f"Epoch {epoch+1} Loss {total_loss/steps_per_epoch:.4f}")
    print(f"Time taken for 1 epoch {time.time()-start:.2f} sec\n")


def evaluate(sentence):
    """
    Translate one sentence with the trained encoder/decoder (greedy decoding).

    Args:
        sentence (str): The raw input sentence.

    Returns:
        tuple: A tuple containing:
            - str: The generated words, each followed by a single space
              (including "<end>" when it was produced).
            - np.ndarray: Attention weights, one row per decoding step, with
              shape (max_length_targ, max_length_inp).

    The sentence is cleaned with ``preprocess`` and mapped to input
    vocabulary ids. Words missing from the input vocabulary are skipped
    instead of raising ``KeyError``. Decoding starts from "<start>", feeds
    each predicted id back in, and stops at "<end>" or after
    ``max_length_targ`` steps.

    Example:
    >>> input_sentence = "Hello, how are you?"
    >>> result, attention = evaluate(input_sentence)
    """
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess(sentence)

    # Drop tokens the input tokenizer never saw; a direct lookup would raise
    # KeyError on any out-of-vocabulary (or empty) token.
    vocab = inp_lang.word_index
    inputs = [vocab[word] for word in sentence.split(" ") if word in vocab]
    # Pad to the same length the training inputs were padded to.
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_inp, padding="post"
    )
    inputs = tf.convert_to_tensor(inputs)

    words = []

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index["<start>"]], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(
            dec_input, dec_hidden, enc_out
        )
        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1,))
        attention_plot[t] = attention_weights.numpy()
        predicted_id = tf.argmax(predictions[0]).numpy()

        word = targ_lang.index_word[predicted_id]
        words.append(word)

        if word == "<end>":
            break

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    # Every word is followed by one space, matching the original output.
    result = "".join(word + " " for word in words)
    return result, attention_plot


# Translate a handful of sample sentences and print each result.
for input_sentence in (
    "please ensure that you use the appropriate form ",
    "and do something with it to change the world ",
    # OpenInAPP sentences output
    "So even if its a big video I will clearly mention all the products ",
    "I was waiting for my bag ",
    "definitely share your feedback in the comment section ",
):
    print("Input sentence in english : ", input_sentence)
    predicted_output, attention_plot = evaluate(input_sentence)
    print("Predicted sentence in hindi : ", predicted_output)