
English to Hindi Transliteration Tutorial

Setup Environment

  • On Google Colab, make sure you select a Python 3 / GPU runtime before running the code

[Screenshots: choosing the Python 3 + GPU/CPU runtime]

GPU

%env CUDA_VISIBLE_DEVICES=0
env: CUDA_VISIBLE_DEVICES=0

Download Data

!mkdir data
!wget -N "https://raw.githubusercontent.com/bsantraigi/tensorflow-seq2seq-hindi/master/data/Hindi%20-%20Word%20Transliteration%20Pairs%201.txt" -P data/
mkdir: cannot create directory ‘data’: File exists
--2019-08-10 20:10:10--  https://raw.githubusercontent.com/bsantraigi/tensorflow-seq2seq-hindi/master/data/Hindi%20-%20Word%20Transliteration%20Pairs%201.txt
Connecting to 172.16.2.30:8080... connected.
Proxy request sent, awaiting response... 200 OK
Length: 773211 (755K) [text/plain]
Last-modified header missing -- time-stamps turned off.
--2019-08-10 20:10:11--  https://raw.githubusercontent.com/bsantraigi/tensorflow-seq2seq-hindi/master/data/Hindi%20-%20Word%20Transliteration%20Pairs%201.txt
Connecting to 172.16.2.30:8080... connected.
Proxy request sent, awaiting response... 200 OK
Length: 773211 (755K) [text/plain]
Saving to: ‘data/Hindi - Word Transliteration Pairs 1.txt’

100%[======================================>] 7,73,211     714KB/s   in 1.1s   

2019-08-10 20:10:13 (714 KB/s) - ‘data/Hindi - Word Transliteration Pairs 1.txt’ saved [773211/773211]

Import Stuff

import nltk
from collections import Counter
from tqdm import tqdm_notebook
import numpy as np
import tensorflow as tf
from tensorflow.contrib import seq2seq
from tensorflow.contrib.rnn import DropoutWrapper
import random
nltk.download('punkt')
[nltk_data] Downloading package punkt to /home/bishal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
True

Global Parameters

MAX_SEQ_LEN = 20
BATCH_SIZE = 64

Language Vocabulary

  • A Lang object holds a vocabulary of characters (i.e. an alphabet) and maps between characters and integer ids
class Lang:
    def __init__(self, counter, vocab_size):
        self.word2id = {}
        self.id2word = {}
        self.pad = "<PAD>"
        self.sos = "<SOS>"
        self.eos = "<EOS>"
        self.unk = "<UNK>"
        
        self.ipad = 0
        self.isos = 1
        self.ieos = 2
        self.iunk = 3
        
        self.word2id[self.pad] = 0
        self.word2id[self.sos] = 1
        self.word2id[self.eos] = 2
        self.word2id[self.unk] = 3
        
        self.id2word[0] = self.pad
        self.id2word[1] = self.sos
        self.id2word[2] = self.eos
        self.id2word[3] = self.unk
        
        curr_id = 4
        for w, c in counter.most_common(vocab_size):
            self.word2id[w] = curr_id
            self.id2word[curr_id] = w
            curr_id += 1
            
    def encodeSentence(self, s, max_len=-1):
        wseq = s.lower().strip()
        if max_len == -1:
            return [self.word2id[w] if w in self.word2id else self.iunk for w in wseq]
        else:
            return ([self.word2id[w] if w in self.word2id else self.iunk for w in wseq] + [self.ieos] + [self.ipad]*max_len)[:max_len]
        
    def encodeSentence2(self, s, max_len=-1):
        # Like encodeSentence, but also returns the sequence length (capped at max_len)
        wseq = s.lower().strip()
        return min(max_len, len(wseq)+1), \
            ([self.word2id[w] if w in self.word2id else self.iunk for w in wseq] + \
                [self.ieos] + [self.ipad]*max_len)[:max_len]
    
    def decodeSentence(self, id_seq):
        id_seq = np.array(id_seq + [self.ieos])
        j = np.argmax(id_seq==self.ieos)
        s = ''.join([self.id2word[x] for x in id_seq[:j]])
        s = s.replace(self.unk, "UNK")
        return s
# Total number of samples to read
N = 30823

Reading the data files

  • Each line contains a Hindi word in both the Roman (English) and Devanagari scripts, separated by a tab (see the quick peek below)
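To see the raw format for yourself, you can peek at the first few lines (a quick sketch; it assumes the download cell above has already run):
# Each line is "<roman word>\t<devanagari word>"
with open("data/Hindi - Word Transliteration Pairs 1.txt") as f:
    for _ in range(3):
        print(repr(f.readline().rstrip("\n")))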
hi_counter = Counter()
hi_sentences=[]
en_counter = Counter()
en_sentences=[]
with open("data/Hindi - Word Transliteration Pairs 1.txt") as f:
    for line in tqdm_notebook(f, total=N, desc="Reading file:"):
        en, hi = line.strip().split("\t")
        hi_sentences.append(hi)
        en_sentences.append(en)
    for line in tqdm_notebook(hi_sentences, desc="Processing inputs:"):
        for w in line.strip():
            hi_counter[w] += 1
    for line in tqdm_notebook(en_sentences, desc="Processing inputs:"):
        for w in line.strip():
            en_counter[w] += 1
# A few sample Hindi characters
print("Most common hi characters in dataset:\n", hi_counter.most_common(5))

print("\nTotal (hi)characters gathered from dataset:",len(hi_counter))

# A few sample English characters
print("\nMost common en characters in dataset:\n", en_counter.most_common(5))

print("\nTotal (en)characters gathered from dataset:", len(en_counter))
Most common hi characters in dataset:
 [('ा', 21123), ('र', 9205), ('े', 8100), ('न', 7225), ('ी', 6546)]

Total (hi)characters gathered from dataset: 66

Most common en characters in dataset:
 [('a', 57220), ('n', 15015), ('i', 14015), ('h', 13805), ('e', 12264)]

Total (en)characters gathered from dataset: 27
en_lang = Lang(en_counter, len(en_counter))
hi_lang = Lang(hi_counter, len(hi_counter))
print("Test en encoding:", en_lang.encodeSentence("Shukriya"))

print("Test en decoding:", en_lang.decodeSentence(en_lang.encodeSentence("Shukriya", 10)))

print("Test hindi encoding:", hi_lang.encodeSentence("शुक्रिया", 10))

print("Test hindi decoding:", hi_lang.decodeSentence((hi_lang.encodeSentence("शुक्रिया", 10))))
Test en encoding: [15, 7, 10, 13, 9, 6, 20, 4]
Test en decoding: shukriya
Test hindi encoding: [35, 19, 15, 22, 5, 12, 21, 4, 2, 0]
Test hindi decoding: शुक्रिया
VE = len(en_lang.word2id)
VH = len(hi_lang.word2id)
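Quick sanity check: each vocabulary is the character count reported above plus the four special symbols.
print("VE =", VE, "| VH =", VH) # expect 27 + 4 = 31 and 66 + 4 = 70 for this dataset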

The Seq2Seq architecture

  • We will implement a seq2seq architecture for transliteration in TensorFlow r1.13.1 / r1.14
  • Debugging tip: always keep track of tensor dimensions!
  • TensorFlow computation graph - We first build a computation graph, which is the representation TensorFlow uses for any neural network architecture. Once the graph is built, we can feed data into it for training or inference (see the short sketch below).
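A minimal sketch of the build-then-run workflow (a toy example, unrelated to the model; it only adds a couple of throwaway nodes to the default graph):
x = tf.placeholder(tf.float32, (None,)) # build: declare an input slot
y = x * 2.0                             # build: an op that depends on it
with tf.Session() as s:
    print(s.run(y, feed_dict={x: [1.0, 2.0, 3.0]})) # run: feed data -> [2. 4. 6.]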

Character Embedding Matrix

en_word_emb_matrix = tf.get_variable("en_word_emb_matrix", (VE, 300), dtype=tf.float32)
hi_word_emb_matrix = tf.get_variable("hi_word_emb_matrix", (VH, 300), dtype=tf.float32)
WARNING:tensorflow:From /home/bishal/miniconda3/envs/tf_gpu/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
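For intuition, tf.nn.embedding_lookup (used a few cells below) simply gathers rows of such a matrix by id; a NumPy analogue with made-up numbers:
toy_emb = np.arange(12, dtype=np.float32).reshape(4, 3) # 4 "characters", 3-dim embeddings
print(toy_emb[np.array([2, 0, 3])]) # row-indexing, which is what embedding_lookup does for each id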

Placeholders

  • Input to a TensorFlow graph is supplied through placeholders; at run time we bind them to actual arrays via feed_dict
keep_prob = tf.placeholder(tf.float32)

input_ids = tf.placeholder(tf.int32, (None, MAX_SEQ_LEN))
input_lens = tf.placeholder(tf.int32, (None, ))

ph_target_ids = tf.placeholder(tf.int32, (None, MAX_SEQ_LEN))
target_lens = tf.placeholder(tf.int32, (None, ))
# Prepend the <SOS> (a.k.a. GO) symbol to every target sequence
target_ids = tf.concat([tf.fill([BATCH_SIZE,1], hi_lang.isos), ph_target_ids], -1)
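With made-up ids, the effect of this concat is just to prepend the <SOS> id (1) to every row; the decoder then reads target_ids[:, :-1] as input and is trained to predict target_ids[:, 1:]:
row = np.array([[5, 9, 2, 0]]) # illustrative target ids: two characters, <EOS>, <PAD>
print(np.concatenate([np.full((1, 1), hi_lang.isos), row], -1)) # -> [[1 5 9 2 0]]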

Building the computation graph

input_emb = tf.nn.embedding_lookup(en_word_emb_matrix, input_ids)
target_emb = tf.nn.embedding_lookup(hi_word_emb_matrix, target_ids[:, :-1])
input_emb.shape
TensorShape([Dimension(None), Dimension(20), Dimension(300)])

Encoder - RNN based sequence encoder

encoder_cell = tf.nn.rnn_cell.GRUCell(128) # 128 is the dimension of hidden state
encoder_cell = DropoutWrapper(encoder_cell, output_keep_prob=keep_prob) # Adding Dropout for regularization
WARNING:tensorflow:From <ipython-input-18-73573fbda69f>:1: GRUCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
enc_outputs, enc_state = tf.nn.dynamic_rnn(
    encoder_cell, # The encoder GRU cell
    input_emb, # Embedded input sequence
    sequence_length=input_lens, # Sequence lengths of individual inputs in a batch
    initial_state=encoder_cell.zero_state(BATCH_SIZE, dtype=tf.float32)
)
WARNING:tensorflow:From <ipython-input-19-1e698e82f454>:5: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
WARNING:tensorflow:From /home/bishal/miniconda3/envs/tf_gpu/lib/python3.6/site-packages/tensorflow/python/ops/rnn.py:626: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
WARNING:tensorflow:From /home/bishal/miniconda3/envs/tf_gpu/lib/python3.6/site-packages/tensorflow/python/ops/rnn_cell_impl.py:1259: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
# Confirm the shape of the final hidden state
enc_state.shape
TensorShape([Dimension(64), Dimension(128)])
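For completeness, the per-timestep encoder outputs keep the time axis as well, shape (batch, time, hidden); depending on how much static shape TensorFlow can infer, you should see something like (?, 20, 128):
enc_outputs.shape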

Decoder

decoder_cell = tf.nn.rnn_cell.GRUCell(128)
decoder_cell = DropoutWrapper(decoder_cell, output_keep_prob=keep_prob)

Decoder to Output Vocab Projection Layer

output_projection = tf.layers.Dense(len(hi_lang.word2id)) # project decoder outputs to logits over the Hindi character vocabulary

Decoder Training Helper

helper = seq2seq.TrainingHelper(target_emb, target_lens) # teacher forcing: feed the gold previous character at each step
decoder = seq2seq.BasicDecoder(decoder_cell, helper, enc_state, output_projection)
outputs, _, outputs_lens = seq2seq.dynamic_decode(decoder, maximum_iterations=MAX_SEQ_LEN, 
                                                  impute_finished=False, swap_memory=True)
output_max_len = tf.reduce_max(outputs_lens) # longest decoded sequence in this batch

And Decoder Inference Helper

# The same decoder_cell (with its DropoutWrapper) is reused here; dropout is effectively off at inference because we feed keep_prob=1.0.
infer_helper = seq2seq.GreedyEmbeddingHelper(hi_word_emb_matrix, tf.fill([BATCH_SIZE, ], hi_lang.isos), hi_lang.ieos)
infer_decoder = seq2seq.BasicDecoder(decoder_cell, infer_helper, enc_state, output_projection)
infer_output = seq2seq.dynamic_decode(infer_decoder, maximum_iterations=MAX_SEQ_LEN, swap_memory=True)
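Conceptually, GreedyEmbeddingHelper runs the loop below: start from <SOS>, take the argmax character at every step, feed it back in, and stop at <EOS> or at the length cap. This is only a pure-Python sketch with a hypothetical step function, not the library's actual code:
def toy_greedy_decode(step, sos_id, eos_id, max_len=MAX_SEQ_LEN):
    # step(prev_id, state) -> (logits, new_state) is a stand-in for one decoder cell step
    ids, state, prev = [], None, sos_id
    for _ in range(max_len):
        logits, state = step(prev, state)
        prev = int(np.argmax(logits))
        if prev == eos_id:
            break
        ids.append(prev)
    return ids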

Loss and Optimizers

# Sequence mask:
# so that padded positions beyond each target's actual length don't contribute to the loss
masks = tf.sequence_mask(target_lens, output_max_len, dtype=tf.float32, name='masks')

# Loss function - weighted softmax cross entropy
cost = seq2seq.sequence_loss(
    outputs[0],
    target_ids[:, 1:(output_max_len + 1)],
    masks)

# Optimizer
optimizer = tf.train.AdamOptimizer(0.0001)
train_op = optimizer.minimize(cost)
init = tf.global_variables_initializer()
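To see what the mask does: for target lengths [3, 5] and a maximum decoded length of 5, tf.sequence_mask yields a 0/1 weight per position (evaluated here in a throwaway session):
with tf.Session() as s:
    print(s.run(tf.sequence_mask([3, 5], 5, dtype=tf.float32)))
# [[1. 1. 1. 0. 0.]
#  [1. 1. 1. 1. 1.]]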

Tensorflow Sessions

sess_config = tf.ConfigProto()
sess_config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=sess_config)
sess.run(init)

Minibatch Training + Validation

  • Performance evaluation using character-level BLEU scores (a small illustration follows)
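Because our "sentences" are single words, BLEU here is computed over character n-grams; a quick illustration with a made-up prediction that drops one character of the reference:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
sm = SmoothingFunction().method7
print(sentence_bleu(["शुक्रिया"], "शुकरिया", weights=[1/4]*4, smoothing_function=sm)) # < 1.0; strings are treated as character sequences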
random.seed(41)
parallel = list(zip(en_sentences, hi_sentences))
random.shuffle(parallel)
parallel[1000]
('hazaarii', 'हज़ारी')
train_n = int(0.95*N)
valid_n = N - train_n
train_pairs = parallel[:train_n].copy()
valid_pairs = parallel[train_n:]
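Quick check of the split sizes (0.95 · 30823 rounds down to 29281 training pairs, leaving 1542 for validation):
print("train:", train_n, "valid:", valid_n) # train: 29281 valid: 1542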
def small_test():
    all_bleu = []
    smoothing = nltk.translate.bleu_score.SmoothingFunction().method7
    for m in range(0, valid_n, BATCH_SIZE):
        # print(f"Status: {m}/{N}", end='\r')
        n = m + BATCH_SIZE
        if n > valid_n:
            # print("Epoch Complete...")
            break

        input_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
        input_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
        for i in range(m, n):
            b,a = en_lang.encodeSentence2(valid_pairs[i][0], MAX_SEQ_LEN)
            input_batch[i-m,:] = a
            input_lens_batch[i-m] = b

    #     target_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
    #     target_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
    #     for i in range(m, n):
    #         b,a = hi_lang.encodeSentence2(valid_pairs[i][1], MAX_SEQ_LEN)
    #         target_batch[i-m,:] = a
    #         target_lens_batch[i-m] = b

        feed_dict={
            input_ids: input_batch,
            input_lens: input_lens_batch,
            #target_ids: target_batch,
            #target_lens: target_lens_batch,
            keep_prob: 1.0
        }
        pred_batch = sess.run(infer_output[0].sample_id, feed_dict=feed_dict)
        for k, pred_ in enumerate(pred_batch):
            pred_s = hi_lang.decodeSentence(list(pred_))
            ref = valid_pairs[m+k][1]
            try:
                _bx = nltk.translate.bleu_score.sentence_bleu(
                    [ref],
                    pred_s,
                    weights=[1/4]*4,
                    smoothing_function=smoothing)
            except ZeroDivisionError:
                _bx = 0
            all_bleu.append(_bx)

    print(f"BLEU Score: {np.mean(all_bleu)}")
for _e in range(20):
    # Mix things up a bit.
    random.shuffle(train_pairs)
    pbar = tqdm_notebook(range(0, train_n, BATCH_SIZE))
    batch_loss = 0
    bxi = 0
    for m in pbar:
        n = m + BATCH_SIZE
        if n <= train_n:
            # print("Epoch Complete... \n")

            input_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
            input_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
            for i in range(m, n):
                b,a = en_lang.encodeSentence2(train_pairs[i][0], MAX_SEQ_LEN)
                input_batch[i-m,:] = a
                input_lens_batch[i-m] = b

            target_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
            target_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
            for i in range(m, n):
                b,a = hi_lang.encodeSentence2(train_pairs[i][1], MAX_SEQ_LEN)
                target_batch[i-m,:] = a
                target_lens_batch[i-m] = b

            feed_dict={
                input_ids: input_batch,
                input_lens: input_lens_batch,
                ph_target_ids: target_batch,
                target_lens: target_lens_batch,
                keep_prob: 0.8 
            }
            sess.run(train_op, feed_dict=feed_dict)
            batch_loss += sess.run(cost, feed_dict=feed_dict)
            pbar.set_description(f"Epoch: {_e} >> Loss: {batch_loss/(bxi+1):2.2F}:")
            bxi += 1
            if (1 + n//BATCH_SIZE) % 100 == 0:
                small_test()
BLEU Score: 0.610158884519212
BLEU Score: 0.6146452675497532
BLEU Score: 0.6058223461268137
BLEU Score: 0.6131855443159836

BLEU Score: 0.6182856361594138
BLEU Score: 0.620302774263697
BLEU Score: 0.6255745267126785
BLEU Score: 0.6288471674183466

Let’s see some real transliteration examples now!

def transliterate(s):
    input_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
    input_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
    b,a = en_lang.encodeSentence2(s, MAX_SEQ_LEN)
    input_batch[0, :] = a
    input_lens_batch[0] = b
    
    feed_dict={
        input_ids: input_batch,
        input_lens: input_lens_batch,
        #target_ids: target_batch,
        #target_lens: target_lens_batch,
        keep_prob: 1.0
    }
    pred_batch = sess.run(infer_output[0].sample_id, feed_dict=feed_dict)
    pred_ = pred_batch[0]
    pred_s = hi_lang.decodeSentence(list(pred_))
    # ref = valid_pairs[m+k][1]
    return pred_s
transliterate("saya")
'साया'
transliterate("raama")
'रामा'
transliterate("shahar")
'शाहर'