Deep Sentiment Analysis Tutorial
Setup Environment
- On Google Colab, make sure you select a Python 3 runtime (GPU or CPU) before running the code.
%env CUDA_VISIBLE_DEVICES=0
env: CUDA_VISIBLE_DEVICES=0
Download Data
![ ! -d data ] && mkdir data/
![ -f data/aclImdb_v1.tar.gz ] && echo "Skip Download"
![ ! -f data/aclImdb_v1.tar.gz ] && wget -N https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz -P data/
Skip Download
%%time
![ -d data/aclImdb/ ] && echo "Data already extracted"
![ ! -d data/aclImdb/ ] && tar -xzf data/aclImdb_v1.tar.gz -C data/
Data already extracted
CPU times: user 6.34 ms, sys: 4.91 ms, total: 11.2 ms
Wall time: 224 ms
Imports
import os
import re
import nltk
from collections import Counter
from tqdm import tqdm_notebook
import numpy as np
import tensorflow as tf
from tensorflow.contrib import seq2seq
from tensorflow.contrib.rnn import DropoutWrapper
import random
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
nltk.download('punkt')
[nltk_data] Downloading package punkt to /home/bishal/nltk_data...
[nltk_data] Package punkt is already up-to-date!
True
MAX_SEQ_LEN = 50   # reviews are truncated / padded to this many tokens
BATCH_SIZE = 64    # minibatch size
class Lang:
def __init__(self, counter, vocab_size):
self.word2id = {}
self.id2word = {}
self.pad = "<PAD>"
self.sos = "<SOS>"
self.eos = "<EOS>"
self.unk = "<UNK>"
self.ipad = 0
self.isos = 1
self.ieos = 2
self.iunk = 3
self.word2id[self.pad] = 0
self.word2id[self.sos] = 1
self.word2id[self.eos] = 2
self.word2id[self.unk] = 3
self.id2word[0] = self.pad
self.id2word[1] = self.sos
self.id2word[2] = self.eos
self.id2word[3] = self.unk
curr_id = 4
for w, c in counter.most_common(vocab_size-curr_id):
self.word2id[w] = curr_id
self.id2word[curr_id] = w
curr_id += 1
def encodeSentence(self, wseq, max_len=-1):
# wseq = nltk.tokenize.word_tokenize(s.lower().strip())
if max_len == -1:
return [self.word2id[w] if w in self.word2id else self.iunk for w in wseq]
else:
return ([self.word2id[w] if w in self.word2id else self.iunk for w in wseq] + [self.ieos] + [self.ipad]*max_len)[:max_len]
def encodeSentence2(self, wseq, max_len=-1):
# wseq = nltk.tokenize.word_tokenize(s.lower().strip())
return min(max_len, len(wseq)+1), \
([self.word2id[w] if w in self.word2id else self.iunk for w in wseq] + \
[self.ieos] + [self.ipad]*max_len)[:max_len]
def decodeSentence(self, id_seq):
id_seq = np.array(id_seq + [self.ieos])
j = np.argmax(id_seq==self.ieos)
s = ' '.join([self.id2word[x] for x in id_seq[:j]])
s = s.replace(self.unk, "UNK")
return s
Let’s read in the data
data_folder = 'data/aclImdb/'
rp = os.path.join(data_folder, 'train/pos')
train_positive = [os.path.join(rp, f) for f in os.listdir(rp)]
rp = os.path.join(data_folder, 'train/neg')
train_negative = [os.path.join(rp, f) for f in os.listdir(rp)]
rp = os.path.join(data_folder, 'test/pos')
test_positive = [os.path.join(rp, f) for f in os.listdir(rp)]
rp = os.path.join(data_folder, 'test/neg')
test_negative = [os.path.join(rp, f) for f in os.listdir(rp)]
Limit number of samples
To quickly train a small model, consider setting n_train and n_test to relatively small numbers, e.g. 1000. To use all available samples, set them to None or to any number at least as large as the dataset (as n_train is below); note that -1 would silently drop the last file because of Python slicing.
n_train = 100000
n_test = 2500
re_html_cleaner = re.compile(r"<.*?>")
en_counter = Counter()
train_data = []
for _fname in tqdm_notebook(train_positive[:n_train], desc="Crunching +ve samples: "):
with open(_fname) as f:
text = f.read().strip()
text = re_html_cleaner.sub(" ", text)
wseq = nltk.tokenize.word_tokenize(text.lower())
en_counter += Counter(wseq)
train_data.append((wseq, 1))
for _fname in tqdm_notebook(train_negative[:n_train], desc="Crunching -ve samples: "):
with open(_fname) as f:
text = f.read().strip()
text = re_html_cleaner.sub(" ", text)
wseq = nltk.tokenize.word_tokenize(text.lower())
en_counter += Counter(wseq)
train_data.append((wseq, 0))
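The raw reviews contain HTML line-break tags such as <br />, which re_html_cleaner replaces with a space. A quick sanity check on a made-up snippet:
# Sanity check of the HTML-stripping regex (illustrative string, not from the dataset)
print(re_html_cleaner.sub(" ", "Great movie!<br /><br />A must watch."))
# -> Great movie!  A must watch.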
test_data = []
for _fname in tqdm_notebook(test_positive[:n_test], desc="Crunching +ve samples: "):
with open(_fname) as f:
text = f.read().strip()
text = re_html_cleaner.sub(" ", text)
wseq = nltk.tokenize.word_tokenize(text.lower())
test_data.append((wseq, 1))
for _fname in tqdm_notebook(test_negative[:n_test], desc="Crunching -ve samples: "):
with open(_fname) as f:
text = f.read().strip()
text = re_html_cleaner.sub(" ", text)
wseq = nltk.tokenize.word_tokenize(text.lower())
test_data.append((wseq, 0))
# A few sample english words
print("\nMost common en words in dataset:\n", en_counter.most_common(10))
print("\nTotal (en)words gathered from dataset:", len(en_counter))
Most common en words in dataset:
[('the', 334752), (',', 275881), ('.', 271448), ('and', 163327), ('a', 162162), ('of', 145428), ('to', 135195), ('is', 110396), ('it', 95772), ('in', 93249)]
Total (en)words gathered from dataset: 105920
V = 10000
en_lang = Lang(en_counter, V)
wseq = nltk.tokenize.word_tokenize("Where are you going?".lower())
print("Test en encoding:", en_lang.encodeSentence(wseq))
print("Test en decoding:", en_lang.decodeSentence(en_lang.encodeSentence(wseq, 10)))
Test en encoding: [131, 33, 27, 182, 58]
Test en decoding: where are you going ?
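encodeSentence2, which the training loop below uses, additionally returns the clipped sequence length (including the <EOS> token); that length is what we will later feed to dynamic_rnn as sequence_length. A quick check:
# encodeSentence2 returns (length, padded id sequence); here length is 6 (5 tokens + <EOS>)
length, ids = en_lang.encodeSentence2(wseq, 10)
print("Length (incl. <EOS>):", length)
print("Padded ids:", ids)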
The RNN-based Sentence Classifier architecture
- We will implement an RNN-based classifier architecture for sentiment analysis in Tensorflow r1.13.1 / r1.14
- Debugging tip: always keep track of tensor dimensions!
- Tensorflow Computation Graph - we first build a tf computation graph, the representation tf uses for any neural network architecture. Once the computation graph is built, you can feed data to it for training or inference (a toy illustration of this build-then-feed pattern follows this list)
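As a toy illustration of the build-then-feed pattern (kept in its own throwaway graph so it does not interfere with the model graph we build next):
# Toy example: build a tiny symbolic graph, then feed concrete data through a session
with tf.Graph().as_default():
    x = tf.placeholder(tf.float32, (None,))   # symbolic input, no data yet
    y = 2.0 * x + 1.0                         # symbolic computation, nothing runs here
    with tf.Session() as toy_sess:
        print(toy_sess.run(y, feed_dict={x: [1.0, 2.0, 3.0]}))   # prints [3. 5. 7.]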
Word Embedding Matrix
en_word_emb_matrix = tf.get_variable("en_word_emb_matrix", (V, 300), dtype=tf.float32)
WARNING:tensorflow:From /home/bishal/miniconda3/envs/tf_gpu/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
Placeholders
keep_prob = tf.placeholder(tf.float32)                      # dropout keep probability
input_ids = tf.placeholder(tf.int32, (None, MAX_SEQ_LEN))   # padded token ids, [batch, time]
input_lens = tf.placeholder(tf.int32, (None, ))             # true length of each sequence
y_placeholder = tf.placeholder(tf.int32, (None,))           # binary sentiment labels (0/1)
Tensorflow Graphs
input_emb = tf.nn.embedding_lookup(en_word_emb_matrix, input_ids)
input_emb.shape
TensorShape([Dimension(None), Dimension(50), Dimension(300)])
Encoder
RNN Units
# Create a single GRU cell
encoder_cell = tf.nn.rnn_cell.GRUCell(128)
# Add dropout : Dropout is applied to the hidden state output at every time step
encoder_cell = DropoutWrapper(encoder_cell, output_keep_prob=keep_prob)
WARNING:tensorflow:From <ipython-input-25-090b5be7a571>:2: GRUCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
# Unrolling of time-sequence
# Apply the encoder cell on input sequence and unroll computation upto
# max sequence length
enc_outputs, enc_state = tf.nn.dynamic_rnn(
encoder_cell, input_emb, sequence_length=input_lens, initial_state=encoder_cell.zero_state(BATCH_SIZE, dtype=tf.float32)
)
WARNING:tensorflow:From <ipython-input-26-48ec6b10a5e9>:5: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
WARNING:tensorflow:From /home/bishal/miniconda3/envs/tf_gpu/lib/python3.6/site-packages/tensorflow/python/ops/rnn.py:626: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
WARNING:tensorflow:From /home/bishal/miniconda3/envs/tf_gpu/lib/python3.6/site-packages/tensorflow/python/ops/rnn_cell_impl.py:1259: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
enc_outputs.shape
TensorShape([Dimension(64), Dimension(50), Dimension(128)])
enc_state.shape
TensorShape([Dimension(64), Dimension(128)])
Classifier Layer
# A simple fully connected linear layer
# W^T*X + b
dense_layer = tf.layers.Dense(1)
Approaches:
- As input to the final linear layer, use the mean of the hidden states, or
- As input to the final linear layer, use the last hidden state.
Approach 1: Take the mean of enc_outputs across dimension 1
- IMPORTANT: we need to mask the positions in the input sentence that don't contain any input (a length-normalized variant is sketched after the commented-out code below)
# masks = tf.sequence_mask(input_lens, MAX_SEQ_LEN, dtype=tf.float32, name='masks')
# class_prob = tf.nn.sigmoid(
# dense_layer(
# tf.reduce_mean(
# enc_outputs*masks[:, :, None], 1)
# )
# )
# print(class_prob.shape)
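Note that tf.reduce_mean above averages over all MAX_SEQ_LEN positions, so shorter sentences get diluted by the masked-out zeros. A length-normalized variant (a sketch, also left commented out since Approach 2 is used below) divides the masked sum by the true lengths instead:
# masks = tf.sequence_mask(input_lens, MAX_SEQ_LEN, dtype=tf.float32, name='masks')
# summed = tf.reduce_sum(enc_outputs*masks[:, :, None], 1)            # [batch, units]
# mean_state = summed / tf.cast(input_lens, tf.float32)[:, None]      # divide by true lengths
# class_prob = tf.nn.sigmoid(dense_layer(mean_state))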
Approach 2: Use enc_state (the final hidden state)
class_prob = tf.nn.sigmoid(dense_layer(enc_state))
print(class_prob.shape)
(64, 1)
Loss and Optimizer [binary cross-entropy]
Note (from the softmax_cross_entropy docs): onehot_labels and logits must have the same shape, e.g. [batch_size, num_classes]. The same idea applies to our binary cross-entropy: the labels must be reshaped to line up with class_prob, as the shape check below shows.
print(y_placeholder.shape)
print(class_prob.shape)
(?,)
(64, 1)
# Loss function: binary cross-entropy on the sigmoid output
y_ = tf.cast(y_placeholder[:, None], dtype=tf.float32)   # reshape labels to [batch, 1]
cost = -y_*tf.log(class_prob + 1e-12) - (1-y_)*tf.log(1-class_prob + 1e-12)
cost = tf.reduce_mean(cost)
# Optimizer
optimizer = tf.train.AdamOptimizer(0.001)
train_op = optimizer.minimize(cost)
init = tf.global_variables_initializer()
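An equivalent but numerically stabler option (a sketch, not used by train_op above) is to let Tensorflow compute the cross-entropy from the raw, pre-sigmoid logits:
# Sketch: same binary cross-entropy, computed from the raw dense-layer output (logits)
logits = dense_layer(enc_state)   # reuses the same weights that produce class_prob
stable_cost = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=y_, logits=logits))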
Tensorflow Sessions
sess_config = tf.ConfigProto()
sess_config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=sess_config)
sess.run(init)
Minibatch Training
random.seed(41)
random.shuffle(train_data)
train_n = len(train_data)
test_n = len(test_data)
def small_test():
all_true = []
all_preds = []
for m in range(0, test_n, BATCH_SIZE):
n = m + BATCH_SIZE
if n > test_n:
break
input_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
input_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
true_class_batch = np.zeros((BATCH_SIZE))
for i in range(m, n):
b,a = en_lang.encodeSentence2(test_data[i][0], MAX_SEQ_LEN)
input_batch[i-m,:] = a
input_lens_batch[i-m] = b
true_class_batch[i-m] = test_data[i][1]
feed_dict={
input_ids: input_batch,
input_lens: input_lens_batch,
keep_prob: 1.0
}
pred_batch = sess.run(class_prob, feed_dict=feed_dict)
# acc = accuracy_score(true_class_batch, pred_batch > 0.5)
all_true.extend(list(true_class_batch))
all_preds.extend(list(pred_batch[:,0]))
all_true = np.array(all_true)
all_preds = np.array(all_preds)
prec = precision_score(all_true, all_preds > 0.5)*100
rec = recall_score(all_true, all_preds > 0.5)*100
f1 = f1_score(all_true, all_preds > 0.5)*100
print(f"Precision: {prec:2.2F}, Recall: {rec:2.2F}, F1-Score: {f1:2.2F}")
for _e in range(5):
# Mix things up a bit.
random.shuffle(train_data)
pbar = tqdm_notebook(range(0, train_n, BATCH_SIZE))
batch_loss = 0
bxi = 0
for m in pbar:
n = m + BATCH_SIZE
if n <= train_n:
# print("Epoch Complete... \n")
input_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
input_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
true_class_batch = np.zeros((BATCH_SIZE))
for i in range(m, n):
b,a = en_lang.encodeSentence2(train_data[i][0], MAX_SEQ_LEN)
input_batch[i-m,:] = a
input_lens_batch[i-m] = b
true_class_batch[i-m] = train_data[i][1]
feed_dict={
input_ids: input_batch,
input_lens: input_lens_batch,
y_placeholder: true_class_batch,
keep_prob: 0.6
}
sess.run(train_op, feed_dict=feed_dict)
batch_loss += sess.run(cost, feed_dict=feed_dict)
pbar.set_description(f"Epoch: {_e} >> Loss: {batch_loss/(bxi+1):2.2F}:")
bxi += 1
if (1 + n//BATCH_SIZE) % 10 == 0:
small_test()
Precision: 73.85, Recall: 80.08, F1-Score: 76.84
Precision: 76.18, Recall: 74.32, F1-Score: 75.24
Precision: 77.32, Recall: 70.76, F1-Score: 73.89
Precision: 76.12, Recall: 75.12, F1-Score: 75.62
Precision: 74.19, Recall: 78.76, F1-Score: 76.41
Precision: 76.54, Recall: 72.04, F1-Score: 74.22
Precision: 78.19, Recall: 67.40, F1-Score: 72.40
Precision: 71.75, Recall: 80.68, F1-Score: 75.96
Precision: 75.86, Recall: 73.28, F1-Score: 74.55
Precision: 71.11, Recall: 81.64, F1-Score: 76.01
Precision: 77.24, Recall: 68.28, F1-Score: 72.48
Precision: 72.17, Recall: 79.16, F1-Score: 75.51
Precision: 73.09, Recall: 77.68, F1-Score: 75.32
Precision: 75.79, Recall: 72.52, F1-Score: 74.12
Precision: 73.05, Recall: 78.92, F1-Score: 75.87
Precision: 77.82, Recall: 69.88, F1-Score: 73.64
Precision: 77.57, Recall: 69.32, F1-Score: 73.22
Precision: 75.01, Recall: 75.64, F1-Score: 75.32
Precision: 73.56, Recall: 79.00, F1-Score: 76.18
Precision: 79.96, Recall: 65.92, F1-Score: 72.26
Precision: 75.16, Recall: 76.00, F1-Score: 75.58
Precision: 75.87, Recall: 70.16, F1-Score: 72.90
Precision: 75.62, Recall: 75.68, F1-Score: 75.65
Precision: 74.14, Recall: 76.60, F1-Score: 75.35
Precision: 75.21, Recall: 74.40, F1-Score: 74.80
Precision: 74.12, Recall: 75.72, F1-Score: 74.91
Precision: 78.93, Recall: 68.16, F1-Score: 73.15
Precision: 75.42, Recall: 76.36, F1-Score: 75.89
Precision: 78.60, Recall: 68.16, F1-Score: 73.01
Precision: 70.98, Recall: 83.64, F1-Score: 76.79
Precision: 76.70, Recall: 72.68, F1-Score: 74.64
Precision: 75.38, Recall: 76.56, F1-Score: 75.97
Precision: 77.42, Recall: 72.84, F1-Score: 75.06
Precision: 77.10, Recall: 74.32, F1-Score: 75.68
Precision: 72.71, Recall: 82.28, F1-Score: 77.20
Precision: 78.98, Recall: 72.00, F1-Score: 75.33
Precision: 76.05, Recall: 78.12, F1-Score: 77.07
Precision: 79.82, Recall: 69.60, F1-Score: 74.36
Precision: 74.51, Recall: 80.32, F1-Score: 77.31
Improving Further
- This was a very simple RNN-based model for the task.
- You can still improve it a lot by tweaking hyperparameters, e.g.
  - RNN (GRU) hidden size
  - dropout keep probability
  - learning rate
- or by modifying the architecture, e.g. (a rough sketch follows this list)
  - Add bidirectional RNNs
  - Use multiple layers of RNN cells
  - Add more hidden layers to the classifier
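For example, a bidirectional two-layer GRU encoder could replace the single dynamic_rnn call above (a rough sketch with assumed sizes; the graph would need to be rebuilt and re-initialized after wiring it in):
# Sketch: bidirectional, 2-layer GRU encoder (hidden sizes are placeholders)
def make_cell(num_units):
    cell = tf.nn.rnn_cell.GRUCell(num_units)
    return DropoutWrapper(cell, output_keep_prob=keep_prob)

fw_cell = tf.nn.rnn_cell.MultiRNNCell([make_cell(128) for _ in range(2)])
bw_cell = tf.nn.rnn_cell.MultiRNNCell([make_cell(128) for _ in range(2)])

bi_outputs, bi_states = tf.nn.bidirectional_dynamic_rnn(
    fw_cell, bw_cell, input_emb, sequence_length=input_lens, dtype=tf.float32)

# Concatenate the top-layer final states of the forward and backward RNNs
# and feed that to a freshly built dense classifier layer in place of enc_state
final_state = tf.concat([bi_states[0][-1], bi_states[1][-1]], axis=-1)   # [batch, 256]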