Trouble Training Same Tensorflow Model in PyTorch

Issue

I have trained a model in Tensorflow and am having trouble replicating it in PyTorch. The Tensorflow model achieves near 100% accuracy (the task is simple), but the PyTorch model performs at random. I’ve spent a while trying to figure this out, and can’t understand what the problem could be.

The model is trained for the task of binary classification. Given an input utterance describing a quadrant and an (x, y, z) coordinate, the model has to predict whether the (x, z) portion of the coordinate lies in the quadrant described. For example, if the input text was "quadrant 1" and the coordinate was (0.5, y, 0.5) — the y value is irrelevant here — then the prediction should be true, but if the region was "quadrant 2" with the same coordinate, then the prediction should be false.

I generated some data and trained the model in Tensorflow using this code:

# ---------------------------------------------------------------------------
# TensorFlow (v1-style) graph: encode the utterance with an LSTM, encode each
# scalar coordinate with a dense layer, sum the representations, classify.
# ---------------------------------------------------------------------------
# one scalar input per coordinate axis
x_data_placeholder = tf.placeholder(tf.float32, [FLAGS.batch_size, 1], name="x_data")
y_data_placeholder = tf.placeholder(tf.float32, [FLAGS.batch_size, 1], name="y_data")
z_data_placeholder = tf.placeholder(tf.float32, [FLAGS.batch_size, 1], name="z_data")
# text and labels placeholders
text_data = tf.placeholder(tf.int32, [FLAGS.batch_size, maxtextlength])
text_lengths = tf.placeholder(tf.int32, [FLAGS.batch_size])
y_labels_placeholder = tf.placeholder(tf.int64, [FLAGS.batch_size])

# encode text and coordinate
# BUG FIX: the original call was tf.random_uniform(..., -1, -1), i.e.
# minval == maxval == -1, which initializes every embedding entry to the same
# constant (-1). Use the intended [-1, 1) range.
embeddings = tf.Variable(tf.random_uniform([100, embedding_size], -1, 1))
rnn_inputs = tf.nn.embedding_lookup(embeddings, text_data)
rnn_layers = [tf.compat.v1.nn.rnn_cell.LSTMCell(size, initializer=tf.compat.v1.keras.initializers.glorot_normal) for size in [256]]
multi_rnn_cell = tf.compat.v1.nn.rnn_cell.MultiRNNCell(rnn_layers, state_is_tuple=True)

text_outputs, text_fstate = tf.compat.v1.nn.dynamic_rnn(cell=multi_rnn_cell,
                                             inputs=rnn_inputs,
                                             dtype=tf.float32, sequence_length=text_lengths)

# have fully connected layers to map the input coordinates into the same dimension as the LSTM output layer from above
x_output_layer = tf.compat.v1.layers.dense(x_data_placeholder, units=FLAGS.fc_column_size, activation=tf.nn.relu, name='x_coordinate')
y_output_layer = tf.compat.v1.layers.dense(y_data_placeholder, units=FLAGS.fc_column_size, activation=tf.nn.relu, name='y_coordinate')
z_output_layer = tf.compat.v1.layers.dense(z_data_placeholder, units=FLAGS.fc_column_size, activation=tf.nn.relu, name='z_coordinate')

# BUG FIX: `lstm_output_layer` was used below but never defined in the
# original snippet. Project the final LSTM hidden state to the shared width
# so the element-wise sum is well-formed.
# NOTE(review): confirm this projection against the full training script.
lstm_output_layer = tf.compat.v1.layers.dense(text_fstate[-1].h, units=FLAGS.fc_column_size, activation=tf.nn.relu, name='text_encoding')

# add the representations
total_output_layer = x_output_layer + y_output_layer + z_output_layer + lstm_output_layer

# make the predictions with two fully connected layers
fc_1 = tf.compat.v1.layers.dense(total_output_layer, units=FLAGS.hidden_layer_size, activation=tf.nn.relu, name='fc_1')
logits = tf.compat.v1.layers.dense(fc_1, units=FLAGS.output_dims, activation=None, name='logits')

# train the model: softmax cross-entropy over integer class labels, Adam,
# and global-norm gradient clipping
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_labels_placeholder, logits=logits))
# use the compat.v1 namespace consistently with the layers above
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=FLAGS.learning_rate, epsilon=1e-7)
gradients, variables = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, FLAGS.gradient_clip_threshold)
optimize = optimizer.apply_gradients(zip(gradients, variables))

# then it'll be trained with sess.run ...

Now for the PyTorch replication:

class BaselineModel(nn.Module):
    """Binary classifier over an utterance plus an (x, y, z) coordinate.

    The text is run through an embedding + single-layer LSTM; each scalar
    coordinate is mapped through its own linear layer; the four equal-width
    representations are summed and passed through two fully connected layers
    to produce 2-class logits (for use with ``nn.CrossEntropyLoss``).

    BUG FIXES vs. the original snippet:
      * ``embed_size``, ``vocab_size``, ``self.hidden_size`` and
        ``self.embedding_table_size`` were undefined names — they are now
        constructor parameters/attributes with defaults matching the working
        example later in this file.
      * ``self.hidden`` was read in ``forward`` but never initialized — it is
        now created in ``__init__``.
      * coordinate tensors are moved to the model's own device instead of
        hard-coded ``.cuda()`` calls, so the model also runs on CPU.
    """

    def __init__(self, vocab_size=40, hidden_size=100, batch_size=50, device='cpu'):
        super(BaselineModel, self).__init__()
        self.hidden_size = hidden_size
        self.embedding_table_size = hidden_size

        # one linear encoder per coordinate axis: scalar -> hidden_size
        self.encode_x = nn.Linear(1, hidden_size)
        self.encode_y = nn.Linear(1, hidden_size)
        self.encode_z = nn.Linear(1, hidden_size)
        self._embeddings = nn.Embedding(vocab_size, self.embedding_table_size)
        # mirror the TF model's uniform [-1, 1] embedding initialization
        nn.init.uniform_(self._embeddings.weight, -1.0, 1.0)
        self.num_layers = 1
        self.rnn = nn.LSTM(self.embedding_table_size, self.hidden_size, batch_first=True)
        self.fc_after_text_lstm = nn.Linear(self.hidden_size, 100)
        self.fc = nn.Linear(100, 256)
        self.fc_final = nn.Linear(256, 2)  # 2-way logits
        self.relu_activation = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        # fixed all-zero initial LSTM state, reused for every batch
        self.hidden = self.init_hidden(batch_size, device=device)

    def init_hidden(self, batch_size, device='cuda:0'):
        # (num_layers, batch, hidden) zero state for the single-layer LSTM
        h_0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
        c_0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
        return h_0, c_0

    def forward(self, input_text, x_coordinate=None, y_coordinate=None, z_coordinate=None):
        # run everything on whatever device the model's weights live on
        dev = self._embeddings.weight.device
        x_embed = self.relu_activation(self.encode_x(x_coordinate.to(dev).to(torch.float32)))
        y_embed = self.relu_activation(self.encode_y(y_coordinate.to(dev).to(torch.float32)))
        z_embed = self.relu_activation(self.encode_z(z_coordinate.to(dev).to(torch.float32)))

        embeds = self._embeddings(input_text.to(dev))
        embedding, hidden = self.rnn(embeds, self.hidden)
        # keep only the last time step of the LSTM output, project to width 100
        text_fc = self.relu_activation(self.fc_after_text_lstm(embedding[:, -1]))
        # element-wise sum of the four representations (mirrors the TF "+" merge)
        representations_so_far_added = torch.sum(torch.stack([text_fc, x_embed, y_embed, z_embed]), dim=0)
        pre_final_embedding = self.relu_activation(self.fc(representations_so_far_added))

        return self.fc_final(pre_final_embedding)

### training code
# BUG FIX: the original loop read `for ..., targets, train_data:` — the `in`
# keyword was missing, which is a SyntaxError. It now iterates the loader.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, eps=1e-7)
criterion = nn.CrossEntropyLoss()
for input_text, x_coordinate, y_coordinate, z_coordinate, targets in train_data:

    optimizer.zero_grad()
    pred = model(input_text, x_coordinate=x_coordinate, y_coordinate=y_coordinate, z_coordinate=z_coordinate)
    # CrossEntropyLoss expects raw logits and int64 class targets
    loss = criterion(pred.float(), targets)
    loss.backward()
    # clip the global grad norm at 5.0, mirroring TF's clip_by_global_norm
    torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
    optimizer.step()
    scheduler.step()  # NOTE(review): `scheduler` is not defined in this snippet — confirm it exists in the full script

    # accuracy evaluation code, this is evaluated over the entire epoch
    # log_softmax is monotonic, so argmax over it equals argmax over the logits
    pred_idx = F.log_softmax(pred, dim=1)
    target_labels = targets.cpu().int()
    pred_labels = torch.argmax(pred_idx, dim=-1).cpu().data.int()
    curr_acc = skm.accuracy_score(target_labels, pred_labels)


If anyone can spot any issue with the PyTorch implementation or maybe tell me what could be wrong, that would be much appreciated! I also tried to load the weights of the Tensorflow model into all the appropriate layers, and performance still struggles in PyTorch! Thanks in advance!

EDIT:
I have created a minimally reproducible example, because I still cannot figure out what the problem is. Any help would be still appreciated!

import torch
import torch.nn as nn
import numpy as np
# train on GPU when available, otherwise fall back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# hyperparameters for the minimal reproducible example
lr = 0.0005      # NOTE(review): the optimizer below is built with lr=0.001, not this value — confirm which was intended
n_epochs = 10    # passes over the synthetic dataset
input_dim = 4    
hidden_dim = 128  # LSTM hidden width
layer_dim = 2     # number of stacked LSTM layers
output_dim = 2    # binary classification -> 2 logits
batch_size = 50

class FeatureDataSet(torch.utils.data.Dataset):
    """Wraps parallel lists of token ids, labels, and x-coordinates as tensors."""

    def __init__(self, x_train, y_train, x_coordinates):
        # int64 tokens (required by nn.Embedding); float32 coordinates
        # (they feed an nn.Linear layer); labels keep their inferred dtype.
        self.x_train = torch.tensor(x_train, dtype=torch.long)
        self.y_train = torch.tensor(y_train)
        self.x_coordinates = torch.tensor(x_coordinates, dtype=torch.float32)

    def __len__(self):
        # one label per example
        return self.y_train.shape[0]

    def __getitem__(self, idx):
        sample = (self.x_train[idx], self.y_train[idx], self.x_coordinates[idx])
        return sample


class RNN(nn.Module):
    """LSTM over token ids plus a linear encoder for the x coordinate; the two
    representations are concatenated and classified by one linear layer.

    NOTE(review): layers are moved to the GPU via hard-coded ``.cuda()``
    calls, so this class requires a CUDA device. ``F`` (torch.nn.functional)
    is imported later in the script; that works because the name is resolved
    when ``forward`` runs, not when the class is defined.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, batch_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim

        # linear layer to encode the coordinate
        self.encode_x = nn.Linear(1, hidden_dim).cuda()
        self._embeddings = nn.Embedding(40, 100).cuda()

        # hidden_dim is 128
        # layer_dim is 2
        self.lstm = nn.LSTM(100, hidden_dim, layer_dim, batch_first=True).cuda()
        # classifier input is the LSTM's last step concatenated with the
        # coordinate encoding, hence 2 * hidden_dim
        self.fc = nn.Linear(2 * hidden_dim, output_dim).cuda()
        self.batch_size = batch_size
        self.hidden = None

    def init_hidden(self, x):
        # fresh all-zero (h0, c0) sized to the incoming batch
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)
        # NOTE(review): created on CPU here and moved to GPU in forward()
        return [t.cpu() for t in (h0, c0)]

    def forward(self, x, x_coordinate):
        #initializing the hidden states
        h0, c0 = self.init_hidden(x)
        embeds = self._embeddings(x)
        out, (hn, cn) = self.lstm(embeds.cuda(), (h0.cuda(), c0.cuda()))

        # ReLU-encoded scalar coordinate, same width as the LSTM hidden state
        x_embed = F.relu(self.encode_x(x_coordinate.cuda().to(torch.float32)).cuda())
        # concatenate last LSTM step (batch, hidden) with the coordinate encoding
        representations_so_far_added = torch.cat([out[:, -1, :], x_embed], dim=1)

        out = self.fc(representations_so_far_added)
        return out

    

# build the model, loss, and optimizer
model = RNN(input_dim, hidden_dim, layer_dim, output_dim, batch_size)
criterion = nn.CrossEntropyLoss()
# NOTE(review): lr=0.001 here although `lr = 0.0005` is defined above — confirm which was intended
opt = torch.optim.Adam(model.parameters(), lr=0.001)

print('Start model training')
import sklearn.metrics as skm
import torch.nn.functional as F


# synthesize a toy dataset: the last token (8 vs 9) states which sign the
# x coordinate should have, and the label says whether it actually does
x_train = []
x_coordinates = []
y_train = []
for i in range(10000):
    # create the data. if x_coordinate > 0 and the sentence says that (represented by [1, 5, 6, 8]), then we should predict positive else negative (if the x_coordinate > 0)
    # same applies if the x_coordinate < 0, just that the sentence is now [1, 5, 6, 9]
    if np.random.randint(0, 2) == 0:
        if np.random.randint(0, 2) == 0:
            # x coordinate > 0
            x_train.append([1, 5, 6, 8])
            x_coordinates.append([round(np.random.uniform(0.01, 1.00, 1)[0], 2)])
            y_train.append(1.0)
        else:
            # x coordinate > 0 negative
            x_train.append([1, 5, 6, 8])
            x_coordinates.append([round(np.random.uniform(-1.00, 0.00, 1)[0], 2)])
            y_train.append(0.0)
    else:
        if np.random.randint(0, 2) == 0:
            # x coordinate < 0
            x_train.append([1, 5, 6, 9])
            x_coordinates.append([round(np.random.uniform(-1.00, 0.00, 1)[0], 2)])
            y_train.append(1.0)
        else:
            # x coordinate < 0 negative
            x_train.append([1, 5, 6, 9])
            x_coordinates.append([round(np.random.uniform(0.01, 1.00, 1)[0], 2)])
            y_train.append(0.0)

# print a sample of data 
print(x_train[:10])
print(y_train[:10])
print(x_coordinates[:10])

# create a dataloader
trainingDataset = FeatureDataSet(x_train=x_train, y_train=y_train, x_coordinates=x_coordinates)
train_loader = torch.utils.data.DataLoader(dataset=trainingDataset, batch_size=batch_size, shuffle=True)


# for each epoch
for epoch in range(1, n_epochs + 1):
    acc_all = []
    # each batch
    for i, (x_batch, y_batch, x_coord_batch) in enumerate(train_loader):
        # move the batch to the training device
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        x_coord_batch = x_coord_batch.to(device)

        opt.zero_grad()
        # pass in the text (x_batch) and coordinate (x_coord_batch)
        out = model(x_batch, x_coordinate=x_coord_batch)
        # CrossEntropyLoss expects raw logits and int64 class targets
        loss = criterion(out.float(), y_batch.type(torch.LongTensor).cuda())
        loss.backward()
        opt.step()

        # log_softmax is monotonic, so argmax over it equals argmax over logits
        pred_idx = F.log_softmax(out, dim=1)
        target_labels = y_batch.cpu().int()
        pred_labels = torch.argmax(pred_idx, dim=-1).cpu().data.int()

        curr_acc = skm.accuracy_score(target_labels, pred_labels)
        acc_all.append(curr_acc)

    # mean of the per-batch accuracies for this epoch
    print(np.mean(acc_all))

Solution

I suppose perhaps there are some mistakes in your dataset implementation in the PyTorch version.

I tried your pytorch BaselineModel on both the dataset in your "minimally reproducible example" and my own dataset constructed according to your description, and find that it works fine.

The following is my code for testing on my own dataset. Note that I added several hyperparameters to the code of BaselineModel to make it run. I got accuracy over 99%.

import random

import torch
import torch.nn as nn
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

lr = 0.0005
n_epochs = 100
input_dim = 4
hidden_dim = 128
layer_dim = 2
output_dim = 2
batch_size = 50


class AverageMeter(object):
    """Tracks the most recent value plus a running sum/count average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero out all statistics."""
        self.val, self.avg, self.sum, self.count = 0, 0, 0, 0

    def update(self, val, n=1):
        """Record *val* observed *n* times and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


class FeatureDataSet(torch.utils.data.Dataset):
    """Tensor-backed dataset of (tokens, label, x, y, z coordinate) samples."""

    def __init__(self, x_train, y_train, x_coordinates, y_coordinates, z_coordinates):
        # Tokens must be int64 for the embedding lookup; each coordinate list
        # feeds an nn.Linear layer and is therefore stored as float32.
        self.x_train = torch.tensor(x_train, dtype=torch.long)
        self.y_train = torch.tensor(y_train)
        self.x_coordinates = torch.tensor(x_coordinates, dtype=torch.float32)
        self.y_coordinates = torch.tensor(y_coordinates, dtype=torch.float32)
        self.z_coordinates = torch.tensor(z_coordinates, dtype=torch.float32)

    def __len__(self):
        # dataset size == number of labels
        return self.y_train.shape[0]

    def __getitem__(self, idx):
        # one training example: tokens, target, then the three coordinates
        return (self.x_train[idx],
                self.y_train[idx],
                self.x_coordinates[idx],
                self.y_coordinates[idx],
                self.z_coordinates[idx])


class BaselineModel(nn.Module):
    """Text + (x, y, z) coordinate binary classifier (the working version).

    An LSTM encodes the utterance, three linear layers encode the scalar
    coordinates, the four equal-width representations are summed, and two
    fully connected layers produce the 2-class logits.
    """

    def __init__(self):
        super(BaselineModel, self).__init__()
        vocab_size = 40
        self.hidden_size = 100
        self.embedding_table_size = self.hidden_size

        # scalar -> hidden_size encoders, one per coordinate axis
        self.encode_x = nn.Linear(1, self.hidden_size)
        self.encode_y = nn.Linear(1, self.hidden_size)
        self.encode_z = nn.Linear(1, self.hidden_size)

        self._embeddings = nn.Embedding(vocab_size, self.embedding_table_size)
        nn.init.uniform_(self._embeddings.weight, -1.0, 1.0)  # mirror the TF init

        self.num_layers = 1
        self.rnn = nn.LSTM(self.embedding_table_size, self.hidden_size, batch_first=True)
        self.fc_after_text_lstm = nn.Linear(self.hidden_size, 100)
        self.fc = nn.Linear(100, 256)
        self.fc_final = nn.Linear(256, 2)
        self.relu_activation = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

        # fixed all-zero initial LSTM state, shared across batches
        self.hidden = self.init_hidden(batch_size)

    def init_hidden(self, batch_size, device='cuda:0'):
        # single-layer LSTM -> leading state dimension of 1
        shape = (1, batch_size, self.hidden_size)
        return torch.zeros(*shape).to(device), torch.zeros(*shape).to(device)

    def forward(self, input_text, x_coordinate=None, y_coordinate=None, z_coordinate=None):
        # ReLU-encode each scalar coordinate on the GPU
        coord_embeds = []
        for encoder, coord in ((self.encode_x, x_coordinate),
                               (self.encode_y, y_coordinate),
                               (self.encode_z, z_coordinate)):
            coord_embeds.append(self.relu_activation(encoder(coord.cuda().to(torch.float32))).cuda())

        # encode the utterance; keep only the final LSTM time step
        lstm_out, _ = self.rnn(self._embeddings(input_text), self.hidden)
        text_fc = self.relu_activation(self.fc_after_text_lstm(lstm_out[:, -1]))

        # element-wise sum of the four representations, then the MLP head
        summed = torch.stack([text_fc] + coord_embeds).sum(dim=0)
        return self.fc_final(self.relu_activation(self.fc(summed)))


# model = RNN(input_dim, hidden_dim, layer_dim, output_dim, batch_size)
# use BaselineModel (with the added hyperparameters) instead of RNN above
model = BaselineModel().cuda()
criterion = nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.parameters(), lr=0.001)

print('Start model training')
import sklearn.metrics as skm
import torch.nn.functional as F

# synthesize 10000 examples: the last token (8/9/10/11) names one of the four
# (x, z) quadrants; positive examples use the token matching the sampled
# coordinate, negatives pick one of the other three tokens at random
x_train = []
x_coordinates = []
y_coordinates = []
z_coordinates = []
y_train = []
for i in range(10000):
    x_coordinate = round(np.random.uniform(-1, 1.00, 1)[0], 2)
    y_coordinate = round(np.random.uniform(-1, 1.00, 1)[0], 2)
    z_coordinate = round(np.random.uniform(-1, 1.00, 1)[0], 2)
    x_coordinates.append([x_coordinate])
    y_coordinates.append([y_coordinate])
    z_coordinates.append([z_coordinate])

    if np.random.randint(0, 2) == 0: # positive example
        if x_coordinate <= 0 and z_coordinate <= 0:
            x_train.append([1, 5, 6, 8])
        elif x_coordinate <= 0 and z_coordinate > 0:
            x_train.append([1, 5, 6, 9])
        elif x_coordinate > 0 and z_coordinate <= 0:
            x_train.append([1, 5, 6, 10])
        elif x_coordinate > 0 and z_coordinate > 0:
            x_train.append([1, 5, 6, 11])
        y_train.append(1.0)
    else:
        # negative example: any of the three wrong quadrant tokens
        if x_coordinate <= 0 and z_coordinate <= 0:
            x_train.append(random.choice([[1, 5, 6, 9], [1, 5, 6, 10], [1, 5, 6, 11]]))
        elif x_coordinate <= 0 and z_coordinate > 0:
            x_train.append(random.choice([[1, 5, 6, 8], [1, 5, 6, 10], [1, 5, 6, 11]]))
        elif x_coordinate > 0 and z_coordinate <= 0:
            x_train.append(random.choice([[1, 5, 6, 8], [1, 5, 6, 9], [1, 5, 6, 11]]))
        elif x_coordinate > 0 and z_coordinate > 0:
            x_train.append(random.choice([[1, 5, 6, 8], [1, 5, 6, 9], [1, 5, 6, 10]]))
        y_train.append(0.0)



# print a sample of data
print(x_train[:10])
print(y_train[:10])
print(x_coordinates[:10])
print(y_coordinates[:10])
print(z_coordinates[:10])

# create a dataloader
trainingDataset = FeatureDataSet(x_train=x_train, y_train=y_train, x_coordinates=x_coordinates, y_coordinates=y_coordinates, z_coordinates=z_coordinates)
train_loader = torch.utils.data.DataLoader(dataset=trainingDataset, batch_size=batch_size, shuffle=True)

# for each epoch
loss_meter = AverageMeter()
for epoch in range(1, n_epochs + 1):
    acc_all = []
    # each batch
    loss_meter.reset()
    for i, (x_batch, y_batch, x_coord_batch, y_coord_batch, z_coord_batch) in enumerate(train_loader):
        # move the whole batch to the training device
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        x_coord_batch = x_coord_batch.to(device)
        y_coord_batch = y_coord_batch.to(device)
        z_coord_batch = z_coord_batch.to(device)

        opt.zero_grad()
        # pass in the text (x_batch) and coordinate (x_coord_batch)
        out = model(x_batch, x_coordinate=x_coord_batch, y_coordinate=y_coord_batch, z_coordinate=z_coord_batch)
        # CrossEntropyLoss expects raw logits and int64 class targets
        loss = criterion(out.float(), y_batch.type(torch.LongTensor).cuda())


        loss.backward()
        opt.step()

        # log_softmax is monotonic, so argmax over it equals argmax over logits
        pred_idx = F.log_softmax(out, dim=1)
        target_labels = y_batch.cpu().int()
        pred_labels = torch.argmax(pred_idx, dim=-1).cpu().data.int()

        curr_acc = skm.accuracy_score(target_labels, pred_labels)
        acc_all.append(curr_acc)

        loss_meter.update(loss.item())

    # per-epoch mean accuracy; loss_meter.val is the LAST batch's loss
    print(np.mean(acc_all))
    print("loss is %f" % loss_meter.val)

As for the "minimally reproducible example", I think it is quite reasonable that the model RNN does not work, as I have stated in the comments. I suspect that TensorFlow would not be able to fit it either, although I have not tried. Your "minimally reproducible example" may be unrelated to your main problem.

Answered By – hellohawaii

This Answer collected from stackoverflow, is licensed under cc by-sa 2.5 , cc by-sa 3.0 and cc by-sa 4.0

Leave a Reply

(*) Required, Your email will not be published