Tensorflow: read variable length data, via Dataset (tfrecord)

Issue

Best

I would like to read some TF records data.
This works for fixed-length data, but now I would like to do the same thing with variable-length data (VarLenFeature).

def load_tfrecord_fixed(serialized_example):
    """Parse one serialized SequenceExample with fixed-length sequences.

    Returns a pair of dicts: (context features, sequence features).
    """
    # Per-record scalar context features.
    ctx_spec = {
        'length': tf.FixedLenFeature([], dtype=tf.int64),
        'type': tf.FixedLenFeature([], dtype=tf.string),
    }

    # Per-timestep features; every record here has the same length.
    seq_spec = {
        "values": tf.FixedLenSequenceFeature([], dtype=tf.int64),
    }

    ctx, seq = tf.parse_single_sequence_example(
        serialized=serialized_example,
        context_features=ctx_spec,
        sequence_features=seq_spec,
    )
    return ctx, seq

and

   tf.reset_default_graph()



    with tf.Session() as sess:

        filenames = [fp.name]

        dataset = tf.data.TFRecordDataset(filenames)
        dataset = dataset.map(load_tfrecord_fixed)
        dataset = dataset.repeat()
        dataset = dataset.batch(2)

        iterator = dataset.make_initializable_iterator()
        next_element = iterator.get_next()

        a = sess.run(iterator.initializer)

        for i in range(3):
            a = sess.run(next_element)
            print(a)

result:

({'length': array([3, 3], dtype=int64), 'type': array([b'FIXED_length', b'FIXED_length'], dtype=object)}, {'values': array([[82,  2,  2],
       [42,  5,  1]], dtype=int64)}) ({'length': array([3, 3], dtype=int64), 'type': array([b'FIXED_length', b'FIXED_length'], dtype=object)}, {'values': array([[2, 3, 1],
       [1, 2, 3]], dtype=int64)}) ({'length': array([3, 3], dtype=int64), 'type': array([b'FIXED_length', b'FIXED_length'], dtype=object)}, {'values': array([[  1, 100, 200],
       [123,  12,  12]], dtype=int64)})

Here is the map function I am trying to use, but it gives me errors:

def load_tfrecord_variable(serialized_example):
    """Broken attempt at parsing variable-length sequences (kept as-is).

    This is the question's failing code: it mixes the old queue-runner
    batching API (tf.train.batch) into a Dataset.map function.  Inside a
    tf.data pipeline no queue runners are started, so the batch op never
    receives elements and the pipeline fails with the OutOfRangeError
    ("Attempted to repeat an empty dataset infinitely") shown below.
    """

    # Scalar per-record context features.
    context_features = {
        'length':tf.FixedLenFeature([],dtype=tf.int64),
        'batch_size':tf.FixedLenFeature([],dtype=tf.int64),
        'type':tf.FixedLenFeature([],dtype=tf.string)
    }

    # VarLenFeature parses to a SparseTensor (variable length per record).
    sequence_features = {
        "values":tf.VarLenFeature(tf.int64)
    }


    context_parsed, sequence_parsed = tf.parse_single_sequence_example(
        serialized=serialized_example,
        context_features=context_features,
        sequence_features=sequence_features
    )
    #return context_parsed, sequence_parsed (which is sparse)

    # return context_parsed, sequence_parsed
    # BUG: tf.train.batch belongs to the queue-based input API; calling it
    # inside Dataset.map leaves its queue unfed, which is the source of the
    # OutOfRangeError reported below.  Batching/padding should instead be
    # done by Dataset.padded_batch (see the Solution section).
    batched_data = tf.train.batch(
        tensors=[sequence_parsed['values']],
        batch_size= 2,
        dynamic_pad=True
    )

    # make dense data
    dense_data = tf.sparse_tensor_to_dense(batched_data)

    return context_parsed, dense_data

error:

OutOfRangeError: Attempted to repeat an empty dataset infinitely.
     [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[[], [], [], [?,?,?]], output_types=[DT_INT64, DT_INT64, DT_STRING, DT_INT64], _device="/job:localhost/replica:0/task:0/device:CPU:0"](Iterator)]]

During handling of the above exception, another exception occurred:

Can someone help me? Also, I am using TensorFlow nightly.
I do not think I am missing much…

Solution

def load_tfrecord_variable(serialized_example):
    """Parse one SequenceExample whose 'values' sequence is variable-length.

    Returns a flat 4-tuple (length, batch_size, type, values) suitable for
    Dataset.padded_batch; the sparse 'values' feature is reduced to its
    dense 1-D `.values` tensor here, and padding is left to padded_batch
    downstream.
    """

    # Scalar per-record context features.
    context_features = {
        'length': tf.FixedLenFeature([], dtype=tf.int64),
        'batch_size': tf.FixedLenFeature([], dtype=tf.int64),
        'type': tf.FixedLenFeature([], dtype=tf.string)
    }

    # VarLenFeature parses to a SparseTensor of per-record length.
    sequence_features = {
        "values": tf.VarLenFeature(tf.int64)
    }

    context_parsed, sequence_parsed = tf.parse_single_sequence_example(
        serialized=serialized_example,
        context_features=context_features,
        sequence_features=sequence_features
    )

    length = context_parsed['length']
    batch_size = context_parsed['batch_size']
    # Renamed from `type`, which shadowed the Python builtin.
    record_type = context_parsed['type']

    # The SparseTensor's `.values` is the dense 1-D tensor of actual entries.
    values = sequence_parsed['values'].values

    return tf.tuple([length, batch_size, record_type, values])
    
# Build the pipeline around the variable-length parser defined above.
filenames = [fp.name]
batch_size = 2  # records per batch; a plain Python int is needed here

dataset = tf.data.TFRecordDataset(filenames)
# Bug fix: map the *variable*-length parser. The padded_batch shapes below
# match its flat 4-tuple output; load_tfrecord_fixed returns two dicts and
# would not match this structure.
dataset = dataset.map(load_tfrecord_variable)
dataset = dataset.repeat()
dataset = dataset.padded_batch(
    batch_size,
    padded_shapes=(
        tf.TensorShape([]),      # length: scalar
        tf.TensorShape([]),      # batch_size: scalar
        tf.TensorShape([]),      # type: scalar string
        tf.TensorShape([None])   # values: padded to the longest in the batch
        # if you reshape 'values' in load_tfrecord_variable, add the added
        # dims after None, e.g. [None, 3]
        ),
    padding_values = (
        tf.constant(0, dtype=tf.int64),
        tf.constant(0, dtype=tf.int64),
        tf.constant(""),
        tf.constant(0, dtype=tf.int64)
        )
    )

iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    sess.run(iterator.initializer)
    for i in range(3):
        [length_vals, batch_size_vals, type_vals, values_vals] = sess.run(next_element)

Answered By – Maosi Chen

This Answer collected from stackoverflow, is licensed under cc by-sa 2.5 , cc by-sa 3.0 and cc by-sa 4.0

Leave a Reply

(*) Required, Your email will not be published