import numpy as np
import pandas as pd
import tensorflow as tf
from joblib import dump, load
import copy


class TFRecord_Writer(object):
    def __init__(self, fileName, compression_type='GZIP'):
        """
        TFRecord_Writer transforms an iterable of dict records into a TFRecord file

        Parameters
        ----------
        fileName : str
            path of the TFRecord file to write
        compression_type : str, optional
            compression type supported by TensorFlow datasets, by default 'GZIP'
        """
        self.fileName = fileName
        self._dtypeDict = {}
        self._compression_type = compression_type

    def _valueDict_builder(self, data):
        # Map each column of a record to a tf.train.Feature and remember its
        # dtype and shape in self._dtypeDict so the reader can reverse the process.
        valueDict = {}
        for i in data:
            if np.ndim(data[i]) > 0:
                # Multi-dimensional values are serialized as tensors.
                if not isinstance(data[i], np.ndarray):
                    data[i] = np.array(data[i])
                self._dtypeDict[i] = ('tensor', data[i].shape)
                valueDict[i] = _tensor_feature(data[i])
            else:
                # Scalars are stored as int64, float, or byte features;
                # isinstance also covers NumPy scalar types.
                if isinstance(data[i], (int, np.integer)):
                    self._dtypeDict[i] = ('int', [1])
                    valueDict[i] = _int64_feature(data[i])
                elif isinstance(data[i], (float, np.floating)):
                    self._dtypeDict[i] = ('float', [1])
                    valueDict[i] = _float_feature(data[i])
                else:
                    self._dtypeDict[i] = ('str', [1])
                    valueDict[i] = _bytes_feature(str.encode(data[i]))
        return valueDict
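
    # Illustrative sketch of what the builder records: for a record like
    # {'col': 1, 'col2': "123", 'col3': [1, 2]}, after one call
    #     self._dtypeDict == {'col': ('int', [1]),
    #                         'col2': ('str', [1]),
    #                         'col3': ('tensor', (2,))}
    # and valueDict maps each key to the matching tf.train.Feature, so the
    # reader can later rebuild every column with its original type and shape.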

    def _tfExample_factory(self, valueDict):
        # Wrap the feature dict in a tf.train.Example and serialize it.
        example_proto = tf.train.Example(features=tf.train.Features(feature=valueDict))
        return example_proto.SerializeToString()

    def write_tfRecord(self, data):
        """
        write_tfRecord writes records to a TFRecord file

        transforms a list of dict objects into a TFRecord file

        Parameters
        ----------
        data : list of dict
            list of dict records such as [{'col':1,'col2':"123",'col3':[1,2]}]
        """
        with tf.io.TFRecordWriter(self.fileName, self._compression_type) as writer:
            for X in data:
                valueDict = self._valueDict_builder(copy.copy(X))
                serialized_features_dataset = self._tfExample_factory(valueDict)
                writer.write(serialized_features_dataset)
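
    # Example usage (illustrative; the file name is hypothetical):
    #     tw = TFRecord_Writer('train.tfrecord')
    #     tw.write_tfRecord([{'col': 1, 'col2': "123", 'col3': [1, 2]},
    #                        {'col': 2, 'col2': "456", 'col3': [3, 4]}])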

    def get_tfRecord_dtype(self, pickleDir=None):
        """
        get_tfRecord_dtype gets the dtype dictionary needed for reading the TFRecord

        this method returns, and optionally pickles, the dictionary describing the
        data type of each TFRecord feature

        Parameters
        ----------
        pickleDir : str, optional
            path to pickle the dtype dictionary to if not None, by default None

        Returns
        -------
        dict
            dictionary for constructing a TFRecord_Reader
        """
        if pickleDir is not None:
            dump(self._dtypeDict, pickleDir)
        return self._dtypeDict
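
# Illustrative round trip for the dtype dictionary (paths are hypothetical):
# persisting it next to the record lets a reader be rebuilt in another
# process without re-running the writer.
#     tw.get_tfRecord_dtype(pickleDir='train_dtype.pkl')
#     dtypeDict = load('train_dtype.pkl')  # joblib.load, imported above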


class TFRecord_Reader(object):
    def __init__(self, fileName, dtypeDict, compression_type='GZIP'):
        """
        TFRecord_Reader reads a TFRecord file into a TF dataset

        a TFRecord file can be transformed into a TensorFlow dataset by this reader

        Parameters
        ----------
        fileName : str
            path of the TFRecord file
        dtypeDict : dict
            this dictionary defines the data types of the TensorFlow dataset features.
            it can be generated by the TFRecord_Writer class
        compression_type : str, optional
            compression type supported by TensorFlow datasets, by default 'GZIP'
        """
        self.fileName = fileName
        self._dtypeDict = dtypeDict
        self._compression_type = compression_type

    def feature_des_builder(self):
        """
        feature_des_builder creates the feature description object for the TensorFlow dataset

        uses dtypeDict to build the feature description object.
        notice: currently this builder only creates FixedLenFeature.

        Returns
        -------
        dict
            feature description object
        """
        feature_desc = {}
        for i in self._dtypeDict:
            if self._dtypeDict[i][0] == 'tensor':
                # Serialized tensors are stored as scalar byte strings.
                feature_desc[i] = tf.io.FixedLenFeature([], tf.string)
            elif self._dtypeDict[i][0] == 'float':
                feature_desc[i] = tf.io.FixedLenFeature([], tf.float32)
            elif self._dtypeDict[i][0] == 'int':
                feature_desc[i] = tf.io.FixedLenFeature([], tf.int64)
            else:
                feature_desc[i] = tf.io.FixedLenFeature([], tf.string)
        return feature_desc
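
    # Illustrative result: for dtypeDict {'col': ('int', [1]), 'col3': ('tensor', (2,))}
    # the builder returns
    #     {'col': tf.io.FixedLenFeature([], tf.int64),
    #      'col3': tf.io.FixedLenFeature([], tf.string)}
    # serialized tensors come back as scalar strings and are decoded later by
    # _read_tfrecord via tf.io.parse_tensor.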

    def _read_tfrecord(self, serialized_example, feature_desc, dtypeDict, tensor_opt_dtype):
        # Parse one serialized Example and rebuild any serialized tensors,
        # restoring their original shapes from dtypeDict.
        record = {}
        example = tf.io.parse_single_example(serialized_example, feature_desc)
        for i in dtypeDict:
            if dtypeDict[i][0] == 'tensor':
                tmp = tf.io.parse_tensor(example[i], out_type=tensor_opt_dtype)
                tmp.set_shape(dtypeDict[i][1])
                record[i] = tmp
            else:
                record[i] = example[i]
        return record

    def make_tfDataset(self, tensor_opt_dtype=tf.float32):
        """
        make_tfDataset makes a TensorFlow dataset from the TFRecord file

        Parameters
        ----------
        tensor_opt_dtype : tf dtype, optional
            the TensorFlow data type used when parsing serialized tensor features, by default tf.float32

        Returns
        -------
        tensorflow dataset
            TensorFlow dataset prepared for model training/testing
        """
        feature_desc = self.feature_des_builder()
        raw_dataset = tf.data.TFRecordDataset(self.fileName, compression_type=self._compression_type)
        return raw_dataset.map(lambda x: self._read_tfrecord(x, feature_desc, self._dtypeDict, tensor_opt_dtype))
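
    # Example usage (illustrative; names are hypothetical):
    #     tr = TFRecord_Reader('train.tfrecord', dtypeDict)
    #     ds = tr.make_tfDataset().batch(32)
    #     for batch in ds.take(1):
    #         print(batch['col3'])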


def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()  # BytesList won't unpack a string from an EagerTensor.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _float_feature(value):
    """Returns a float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))


def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def _tensor_feature(value):
    """Returns a bytes_list from a NumPy tensor (cast to float32 before serializing)."""
    return _bytes_feature(tf.io.serialize_tensor(value.astype(np.float32)))
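

if __name__ == '__main__':
    # Minimal end-to-end sketch (illustrative only; the temporary path and
    # sample records are hypothetical, not part of the library API).
    import os
    import tempfile

    tmp = os.path.join(tempfile.mkdtemp(), 'demo.tfrecord')
    records = [{'col': 1, 'col2': "abc", 'col3': [1.0, 2.0]},
               {'col': 2, 'col2': "def", 'col3': [3.0, 4.0]}]

    # Write the records, keeping the dtype dictionary for the reader.
    tw = TFRecord_Writer(tmp)
    tw.write_tfRecord(records)
    dtypeDict = tw.get_tfRecord_dtype()

    # Read them back as a TensorFlow dataset and print each parsed row.
    tr = TFRecord_Reader(tmp, dtypeDict)
    for row in tr.make_tfDataset():
        print({k: v.numpy() for k, v in row.items()})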