import numpy as np
import pandas as pd
import tensorflow as tf
from joblib import dump, load
import copy


class TFRecord_Writer(object):
    def __init__(self, fileName, compression_type='GZIP'):
        """
        TFRecord_Writer transforms an iterable of dict records into a TFRecord file

        Parameters
        ----------
        fileName : str
            path of the TFRecord file to write
        compression_type : str, optional
            compression type supported by TensorFlow datasets, by default 'GZIP'
        """
        self.fileName = fileName
        self._dtypeDict = {}
        self._compression_type = compression_type

    def _valueDict_builder(self, data):
        # Map each column of a record to a tf.train.Feature and remember its
        # dtype and shape in self._dtypeDict so the reader can reverse the process.
        valueDict = {}
        for i in data:
            if np.ndim(data[i]) > 0:
                # Multi-dimensional values are serialized as tensors.
                if not isinstance(data[i], np.ndarray):
                    data[i] = np.array(data[i])
                self._dtypeDict[i] = ('tensor', data[i].shape)
                valueDict[i] = _tensor_feature(data[i])
            else:
                # Scalars are stored as int64, float, or byte features;
                # isinstance also covers NumPy scalar types.
                if isinstance(data[i], (int, np.integer)):
                    self._dtypeDict[i] = ('int', [1])
                    valueDict[i] = _int64_feature(data[i])
                elif isinstance(data[i], (float, np.floating)):
                    self._dtypeDict[i] = ('float', [1])
                    valueDict[i] = _float_feature(data[i])
                else:
                    self._dtypeDict[i] = ('str', [1])
                    valueDict[i] = _bytes_feature(str.encode(data[i]))
        return valueDict
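
    # Illustrative sketch of what the builder records: for a record like
    # {'col': 1, 'col2': "123", 'col3': [1, 2]}, after one call
    #     self._dtypeDict == {'col': ('int', [1]),
    #                         'col2': ('str', [1]),
    #                         'col3': ('tensor', (2,))}
    # and valueDict maps each key to the matching tf.train.Feature, so the
    # reader can later rebuild every column with its original type and shape.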

    def _tfExample_factory(self, valueDict):
        # Wrap the feature dict in a tf.train.Example and serialize it.
        example_proto = tf.train.Example(features=tf.train.Features(feature=valueDict))
        return example_proto.SerializeToString()

    def write_tfRecord(self, data):
        """
        write_tfRecord writes records to a TFRecord file

        transforms a list of dict objects into a TFRecord file

        Parameters
        ----------
        data : list of dict
            list of dict records such as [{'col':1,'col2':"123",'col3':[1,2]}]
        """
        with tf.io.TFRecordWriter(self.fileName, self._compression_type) as writer:
            for X in data:
                valueDict = self._valueDict_builder(copy.copy(X))
                serialized_features_dataset = self._tfExample_factory(valueDict)
                writer.write(serialized_features_dataset)
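
    # Example usage (illustrative; the file name is hypothetical):
    #     tw = TFRecord_Writer('train.tfrecord')
    #     tw.write_tfRecord([{'col': 1, 'col2': "123", 'col3': [1, 2]},
    #                        {'col': 2, 'col2': "456", 'col3': [3, 4]}])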

    def get_tfRecord_dtype(self, pickleDir=None):
        """
        get_tfRecord_dtype gets the dtype dictionary needed for reading the TFRecord

        this method returns, and optionally pickles, the dictionary describing the
        data type of each TFRecord feature

        Parameters
        ----------
        pickleDir : str, optional
            path to pickle the dtype dictionary to if not None, by default None

        Returns
        -------
        dict
            dictionary for constructing a TFRecord_Reader
        """
        if pickleDir is not None:
            dump(self._dtypeDict, pickleDir)
        return self._dtypeDict
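
# Illustrative round trip for the dtype dictionary (paths are hypothetical):
# persisting it next to the record lets a reader be rebuilt in another
# process without re-running the writer.
#     tw.get_tfRecord_dtype(pickleDir='train_dtype.pkl')
#     dtypeDict = load('train_dtype.pkl')  # joblib.load, imported above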


class TFRecord_Reader(object):
    def __init__(self, fileName, dtypeDict, compression_type='GZIP'):
        """
        TFRecord_Reader reads a TFRecord file into a TF dataset

        a TFRecord file can be transformed into a TensorFlow dataset by this reader

        Parameters
        ----------
        fileName : str
            path of the TFRecord file
        dtypeDict : dict
            this dictionary defines the data types of the TensorFlow dataset features.
            it can be generated by the TFRecord_Writer class
        compression_type : str, optional
            compression type supported by TensorFlow datasets, by default 'GZIP'
        """
        self.fileName = fileName
        self._dtypeDict = dtypeDict
        self._compression_type = compression_type

    def feature_des_builder(self):
        """
        feature_des_builder creates the feature description object for the TensorFlow dataset

        uses dtypeDict to build the feature description object.
        notice: currently this builder only creates FixedLenFeature.

        Returns
        -------
        dict
            feature description object
        """
        feature_desc = {}
        for i in self._dtypeDict:
            if self._dtypeDict[i][0] == 'tensor':
                # Serialized tensors are stored as scalar byte strings.
                feature_desc[i] = tf.io.FixedLenFeature([], tf.string)
            elif self._dtypeDict[i][0] == 'float':
                feature_desc[i] = tf.io.FixedLenFeature([], tf.float32)
            elif self._dtypeDict[i][0] == 'int':
                feature_desc[i] = tf.io.FixedLenFeature([], tf.int64)
            else:
                feature_desc[i] = tf.io.FixedLenFeature([], tf.string)
        return feature_desc
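
    # Illustrative result: for dtypeDict {'col': ('int', [1]), 'col3': ('tensor', (2,))}
    # the builder returns
    #     {'col': tf.io.FixedLenFeature([], tf.int64),
    #      'col3': tf.io.FixedLenFeature([], tf.string)}
    # serialized tensors come back as scalar strings and are decoded later by
    # _read_tfrecord via tf.io.parse_tensor.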

    def _read_tfrecord(self, serialized_example, feature_desc, dtypeDict, tensor_opt_dtype):
        # Parse one serialized Example and rebuild any serialized tensors,
        # restoring their original shapes from dtypeDict.
        record = {}
        example = tf.io.parse_single_example(serialized_example, feature_desc)
        for i in dtypeDict:
            if dtypeDict[i][0] == 'tensor':
                tmp = tf.io.parse_tensor(example[i], out_type=tensor_opt_dtype)
                tmp.set_shape(dtypeDict[i][1])
                record[i] = tmp
            else:
                record[i] = example[i]
        return record

    def make_tfDataset(self, tensor_opt_dtype=tf.float32):
        """
        make_tfDataset makes a TensorFlow dataset from the TFRecord file

        Parameters
        ----------
        tensor_opt_dtype : tf dtype, optional
            the TensorFlow data type used when parsing serialized tensor features, by default tf.float32

        Returns
        -------
        tensorflow dataset
            TensorFlow dataset prepared for model training/testing
        """
        feature_desc = self.feature_des_builder()
        raw_dataset = tf.data.TFRecordDataset(self.fileName, compression_type=self._compression_type)
        return raw_dataset.map(lambda x: self._read_tfrecord(x, feature_desc, self._dtypeDict, tensor_opt_dtype))
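
    # Example usage (illustrative; names are hypothetical):
    #     tr = TFRecord_Reader('train.tfrecord', dtypeDict)
    #     ds = tr.make_tfDataset().batch(32)
    #     for batch in ds.take(1):
    #         print(batch['col3'])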


def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()  # BytesList won't unpack a string from an EagerTensor.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _float_feature(value):
    """Returns a float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))


def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def _tensor_feature(value):
    """Returns a bytes_list from a NumPy tensor (cast to float32 before serializing)."""
    return _bytes_feature(tf.io.serialize_tensor(value.astype(np.float32)))
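

if __name__ == '__main__':
    # Minimal end-to-end sketch (illustrative only; the temporary path and
    # sample records are hypothetical, not part of the library API).
    import os
    import tempfile

    tmp = os.path.join(tempfile.mkdtemp(), 'demo.tfrecord')
    records = [{'col': 1, 'col2': "abc", 'col3': [1.0, 2.0]},
               {'col': 2, 'col2': "def", 'col3': [3.0, 4.0]}]

    # Write the records, keeping the dtype dictionary for the reader.
    tw = TFRecord_Writer(tmp)
    tw.write_tfRecord(records)
    dtypeDict = tw.get_tfRecord_dtype()

    # Read them back as a TensorFlow dataset and print each parsed row.
    tr = TFRecord_Reader(tmp, dtypeDict)
    for row in tr.make_tfDataset():
        print({k: v.numpy() for k, v in row.items()})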