from time_series_transform.transform_core_api.time_series_transformer import Time_Series_Transformer
from time_series_transform.stock_transform.stock_transfromer import Stock_Transformer
import pandas as pd
import numpy as np
import pandas_ta as ta
Machine Learning¶
This demo trains a machine learning model using sklearn, Stock_Transformer, and Time_Series_Transformer to predict the daily momentum of a stock. Subsequently, we use the predicted momentum to generate buy/sell signals and plot them with Time_Series_Transformer.
Note: This tutorial aims to demonstrate how to use time_series_transformer, not how to invest in stocks.
Data Prep¶
We use the Yahoo API to fetch three years of Google, Nasdaq, and Gold data. From that data we generate various technical indicators, such as Bollinger Bands, RSI, MACD, and exponential moving averages.
strategy = ta.Strategy(
    name='mystrategy',
    ta=[
        {"kind": "ema", "length": 50},
        {"kind": "ema", "length": 7},
        {"kind": "ema", "length": 20},
        {"kind": "bbands", "length": 20},
        {"kind": "bbands", "length": 50},
        {"kind": "bbands", "length": 30},
        {"kind": "rsi", "prefix": "rsi"},
        {"kind": "macd", "fast": 8, "slow": 21},
    ]
)
st = Stock_Transformer.from_stock_engine_period(["GOOGL","NDAQ","GOLD"],'3y','yahoo')
st = st.get_technial_indicator(strategy)
st = st.dropna()
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 1.0s remaining: 0.0s
[Parallel(n_jobs=1)]: Done 2 out of 2 | elapsed: 2.1s remaining: 0.0s
[Parallel(n_jobs=1)]: Done 3 out of 3 | elapsed: 3.3s remaining: 0.0s
[Parallel(n_jobs=1)]: Done 3 out of 3 | elapsed: 3.3s finished
Since we want to use the Nasdaq and Gold data as features, we have to expand them into columns. Then, we use make_lead together with the up_down_transform function to generate the price movement for the next time period.
def up_or_down(current, lead):
    if current < lead:
        return 'up'
    elif current > lead:
        return 'down'
    else:
        return 'unchange'

def up_down_transform(data):
    currentList = data['Close_GOOGL']
    leadList = data['Close_GOOGL_lead_1']
    res = []
    for c, l in zip(currentList, leadList):
        res.append(up_or_down(c, l))
    return res
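As a quick check of the mapping, feeding a tiny dictionary of hypothetical closes through the transform labels each row by comparing the current close with the next day's close:
toy = {
    'Close_GOOGL': [100.0, 105.0, 103.0],        # hypothetical closing prices
    'Close_GOOGL_lead_1': [105.0, 103.0, 103.0]  # next day's close for each row
}
print(up_down_transform(toy))
# ['up', 'down', 'unchange']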
df = st.to_pandas(expandCategory= True, expandTime = False)
tst = Time_Series_Transformer.from_pandas(df,'Date',None)
tst = tst.make_lead('Close_GOOGL',1,'_lead_')
tst = tst.transform(['Close_GOOGL','Close_GOOGL_lead_1'],'change',up_down_transform)
tst = tst.remove_feature("Close_GOOGL_lead_1")
tst = tst.make_label("change")
X,y = tst.to_pandas(sepLabel = True)
After generating X and y, we hold out the last 60 days as test data. time_series_transform also provides some sklearn transformers. For instance, Lag_Transformer generates multiple lag features and can be combined with an sklearn pipeline, which means the number of lags can simply be tuned with cross-validation.
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV,TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from time_series_transform.sklearn.transformer import Lag_Transformer
testX,testY = X.tail(60),y.tail(60)
trainX,trainY = X.drop(testX.index),y.drop(testY.index)
trainX.columns
Index(['Date', 'Open_GOOGL', 'High_GOOGL', 'Low_GOOGL', 'Close_GOOGL',
'Volume_GOOGL', 'Dividends_GOOGL', 'Stock Splits_GOOGL', 'EMA_50_GOOGL',
'EMA_7_GOOGL', 'EMA_20_GOOGL', 'BBL_20_2.0_GOOGL', 'BBM_20_2.0_GOOGL',
'BBU_20_2.0_GOOGL', 'BBL_50_2.0_GOOGL', 'BBM_50_2.0_GOOGL',
'BBU_50_2.0_GOOGL', 'BBL_30_2.0_GOOGL', 'BBM_30_2.0_GOOGL',
'BBU_30_2.0_GOOGL', 'rsi_RSI_14_GOOGL', 'MACD_8_21_9_GOOGL',
'MACDh_8_21_9_GOOGL', 'MACDs_8_21_9_GOOGL', 'Open_GOLD', 'High_GOLD',
'Low_GOLD', 'Close_GOLD', 'Volume_GOLD', 'Dividends_GOLD',
'Stock Splits_GOLD', 'EMA_50_GOLD', 'EMA_7_GOLD', 'EMA_20_GOLD',
'BBL_20_2.0_GOLD', 'BBM_20_2.0_GOLD', 'BBU_20_2.0_GOLD',
'BBL_50_2.0_GOLD', 'BBM_50_2.0_GOLD', 'BBU_50_2.0_GOLD',
'BBL_30_2.0_GOLD', 'BBM_30_2.0_GOLD', 'BBU_30_2.0_GOLD',
'rsi_RSI_14_GOLD', 'MACD_8_21_9_GOLD', 'MACDh_8_21_9_GOLD',
'MACDs_8_21_9_GOLD', 'Open_NDAQ', 'High_NDAQ', 'Low_NDAQ', 'Close_NDAQ',
'Volume_NDAQ', 'Dividends_NDAQ', 'Stock Splits_NDAQ', 'EMA_50_NDAQ',
'EMA_7_NDAQ', 'EMA_20_NDAQ', 'BBL_20_2.0_NDAQ', 'BBM_20_2.0_NDAQ',
'BBU_20_2.0_NDAQ', 'BBL_50_2.0_NDAQ', 'BBM_50_2.0_NDAQ',
'BBU_50_2.0_NDAQ', 'BBL_30_2.0_NDAQ', 'BBM_30_2.0_NDAQ',
'BBU_30_2.0_NDAQ', 'rsi_RSI_14_NDAQ', 'MACD_8_21_9_NDAQ',
'MACDh_8_21_9_NDAQ', 'MACDs_8_21_9_NDAQ'],
dtype='object')
pip = Pipeline(
    [
        ("lag", Lag_Transformer(list(range(1, 20)), time_col='Date')),
        ("impute", SimpleImputer(strategy='median')),
        ("pca", PCA()),
        ('rf', RandomForestClassifier())
    ]
)
searchParam = {
    "lag__lag_nums": [list(range(1, 20)), list(range(1, 50)), list(range(1, 100))],
    "pca__n_components": list(range(2, 30)),
    "rf__n_estimators": list(range(100, 500)),
    "rf__min_samples_split": list(range(2, 20))
}
randPip = RandomizedSearchCV(
    pip,
    searchParam,
    cv=TimeSeriesSplit(5),
    n_iter=30,
    n_jobs=5
)
randPip = randPip.fit(trainX,trainY)
prd = randPip.predict(testX)
C:\Users\Allen Chiang\anaconda3\lib\site-packages\sklearn\pipeline.py:335: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
print(classification_report(testY,prd))
precision recall f1-score support
down 0.41 0.42 0.42 26
unchange 0.00 0.00 0.00 1
up 0.55 0.55 0.55 33
accuracy 0.48 60
macro avg 0.32 0.32 0.32 60
weighted avg 0.48 0.48 0.48 60
C:\Users\Allen Chiang\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use zero_division parameter to control this behavior.
confusion_matrix(testY,prd)
array([[11, 0, 15],
[ 1, 0, 0],
[15, 0, 18]], dtype=int64)
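Because the lag range is part of the pipeline, the fitted RandomizedSearchCV also records which configuration scored best during cross-validation; the standard sklearn attributes expose it:
# hyperparameters selected by the random search, including the winning lag range
print(randPip.best_params_)
# mean cross-validation score of that candidate
print(randPip.best_score_)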
After training the model, we predict on the test data. The signal is then generated by the following rules: 1. the first period should be a buy; 2. after a buy, hold until a sell; 3. buy when the predicted value is up; 4. sell when the predicted value is down.
def buy_sell_signal(dataList):
    res = []
    tmp = 'down'
    for i in dataList:
        if tmp == i or i == 'unchange':
            if i == 'unchange':
                res.append('hold')
                continue
            tmp = i
            res.append('hold')
        else:
            tmp = i
            res.append(i)
    return res
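As a sanity check of these rules, a short hand-made prediction list (not from the model) produces the expected alternating signals:
# the first 'up' opens a position, repeated predictions become 'hold',
# 'unchange' is always 'hold', and an opposite prediction flips the position
print(buy_sell_signal(['up', 'up', 'unchange', 'down', 'up']))
# ['up', 'hold', 'hold', 'down', 'up']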
det = pd.DataFrame({'prd':buy_sell_signal(prd)})
det['Close'] = testX.Close_GOOGL.tolist()
det['Date'] = testX.Date.tolist()
det['Buy'] = det[['Close','prd']].apply(lambda x: x.Close if x.prd == 'up' else 0,axis =1)
det['Sell'] = det[['Close','prd']].apply(lambda x: x.Close if x.prd == 'down' else 0,axis =1)
buySignal = det[det.Buy > 0][['Date','Buy']]
sellSignal = det[det.Sell > 0][['Date','Sell']]
det_tst = Time_Series_Transformer.from_pandas(det,'Date',None)
det_tst.plot(["Close"],'info').add_marker(
x = buySignal['Date'],
y = buySignal['Buy'],color = 'green',
legendName = 'Buy').add_marker(
x = sellSignal['Date'],
y = sellSignal['Sell'],color = 'red',
legendName = 'Sell')
det.Sell.sum() -det.Buy.sum()
164.6298828125
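Since the generated buy and sell rows strictly alternate (every buy is eventually followed by a sell, except possibly an open position at the end of the window), the aggregate above can also be broken down into individual trades. A small sketch, assuming we simply pair each buy with the following sell:
buys = det.loc[det.Buy > 0, 'Buy'].tolist()
sells = det.loc[det.Sell > 0, 'Sell'].tolist()
# zip drops a trailing unmatched buy if the window ends with an open position
print([round(s - b, 2) for b, s in zip(buys, sells)])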
Deep Learning¶
Similar to the machine learning example above, this tutorial uses an LSTM to produce trading signals. We demonstrate how to use time_series_transform to generate stock sequences and serialize them into TFRecords.
import tensorflow as tf
from time_series_transform.transform_core_api.time_series_transformer import Time_Series_Transformer
from time_series_transform.stock_transform.stock_transfromer import Stock_Transformer
import pandas as pd
import numpy as np
import pandas_ta as ta
from tensorflow.keras import layers
from tensorflow import keras
from time_series_transform import tfDataset_adopter as tda
from sklearn.metrics import classification_report,confusion_matrix
def up_or_down(current, lead):
    if current < lead:
        return [1, 0, 0]
    elif current > lead:
        return [0, 0, 1]
    else:
        return [0, 1, 0]

def up_down_transform(data):
    currentList = data['Close_GOOGL']
    leadList = data['Close_GOOGL_lead_1']
    res = []
    for c, l in zip(currentList, leadList):
        res.append(up_or_down(c, l))
    return res
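The only difference from the machine learning section is that the label is now a one-hot vector: index 0 marks an up move, index 1 unchanged, and index 2 a down move, matching the three-unit softmax output defined later. For example, with hypothetical closes:
print(up_or_down(100.0, 105.0))  # [1, 0, 0] -> up
print(up_or_down(105.0, 103.0))  # [0, 0, 1] -> down
print(up_or_down(103.0, 103.0))  # [0, 1, 0] -> unchange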
Here, we extract three years of Google, Nasdaq, and Gold data from the Yahoo API. Then, we generate lag sequences of length 10 and use make_stack_sequence to stack them into a single sequence feature. The stock movement label is created by comparing the lead data with the current data.
st = Stock_Transformer.from_stock_engine_period(["GOOGL","NDAQ","GOLD"],'3y','yahoo')
df = st.to_pandas(expandCategory= True, expandTime = False,preprocessType = 'remove')
tst = Time_Series_Transformer.from_pandas(df,'Date',None)
tst = tst.make_lag_sequence(
    inputLabels=["Close_GOLD", "Close_GOOGL", "Close_NDAQ"],
    windowSize=10,
    lagNum=1,
    suffix="_lag_seq_")
tst = tst.make_stack_sequence(
    inputLabels=["Close_GOLD_lag_seq_10", "Close_GOOGL_lag_seq_10", "Close_NDAQ_lag_seq_10"],
    newName="Close_seq", axis=-1
)
tst = tst.dropna()
tst = tst.make_lead('Close_GOOGL',1,'_lead_')
tst = tst.transform(['Close_GOOGL','Close_GOOGL_lead_1'],'change',up_down_transform)
tst = tst.make_label("change")
Once the data is pre-processed, we hold out the last 60 days as test data. Subsequently, we use TFRecord_Writer and TFRecord_Reader to serialize the data into a TFRecord file and read it back as a TensorFlow Dataset. Once the data is ready, we train the LSTM model.
X,y = tst.to_pandas(sepLabel=True)
testX,testY = X.tail(60),y.tail(60)
trainX,trainY = X.drop(testX.index),y.drop(testY.index)
twX = tda.TFRecord_Writer('trainX.tfRecord')
twX.write_tfRecord(pd.DataFrame(trainX).to_dict('records'))
twY = tda.TFRecord_Writer('trainY.tfRecord')
twY.write_tfRecord(trainY.to_dict('records'))
datasetTrainX = tda.TFRecord_Reader('trainX.tfRecord',twX.get_tfRecord_dtype()).make_tfDataset()
datasetTrainY = tda.TFRecord_Reader('trainY.tfRecord',twY.get_tfRecord_dtype()).make_tfDataset()
dataset = tf.data.Dataset.zip((datasetTrainX,datasetTrainY)).batch(686)
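Before defining the network, it can help to peek at one batch and confirm that the stacked sequence comes back from the TFRecord with the (10, 3) shape the LSTM input expects; a small sketch (the features are yielded as a dict keyed by column name):
for featureBatch, labelBatch in dataset.take(1):
    # the model below assumes each Close_seq element has shape (10, 3)
    print(featureBatch['Close_seq'].shape)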
inputs = keras.Input(shape=(10, 3), name="Close_seq")
x = layers.LSTM(20)(inputs)
# feed the LSTM output (not the raw inputs) into the dense head
x = layers.Flatten()(x)
x = layers.Dense(10, activation="relu", name="dense_2")(x)
outputs = layers.Dense(3, activation="softmax", name="change")(x)
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001),
    loss=tf.keras.losses.CategoricalCrossentropy()
)
model.fit(
    dataset,
    batch_size=1,
    epochs=500,
    verbose=0
)
C:\Users\Allen Chiang\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\functional.py:592: UserWarning: Input dict contained keys ['Date', 'Open_GOLD', 'High_GOLD', 'Low_GOLD', 'Close_GOLD', 'Volume_GOLD', 'Dividends_GOLD', 'Stock Splits_GOLD', 'Open_GOOGL', 'High_GOOGL', 'Low_GOOGL', 'Close_GOOGL', 'Volume_GOOGL', 'Dividends_GOOGL', 'Stock Splits_GOOGL', 'Open_NDAQ', 'High_NDAQ', 'Low_NDAQ', 'Close_NDAQ', 'Volume_NDAQ', 'Dividends_NDAQ', 'Stock Splits_NDAQ', 'Close_GOLD_lag_seq_10', 'Close_GOOGL_lag_seq_10', 'Close_NDAQ_lag_seq_10', 'Close_GOOGL_lead_1'] which did not match any model input. They will be ignored by the model.
<tensorflow.python.keras.callbacks.History at 0x29213c84220>
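It can also help to print the layer graph and confirm how the Close_seq input flows through the LSTM into the softmax head:
# layer-by-layer shapes of the trained model
model.summary()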
# convert the stacked test sequences into a (60, 10, 3) numpy array for prediction
testXSeq = np.array(testX['Close_seq'].tolist())
prd = model.predict(testXSeq)
def label_gen(data):
    prdRes = []
    for i in data:
        if np.argmax(i) == 0:
            prdRes.append('up')
        elif np.argmax(i) == 2:
            prdRes.append('down')
        else:
            prdRes.append('unchange')
    return prdRes
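label_gen simply inverts the one-hot convention on the softmax outputs, for example:
# argmax 0 -> 'up', argmax 2 -> 'down', anything else -> 'unchange'
print(label_gen([[0.7, 0.2, 0.1], [0.1, 0.2, 0.7], [0.2, 0.6, 0.2]]))
# ['up', 'down', 'unchange']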
prd = label_gen(prd)
testY = label_gen(testY.to_dict('list')['change'])
print(classification_report(testY,prd))
precision recall f1-score support
down 0.44 0.73 0.55 26
unchange 0.00 0.00 0.00 1
up 0.59 0.30 0.40 33
accuracy 0.48 60
macro avg 0.34 0.34 0.32 60
weighted avg 0.52 0.48 0.46 60
C:\Users\Allen Chiang\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use zero_division parameter to control this behavior.
confusion_matrix(testY,prd)
array([[19, 0, 7],
[ 1, 0, 0],
[23, 0, 10]], dtype=int64)
The trading signal is generated in the same way as in the machine learning case. The result is shown below.
def buy_sell_signal(dataList):
    res = []
    tmp = 'down'
    for i in dataList:
        if tmp == i or i == 'unchange':
            if i == 'unchange':
                res.append('hold')
                continue
            tmp = i
            res.append('hold')
        else:
            tmp = i
            res.append(i)
    return res
det = pd.DataFrame({'prd':buy_sell_signal(prd)})
det['Close'] = testX.Close_GOOGL.tolist()
det['Date'] = testX.Date.tolist()
det['Buy'] = det[['Close','prd']].apply(lambda x: x.Close if x.prd == 'up' else 0,axis =1)
det['Sell'] = det[['Close','prd']].apply(lambda x: x.Close if x.prd == 'down' else 0,axis =1)
buySignal = det[det.Buy > 0][['Date','Buy']]
sellSignal = det[det.Sell > 0][['Date','Sell']]
det_tst = Time_Series_Transformer.from_pandas(det,'Date',None)
det_tst.plot(["Close"],'info').add_marker(
x = buySignal['Date'],
y = buySignal['Buy'],color = 'green',
legendName = 'Buy').add_marker(
x = sellSignal['Date'],
y = sellSignal['Sell'],color = 'red',
legendName = 'Sell')
det.Sell.sum() -det.Buy.sum()
122.7799072265625