# Data libraries
# ==============================================================================
import numpy as np
import pandas as pd
# Plotting libraries
# ==============================================================================
import matplotlib.pyplot as plt
import matplotlib.font_manager
plt.style.use('ggplot')
# Classical preprocessing
# ==============================================================================
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.preprocessing import MinMaxScaler
# Tensorflow Quantum
# ==============================================================================
from cirq.contrib.svg import SVGCircuit
import tensorflow_quantum as tfq
import tensorflow as tf
import cirq
import sympy
from tensorflow.keras.models import Sequential, load_model
# Warnings configuration
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')
# Metrics
# ==============================================================================
from sklearn.metrics import mean_absolute_error
# Classical preprocessing.
# This class only needs the path and filename of our classical data.
# We consider two methods: use only the closing values and normalize them,
# or use PCA to drop some parameters of the original data.
# Each instance covers 30 days (or the number given in the input);
# it is important that the instance size is a multiple of 3.
# We use the hold-out method with a 70-30 split for training,
# though other values for this validation method can be chosen.
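# Example: with 1000 rows and porcentage=0.7, the first 700 rows form the
# training set (used to fit the scaler) and the remaining 300 the test set.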
class Normalization():
    def __init__(self, filename='AAPL', address='Data/stocks_predictions/', address_original='Data/Stocks/'):  # read a default file
        self.filename = filename
        self.address = address
        self.address_original = address_original
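    # The constructor only stores paths: Normalization(filename='AAPL') will
    # read Data/Stocks/AAPL.csv and Data/stocks_predictions/AAPL_train.csv
    # (or AAPL_test.csv) in preprocessing() below.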
    def create_dataset(self, df, days=30):  # convert an array into sliding-window x and y sets
        x = []
        y = []
        for i in range(days, df.shape[0]):
            x.append(df[i-days:i, 0])  # window of the previous `days` values
            y.append(df[i, 0])         # value that follows the window
        x = np.array(x)
        y = np.array(y)
        return x, y
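    # Example (sketch): with days=3 and df = np.array([[1],[2],[3],[4],[5]]),
    # create_dataset returns x = [[1,2,3],[2,3,4]] and y = [4,5]:
    # each row of x is a sliding window and y holds the value that follows it.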
    def preprocessing(self, porcentage=0.7, days=30, flag_pca=False, dataset='train'):  # days is not used in this method
        ## format of CSV file: Date,Open,High,Low,Close,Adj Close,Volume
        df = pd.read_csv(self.address_original+self.filename+'.csv')  # read the csv file with pandas
        del df["Date"]  # delete the Date column
        df_preprocessing = []
        if flag_pca:
            df_pca = df.copy()
            for i in df_pca.columns:  # scale each column before applying PCA
                df_pca[i] = MinMaxScaler().fit_transform(np.array(df_pca[i]).reshape(-1, 1))
            pca_pipe = make_pipeline(StandardScaler(), PCA())
            pca_pipe.fit(df_pca)
            # Extract the trained model from the pipeline
            model_pca = pca_pipe.named_steps['pca']
            # Component loadings table (one row per principal component);
            # built for inspection, not used downstream
            pd.DataFrame(
                data=model_pca.components_,
                columns=df_pca.columns,
                index=['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6']
            )
            df_pca = df_pca['Open'].values
            df_preprocessing = df_pca.reshape(-1, 1)
        else:
            df = df['Close'].values
            df_preprocessing = df.reshape(-1, 1)
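        # At this point df_preprocessing is an (n_rows, 1) column vector,
        # which is the shape MinMaxScaler expects below.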
        dataset_train = np.array(df_preprocessing[:int(df_preprocessing.shape[0]*porcentage)])
        dataset_test = np.array(df_preprocessing[int(df_preprocessing.shape[0]*porcentage):])
        scaler = MinMaxScaler(feature_range=(0, 1))
        dataset_train = scaler.fit_transform(dataset_train)
        dataset_test = scaler.transform(dataset_test)
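        # Note: the scaler is fitted on the training split only and merely
        # applied to the test split, so no test information leaks into the fit.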
        if dataset == 'train':
            df_output = pd.read_csv(self.address+self.filename+'_train.csv')
        else:
            df_output = pd.read_csv(self.address+self.filename+'_test.csv')
        # Map the stored predictions back to the original price scale
        predictions = scaler.inverse_transform(np.array(df_output['y_pred']).reshape(-1, 1))
        y_test_scaled = scaler.inverse_transform(np.array(df_output['y_real']).reshape(-1, 1))
        results = np.dstack((y_test_scaled, predictions))  # shape (n, 1, 2): (real, predicted)
        return results
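
# A minimal sketch (not part of the original class): the (real, predicted)
# pairs returned by preprocessing() can be scored with the
# mean_absolute_error imported above. The name `evaluate_results` is
# illustrative, not an existing helper.
def evaluate_results(results):
    y_real = results[:, 0, 0]  # results has shape (n, 1, 2): (real, predicted)
    y_pred = results[:, 0, 1]
    return mean_absolute_error(y_real, y_pred)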
if __name__ == "__main__":
    name_stocks = ['AAPL','ABB','ABBV','TOT','WMT','DUK','CHL','HSBC']
    address = 'Data/stocks_predictions/'
    address_original = 'Data/Stocks/'
    for name in name_stocks:
        nm = Normalization(filename=name)
        for ds in ['train','test']:
            result = nm.preprocessing(porcentage=0.7, days=30, flag_pca=False, dataset=ds)
            print(result)
            np.save(address+'data_'+name+'_'+ds+'.npy', result)
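    # The saved arrays can be reloaded later with np.load, e.g.:
    # results = np.load(address+'data_AAPL_train.npy')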