Commit 52a297c1 authored by Malik Algelly's avatar Malik Algelly

fix(article-service): TMTC

parent 6347bc41
@@ -2,3 +2,4 @@
from .dataset import MoleculeDataModule
from .dataset import MoleculeDataset
from .sklearn_dataset import MoleculeSklearnDataset
import pandas as pd
import numpy as np


class MoleculeSklearnDataset():
    def __init__(self, path_of_csv):
        self.data = pd.read_csv(path_of_csv)
        # Drop rows with missing values.
        self.data = self.data.dropna()
        # For each molecule, keep only the sample with the lowest energy.
        self.data = self.data.sort_values(by=["Energy_(kcal/mol)"])
        self.data = self.data.drop_duplicates(subset=["Chiral_Molecular_SMILES"])
        self.data = self.data.sort_index()
        # Add one attribute per character occurring in any Chiral Molecular
        # SMILES string, holding its number of occurrences in each molecule's
        # SMILES. (This is a very simple tokenization.)
        symbols = np.unique(list("".join(self.data["Chiral_Molecular_SMILES"])))
        symbols.sort()
        for symbol in symbols:
            # str.count() treats its argument as a regex, so regex
            # metacharacters must be escaped before counting.
            if symbol in list("\\^$.|?*+()[]{}"):
                self.data["Count of {}".format(symbol)] = self.data[
                    "Chiral_Molecular_SMILES"
                ].str.count("\\" + symbol)
            else:
                self.data["Count of {}".format(symbol)] = self.data[
                    "Chiral_Molecular_SMILES"
                ].str.count(symbol)
        self.X = self.data.drop(
            columns=[
                "Energy DG:kcal/mol)",
                "Energy_(kcal/mol)",
                "Chiral_Molecular_SMILES",
            ]
        )
        self.y = self.data[["Energy DG:kcal/mol)", "Energy_(kcal/mol)"]]
        # Standardize features and targets to zero mean and unit variance.
        self.X = (self.X - self.X.mean()) / self.X.std()
        self.y = (self.y - self.y.mean()) / self.y.std()

    def getDataNoSmiles(self):
        cols = [c for c in self.X.columns if not c.lower().startswith('count')]
        return self.X[cols]

    def getDataWithOnlySmiles(self):
        cols = [c for c in self.X.columns if c.lower().startswith('count')]
        return self.X[cols]

    def getDataWithSmiles(self):
        return self.X

    def getY(self):
        return self.y
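
# Minimal usage sketch (illustrative; the CSV path is the one used in the
# accompanying notebooks):
#     ds = MoleculeSklearnDataset("../data/data.csv")
#     X, y = ds.getDataWithSmiles(), ds.getY()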
%% Cell type:markdown id: tags:
# Linear Regression
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
```
%% Cell type:code id: tags:
``` python
from malikule.dataset import MoleculeSklearnDataset
```
%% Cell type:markdown id: tags:
### Preprocessing of data
%% Cell type:markdown id: tags:
The data are normalized and columns describing the composition of the molecules are added. For more information, see the initialization code of the MoleculeSklearnDataset class.
%% Cell type:code id: tags:
``` python
path = "../data/data.csv"
dm = MoleculeSklearnDataset(path)
```
%% Output
/home/moi/.local/lib/python3.8/site-packages/pandas/core/nanops.py:892: RuntimeWarning: overflow encountered in square
sqr = _ensure_numeric((avg - values) ** 2)
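%% Cell type:markdown id: tags:
As an illustrative check, the engineered features can be inspected directly: each column named `Count of c` holds the number of occurrences of character `c` in the molecule's SMILES string.
%% Cell type:code id: tags:
``` python
# Illustrative check: list the engineered count columns and preview them.
count_cols = [c for c in dm.data.columns if c.startswith("Count of")]
dm.data[count_cols].head()
```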
%% Cell type:markdown id: tags:
### Getting the different data sets
%% Cell type:markdown id: tags:
We create three data sets:
- one that does not contain the SMILES data of the molecules
- one that contains only the SMILES data of the molecules
- one that contains all the data
%% Cell type:code id: tags:
``` python
X_no_smiles = dm.getDataNoSmiles()
X_only_smiles = dm.getDataWithOnlySmiles()
X_with_smiles = dm.getDataWithSmiles()
Xs = [X_no_smiles, X_only_smiles, X_with_smiles]
data_model = ["X without SMILES", "X with only SMILES", "X with SMILES" ]
y = dm.getY()
```
%% Cell type:markdown id: tags:
### Cross-validation
%% Cell type:markdown id: tags:
We define a function that takes a data set and an integer k, splits the data into k folds, and performs cross-validation: a linear regression is trained and evaluated on each fold. The function returns the average validation MSE and the model from the fold with the lowest MSE.
%% Cell type:code id: tags:
``` python
def cross_validation(X, y, k=5):
    mse_val = 0
    best = np.inf
    kf = KFold(n_splits=k, random_state=0, shuffle=True)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        reg = LinearRegression().fit(X_train, y_train)
        y_pred = reg.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        mse_val += mse
        # Keep the model from the fold with the lowest validation MSE
        # (comparing the fold MSE, not the running sum).
        if mse < best:
            best = mse
            best_reg = reg
    return mse_val / k, best_reg
```
%% Cell type:markdown id: tags:
Then we apply the previous function to the different data sets and evaluate the selected linear regression on a held-out test set.
%% Cell type:code id: tags:
``` python
k = 5
for i, X in enumerate(Xs):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    mse_val_energy_, reg_energy_ = cross_validation(X_train, y_train["Energy_(kcal/mol)"], k)
    mse_val_energy_DG, reg_energy_DG = cross_validation(X_train, y_train["Energy DG:kcal/mol)"], k)
    y_pred_energy_ = reg_energy_.predict(X_test)
    mse_energy_ = mean_squared_error(y_test["Energy_(kcal/mol)"], y_pred_energy_)
    y_pred_energy_DG = reg_energy_DG.predict(X_test)
    mse_energy_DG = mean_squared_error(y_test["Energy DG:kcal/mol)"], y_pred_energy_DG)
    print(data_model[i])
    print("Energy_")
    print("MSE on validate set:", mse_val_energy_)
    print("MSE on test set:", mse_energy_)
    print("Energy_DG")
    print("MSE on validate set:", mse_val_energy_DG)
    print("MSE on test set:", mse_energy_DG)
    print("------------------------------")
```
%% Output
X without SMILES
Energy_
MSE on validate set: 0.22166087919948133
MSE on test set: 0.20934400797162736
Energy_DG
MSE on validate set: 0.22228984627021942
MSE on test set: 0.20993802568824027
------------------------------
X with only SMILES
Energy_
MSE on validate set: 0.0003477875723842499
MSE on test set: 0.00032023641292565125
Energy_DG
MSE on validate set: 0.0003339909998309863
MSE on test set: 0.00030783826426369177
------------------------------
X with SMILES
Energy_
MSE on validate set: 2.992105443881672e-06
MSE on test set: 2.6693795730154255e-06
Energy_DG
MSE on validate set: 3.0005956281824e-06
MSE on test set: 2.6769539956866846e-06
------------------------------
%% Cell type:markdown id: tags:
# SVR
%% Cell type:code id: tags:
``` python
import glob, os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import pickle
import optuna
```
%% Cell type:code id: tags:
``` python
from malikule.dataset import MoleculeSklearnDataset
```
%% Cell type:markdown id: tags:
### Preprocessing of data
%% Cell type:markdown id: tags:
The data are normalized and columns describing the composition of the molecules are added. For more information, see the initialization code of the MoleculeSklearnDataset class.
%% Cell type:code id: tags:
``` python
path = "../data/data.csv"
dm = MoleculeSklearnDataset(path)
```
%% Output
/home/moi/.local/lib/python3.8/site-packages/pandas/core/nanops.py:892: RuntimeWarning: overflow encountered in square
sqr = _ensure_numeric((avg - values) ** 2)
%% Cell type:markdown id: tags:
### Getting the different data sets
%% Cell type:markdown id: tags:
We create three data sets:
- one that does not contain the SMILES data of the molecules
- one that contains only the SMILES data of the molecules
- one that contains all the data
%% Cell type:code id: tags:
``` python
X_no_smiles = dm.getDataNoSmiles()
X_only_smiles = dm.getDataWithOnlySmiles()
X_with_smiles = dm.getDataWithSmiles()
Xs = [X_no_smiles, X_only_smiles, X_with_smiles]
data_model = ["X without SMILES", "X with only SMILES", "X with SMILES" ]
y = dm.getY()
```
%% Cell type:markdown id: tags:
### Cross-validation
%% Cell type:markdown id: tags:
We define a function that takes a data set and an integer k, splits the data into k folds, and performs cross-validation: on each fold, Optuna tunes the SVR hyperparameters (C, epsilon, gamma). The function returns the average validation MSE and the SVR model from the fold with the lowest MSE.
%% Cell type:code id: tags:
``` python
def objective(trial, X_train, X_test, y_train, y_test, k_index):
    # C and gamma must be strictly positive for SVR, so their search
    # ranges start slightly above zero.
    svr = SVR(
        kernel='rbf',
        C=trial.suggest_float("C", 1e-3, 100),
        epsilon=trial.suggest_float("epsilon", 0, 1),
        gamma=trial.suggest_float("gamma", 1e-3, 10),
    )
    svr = svr.fit(X_train, y_train)
    y_pred = svr.predict(X_test)
    # Persist every trained model so the best one can be reloaded later.
    with open("TMP_MALIKULE_{}_{}_{}.pickle".format(X_train.shape[1], trial.number, k_index), "wb") as fout:
        pickle.dump(svr, fout)
    return mean_squared_error(y_test, y_pred)

def cross_validation(X, y, k=5):
    n_trials = 500
    mse_val = 0
    best = np.inf
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    kf = KFold(n_splits=k, random_state=0, shuffle=True)
    for i, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        study = optuna.create_study()
        study.optimize(lambda trial: objective(trial, X_train, X_test, y_train, y_test, i), n_trials=n_trials)
        mse_val += study.best_trial.value
        # Keep the fold/trial with the lowest validation MSE
        # (comparing the fold MSE, not the running sum).
        if study.best_trial.value < best:
            best = study.best_trial.value
            best_i, best_trial_number = i, study.best_trial.number
    with open("TMP_MALIKULE_{}_{}_{}.pickle".format(X.shape[1], best_trial_number, best_i), "rb") as fin:
        best_svr = pickle.load(fin)
    # Clean up the temporary model files.
    for f in glob.glob("TMP_MALIKULE_*.pickle"):
        os.remove(f)
    return mse_val / k, best_svr
```
%% Cell type:markdown id: tags:
Then we apply the previous function to the different data sets and evaluate the selected SVR model on a held-out test set.
%% Cell type:code id: tags:
``` python
k = 2
for i, X in enumerate(Xs):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    mse_val_energy_, svr_energy_ = cross_validation(X_train, y_train["Energy_(kcal/mol)"], k)
    mse_val_energy_DG, svr_energy_DG = cross_validation(X_train, y_train["Energy DG:kcal/mol)"], k)
    y_pred_energy_ = svr_energy_.predict(X_test)
    mse_energy_ = mean_squared_error(y_test["Energy_(kcal/mol)"], y_pred_energy_)
    y_pred_energy_DG = svr_energy_DG.predict(X_test)
    mse_energy_DG = mean_squared_error(y_test["Energy DG:kcal/mol)"], y_pred_energy_DG)
    print(data_model[i])
    print("Energy_")
    print("MSE on validate set:", mse_val_energy_)
    print("MSE on test set:", mse_energy_)
    print("Energy_DG")
    print("MSE on validate set:", mse_val_energy_DG)
    print("MSE on test set:", mse_energy_DG)
    print("------------------------------")
```
%% Output
[I 2022-06-20 17:19:16,902] A new study created in memory with name: no-name-a44d7b6f-9801-4715-96c5-8d1def60cb0e
X without SMILES
Energy_
MSE on validate set: 0.3309146291714391
MSE on test set: 0.3066575846793091
Energy_DG
MSE on validate set: 0.32556979134078445
MSE on test set: 0.3093429133132752
------------------------------
X with only SMILES
Energy_
MSE on validate set: 0.0004153051447523143
MSE on test set: 0.00043307206180056136
Energy_DG
MSE on validate set: 0.00035963750253789877
MSE on test set: 0.0004470172626322179
------------------------------
X with SMILES
Energy_
MSE on validate set: 0.0006245028007995493
MSE on test set: 0.0010165411631082994
Energy_DG
MSE on validate set: 0.0015130469019798815
MSE on test set: 0.0006429917099876396
------------------------------
@@ -98,9 +98,30 @@ energy at room temperature (Energy\_DG)
\subsection{Pre-processing}
Looking at the dataset, we found that some rows had NaN values in their attributes.
To solve this problem, we decided to delete all rows containing NaN values.
Then, as requested, we kept for each molecule the sample with the lowest energy (Energy\_).
In addition, we tokenized the SMILES strings as explained above.
After that, in order to evaluate the importance of the attributes, we created three different datasets.
The first one contains no information about the composition of the molecule.
The second one contains only the information about the composition of the molecule.
And the last one contains all the available information.
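All remaining feature and target columns are also standardized to zero mean and unit variance, as done in the initialization code of the \texttt{MoleculeSklearnDataset} class:
\[ x' = \frac{x - \bar{x}}{\sigma_{x}} \]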
\section{Descriptions of the algorithms used}
\subsection{Baseline algorithm: Linear Regression}
Linear regression tries to find a linear relationship between the input variables and the target variable by fitting a linear equation.
If such a linear relationship exists, this equation allows us to estimate the desired variable.
The general linear equation is as follows: \[ Y = w^{T}X + b \]
In order to find the coefficients $w$ and the intercept $b$, we minimize the residual
sum of squares between the observed targets $y_{i}$ and the predictions $w^{T}x_{i} + b$: \[ \min_{w,\,b} \sum_{i=1}^{n} \left( y_{i} - w^{T}x_{i} - b \right)^{2} \]
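When the intercept is absorbed into $X$ by appending a constant column, this least-squares problem has the classical closed-form solution given by the normal equations (assuming $X^{T}X$ is invertible): \[ w = (X^{T}X)^{-1}X^{T}y \]
In practice, scikit-learn's \texttt{LinearRegression} solves the same minimization numerically with a least-squares solver.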
\subsection{Support Vector Regression}
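In its standard $\varepsilon$-insensitive formulation, Support Vector Regression (SVR) fits a function that is allowed to deviate from the targets by at most $\varepsilon$, while keeping the weight vector as flat as possible; deviations beyond $\varepsilon$ are absorbed by slack variables $\xi_{i}, \xi_{i}^{*}$ penalized by a factor $C$:
\[ \min_{w,\,b,\,\xi,\,\xi^{*}} \; \frac{1}{2}\lVert w \rVert^{2} + C \sum_{i=1}^{n} (\xi_{i} + \xi_{i}^{*}) \]
subject to $y_{i} - w^{T}x_{i} - b \le \varepsilon + \xi_{i}$, $w^{T}x_{i} + b - y_{i} \le \varepsilon + \xi_{i}^{*}$, and $\xi_{i}, \xi_{i}^{*} \ge 0$.
With the RBF kernel used in our experiments, inputs are compared through $k(x, x') = \exp(-\gamma \lVert x - x' \rVert^{2})$; the hyperparameters $C$, $\varepsilon$ and $\gamma$ are the ones tuned with Optuna.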
\subsection{Multi-Layered Perceptrons}
Multi-Layered Perceptrons (MLP) are the cornerstone of Deep Neural Networks.