The data are normalized and columns describing the composition of the molecules are added. For more information, see the initialization code of the MoleculeSklearnDataset class.
%% Cell type:code id: tags:
``` python
path="../data/data.csv"
dm=MoleculeSklearnDataset(path)
```
%% Output
/home/moi/.local/lib/python3.8/site-packages/pandas/core/nanops.py:892: RuntimeWarning: overflow encountered in square
sqr = _ensure_numeric((avg - values) ** 2)
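%% Cell type:markdown id: tags:
As a purely hypothetical illustration (the actual initialization code of MoleculeSklearnDataset may differ), composition columns could be derived from the SMILES strings with RDKit along these lines:
%% Cell type:code id: tags:
``` python
# Hypothetical sketch: count the atoms of each element in a SMILES string.
# The real MoleculeSklearnDataset implementation may proceed differently.
from collections import Counter

from rdkit import Chem  # assumes RDKit is installed

def atom_composition(smiles):
    """Return a dict mapping element symbol -> number of atoms."""
    mol = Chem.AddHs(Chem.MolFromSmiles(smiles))  # make hydrogens explicit
    return dict(Counter(atom.GetSymbol() for atom in mol.GetAtoms()))

atom_composition("CCO")  # ethanol -> {'C': 2, 'O': 1, 'H': 6}
```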
%% Cell type:markdown id: tags:
### Getting the different data sets
%% Cell type:markdown id: tags:
We create three data sets:

- one that does not contain the SMILES data of the molecules
- one that contains only the SMILES data of the molecules
- one that contains all the data
%% Cell type:code id: tags:
``` python
X_no_smiles = dm.getDataNoSmiles()
X_only_smiles = dm.getDataWithOnlySmiles()
X_with_smiles = dm.getDataWithSmiles()
Xs = [X_no_smiles, X_only_smiles, X_with_smiles]
data_model = ["X without SMILES", "X with only SMILES", "X with SMILES"]
y = dm.getY()
```
%% Cell type:markdown id: tags:
### Cross-validation
%% Cell type:markdown id: tags:
We create a function that takes a data set and an integer k as input and splits the set into k folds. It then performs cross-validation on these folds and returns the linear regression model with the best (lowest) MSE.
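A minimal sketch of what such a function could look like is shown below; it assumes scikit-learn's KFold, LinearRegression and mean_squared_error, and the function name and signature are illustrative rather than the project's actual implementation.
%% Cell type:code id: tags:
``` python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

def cross_validate_linear_regression(X, y, k):
    """Split the data into k folds, fit one LinearRegression per fold
    and return the fitted model with the lowest validation MSE."""
    X, y = np.asarray(X), np.asarray(y)
    best_model, best_mse = None, float("inf")
    for train_idx, val_idx in KFold(n_splits=k, shuffle=True, random_state=0).split(X):
        model = LinearRegression().fit(X[train_idx], y[train_idx])
        mse = mean_squared_error(y[val_idx], model.predict(X[val_idx]))
        if mse < best_mse:
            best_model, best_mse = model, mse
    return best_model, best_mse
```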
Linear regression tries to find a linear relationship between two variables by fitting a linear equation.
If such a relationship exists, the equation allows us to estimate the desired variable from the other.
The general linear equation is as follows: \[ Y = w^{T}X + b \]
In order to find the coefficients $w$ and the intercept $b$, we must minimize the residual
sum of squares between $Y$ and $w^{T}X$ as follows: \[\min_{w}\sum_{i=1}^{n}(y_{i}- w^{T}x_{i})^{2}\]
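Setting the gradient of this sum with respect to $w$ to zero (with the intercept $b$ absorbed into $w$ by appending a constant column to $X$) yields the closed-form solution \[ \hat{w} = (X^{T}X)^{-1}X^{T}y \] provided that $X^{T}X$ is invertible.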
This approach to linear regression is called Ordinary Least Squares (OLS).
For this model, we used the Sklearn library, which provides an implementation of linear regression through the LinearRegression class.
We also used Sklearn's KFold class to perform the cross-validation: it splits the data set into several subsets (folds) on which the cross-validation is run.
Before that, we split the data set into two parts so as to keep an unused test set on which to evaluate the best model found during the cross-validation.
\subsection{Support Vector Regression}
Support Vector Regression (SVR) is an adaptation of the Support Vector Machine (SVM) classification model to regression.
In order to understand the SVR model, one must first understand the SVM model.
SVM is a model for binary classification: it looks for the hyperplane that best separates the data, i.e. the separating hyperplane with the largest margin, while its constraints ensure that every data point is classified correctly.
It can be shown that maximizing the size of the margin is equivalent to minimizing the norm of the hyperplane's normal vector.
More formally, SVM consists in solving the following minimization problem:
$$\min_{w}\frac{1}{2}\|w\|^{2}$$
with the constraints:
$$ y_{i}(\vec{w}\cdot\vec{x_{i}}- b)\geq1$$
where:\\
$y_{i}\in\{-1,1\}$ is the binary label of the $i^{th}$ data point\\
$\vec{w}$ is the normal vector of the hyperplane\\
$\vec{x_{i}}$ is the $i^{th}$ data point\\
$b$ is the intercept\\
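Indeed, the distance between the two margin hyperplanes $\vec{w}\cdot\vec{x}-b=1$ and $\vec{w}\cdot\vec{x}-b=-1$ is $\frac{2}{\|w\|}$, which is why maximizing the margin amounts to minimizing $\|w\|$.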
This problem can be expressed using Lagrange multipliers $\lambda_{i}$ as follows:
$$ L( w, b, \lambda)=\frac{1}{2} w^{T} w -\sum_{i=1}^{N}\lambda_{i}(y_{i}(w^{T}x_{i}-b)-1)$$
By setting the derivatives of $L$ with respect to $w$ and $b$ to zero and substituting the results back, this problem can be expressed in its dual form:
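$$ \max_{\lambda}\sum_{i=1}^{N}\lambda_{i}-\frac{1}{2}\sum_{i=1}^{N}\sum_{j=1}^{N}\lambda_{i}\lambda_{j}y_{i}y_{j}\,x_{i}^{T}x_{j} $$
with the constraints $\lambda_{i}\geq0$ and $\sum_{i=1}^{N}\lambda_{i}y_{i}=0$. In this dual form, the data only appear through the dot products $x_{i}^{T}x_{j}$.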
This dual form allows us to introduce the kernel trick. In its original form, SVM can only classify linearly separable data. However, thanks to the so-called kernel trick,
the model can be adapted to data that are not linearly separable: the data are implicitly mapped into a higher-dimensional space
in which they become linearly separable.
Mathematically, the kernel trick makes it possible to compute the dot product between two transformed vectors without ever
transforming the vectors explicitly, by replacing each dot product $x_{i}^{T}x_{j}$ with a kernel function $K(x_{i},x_{j})$. This turns the previous dual problem into the following:
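$$ \max_{\lambda}\sum_{i=1}^{N}\lambda_{i}-\frac{1}{2}\sum_{i=1}^{N}\sum_{j=1}^{N}\lambda_{i}\lambda_{j}y_{i}y_{j}\,K(x_{i},x_{j}) $$
with the same constraints $\lambda_{i}\geq0$ and $\sum_{i=1}^{N}\lambda_{i}y_{i}=0$. A common choice is the RBF kernel $K(x_{i},x_{j})=\exp(-\gamma\|x_{i}-x_{j}\|^{2})$, whose implicit feature space is infinite-dimensional.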
Unlike SVM, SVR seeks to predict the continuous values of a dependent variable. Moreover, for SVR the size of the margin is a hyperparameter:
the model performs the regression by trying to minimize the number of data points falling outside the margin as well as their distance to it.
Another hyperparameter weights the importance of the data points falling outside the margin.
The objective function of SVR is as follows: $$\min_{w}\frac{1}{2}\|w\|^{2}+ C \sum_{i=1}^{n}|\xi_{i}| $$
And the constraints of the problem are: $$ |y_{i}- w^{T}x_{i}| \leq\epsilon+ |\xi_{i}| $$
where:\\
$y_{i}$ is the continuous value that we are trying to predict\\
$\vec{w}$ is the normal vector of the hyperplane\\
$\vec{x_{i}}$ is the $i^{th}$ data point\\
$\epsilon$ is the size of the margin\\
$\xi_{i}$ is the distance of the $i^{th}$ data point outside the margin\\
$C$ is the weight given to the distances of the data points falling outside the margin
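Assuming, for illustration, that scikit-learn's SVR implementation is used, the two hyperparameters above correspond to its \texttt{epsilon} and \texttt{C} arguments, as in the following minimal sketch (\texttt{X\_train}, \texttt{y\_train}, \texttt{X\_test} and \texttt{y\_test} are placeholder names):
\begin{verbatim}
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR

# epsilon: width of the margin (tube); C: weight of points outside it
svr = SVR(kernel="rbf", C=1.0, epsilon=0.1)
svr.fit(X_train, y_train)  # placeholder training data
test_mse = mean_squared_error(y_test, svr.predict(X_test))
\end{verbatim}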