From ac2beb55e06fbb46edc424bae76e7f83d4bb44dc Mon Sep 17 00:00:00 2001
From: gcroci2 <crocioni.giulia@gmail.com>
Date: Mon, 1 Jul 2024 11:29:03 +0200
Subject: [PATCH 1/6] clarify column output in the docs

---
 docs/getstarted.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/getstarted.md b/docs/getstarted.md
index 1f2eb561..6946e238 100644
--- a/docs/getstarted.md
+++ b/docs/getstarted.md
@@ -391,6 +391,8 @@ output_test = pd.read_hdf(os.path.join("<output_folder_path>", "output_exporter.
 
 The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results.
 
+For classification tasks, the `output` column contains the results from a [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html). This means each entry in the `output` column is a list with one element for each class. Each element represents the probability of that class occurring.
+
 Example for plotting training loss curves using [Plotly Express](https://plotly.com/python/plotly-express/):
 
 ```python

From a3de3f30b525643f4dbdd3eaa5aa06047d6cd3a6 Mon Sep 17 00:00:00 2001
From: gcroci2 <crocioni.giulia@gmail.com>
Date: Mon, 1 Jul 2024 11:29:15 +0200
Subject: [PATCH 2/6] clarify column output in the tutorial

---
 tutorials/training.ipynb | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/tutorials/training.ipynb b/tutorials/training.ipynb
index ae8870c4..11ca92f4 100644
--- a/tutorials/training.ipynb
+++ b/tutorials/training.ipynb
@@ -420,12 +420,8 @@
       "metadata": {},
       "outputs": [],
       "source": [
-        "output_train = pd.read_hdf(\n",
-        "    os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"training\"\n",
-        ")\n",
-        "output_test = pd.read_hdf(\n",
-        "    os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\"\n",
-        ")\n",
+        "output_train = pd.read_hdf(os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"training\")\n",
+        "output_test = pd.read_hdf(os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\")\n",
         "output_train.head()"
       ]
     },
@@ -436,7 +432,11 @@
       "source": [
         "The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results.\n",
         "\n",
-        "For example, the loss across the epochs can be plotted for the training and the validation sets:\n"
+        "For classification tasks, as in the current tutorial, the `output` column contains the results from a [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html). This means each entry in the `output` column is a list with one element for each class. Each element represents the probability of that class occurring.\n",
+        "\n",
+        "Here specifically, the `output` column contains a list with two elements, respectively representing the predicted probabilities that the data point is 0 (first element of the list, representing non-binder class) and 1 (second element of the list, representing binder class).\n",
+        "\n",
+        "The loss across the epochs can be plotted for the training and the validation sets:\n"
       ]
     },
     {
@@ -671,12 +671,8 @@
       "metadata": {},
       "outputs": [],
       "source": [
-        "output_train = pd.read_hdf(\n",
-        "    os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"training\"\n",
-        ")\n",
-        "output_test = pd.read_hdf(\n",
-        "    os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\"\n",
-        ")\n",
+        "output_train = pd.read_hdf(os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"training\")\n",
+        "output_test = pd.read_hdf(os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\")\n",
         "output_train.head()"
       ]
     },
@@ -767,7 +763,7 @@
       "name": "python",
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3",
-      "version": "3.10.13"
+      "version": "3.10.12"
     },
     "orig_nbformat": 4
   },

From ed561dcdf8d9d3c953aebb3d12b9edf954a550e4 Mon Sep 17 00:00:00 2001
From: Giulia Crocioni <55382553+gcroci2@users.noreply.github.com>
Date: Fri, 5 Jul 2024 14:11:35 +0200
Subject: [PATCH 3/6] Update docs/getstarted.md

Co-authored-by: Dani Bodor <d.bodor@esciencecenter.nl>
---
 docs/getstarted.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/getstarted.md b/docs/getstarted.md
index 6946e238..fe54d06e 100644
--- a/docs/getstarted.md
+++ b/docs/getstarted.md
@@ -391,7 +391,7 @@ output_test = pd.read_hdf(os.path.join("<output_folder_path>", "output_exporter.
 
 The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results.
 
-For classification tasks, the `output` column contains the results from a [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html). This means each entry in the `output` column is a list with one element for each class. Each element represents the probability of that class occurring.
+For classification tasks, the `output` column contains a list of probabilities that each class occurs. For more details, please see documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html)
 
 Example for plotting training loss curves using [Plotly Express](https://plotly.com/python/plotly-express/):
 

From 8f6c94036432f93900b2a2034e8d476956f2618a Mon Sep 17 00:00:00 2001
From: gcroci2 <crocioni.giulia@gmail.com>
Date: Fri, 5 Jul 2024 14:24:57 +0200
Subject: [PATCH 4/6] specify classes order

---
 docs/getstarted.md       | 2 +-
 tutorials/training.ipynb | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/docs/getstarted.md b/docs/getstarted.md
index fe54d06e..a60efce3 100644
--- a/docs/getstarted.md
+++ b/docs/getstarted.md
@@ -391,7 +391,7 @@ output_test = pd.read_hdf(os.path.join("<output_folder_path>", "output_exporter.
 
 The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results.
 
-For classification tasks, the `output` column contains a list of probabilities that each class occurs. For more details, please see documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html)
+For classification tasks, the `output` column contains a list of probabilities that each class occurs, and each list sums to 1 (for more details, please see documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html)). Note that the order of the classes in the list depends on the `classes` attribute of the DeepRankDataset instances. For classification tasks, if `classes` is not specified (as in this example case), it is defaulted to [0, 1].
 
 Example for plotting training loss curves using [Plotly Express](https://plotly.com/python/plotly-express/):
 
diff --git a/tutorials/training.ipynb b/tutorials/training.ipynb
index 11ca92f4..dd7c1bc3 100644
--- a/tutorials/training.ipynb
+++ b/tutorials/training.ipynb
@@ -432,9 +432,7 @@
       "source": [
         "The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results.\n",
         "\n",
-        "For classification tasks, as in the current tutorial, the `output` column contains the results from a [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html). This means each entry in the `output` column is a list with one element for each class. Each element represents the probability of that class occurring.\n",
-        "\n",
-        "Here specifically, the `output` column contains a list with two elements, respectively representing the predicted probabilities that the data point is 0 (first element of the list, representing non-binder class) and 1 (second element of the list, representing binder class).\n",
+        "For classification tasks, the `output` column contains a list of probabilities that each class occurs, and each list sums to 1 (for more details, please see documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html)). Note that the order of the classes in the list depends on the `classes` attribute of the DeepRankDataset instances. For classification tasks, if `classes` is not specified (as in this example case), it is defaulted to [0, 1].\n",
         "\n",
         "The loss across the epochs can be plotted for the training and the validation sets:\n"
       ]

From 7838cf9bfbea84b5f8b5791c5d0dd92e7829f923 Mon Sep 17 00:00:00 2001
From: gcroci2 <crocioni.giulia@gmail.com>
Date: Fri, 5 Jul 2024 15:02:07 +0200
Subject: [PATCH 5/6] fix class name

---
 docs/getstarted.md       | 2 +-
 tutorials/training.ipynb | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/getstarted.md b/docs/getstarted.md
index a60efce3..ed908a98 100644
--- a/docs/getstarted.md
+++ b/docs/getstarted.md
@@ -391,7 +391,7 @@ output_test = pd.read_hdf(os.path.join("<output_folder_path>", "output_exporter.
 
 The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results.
 
-For classification tasks, the `output` column contains a list of probabilities that each class occurs, and each list sums to 1 (for more details, please see documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html)). Note that the order of the classes in the list depends on the `classes` attribute of the DeepRankDataset instances. For classification tasks, if `classes` is not specified (as in this example case), it is defaulted to [0, 1].
+For classification tasks, the `output` column contains a list of probabilities that each class occurs, and each list sums to 1 (for more details, please see the documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html)). Note that the order of the classes in the list depends on the `classes` attribute of the DeeprankDataset instances. For classification tasks, if `classes` is not specified (as in this example case), it defaults to [0, 1].
 
 Example for plotting training loss curves using [Plotly Express](https://plotly.com/python/plotly-express/):
 
diff --git a/tutorials/training.ipynb b/tutorials/training.ipynb
index dd7c1bc3..401c3403 100644
--- a/tutorials/training.ipynb
+++ b/tutorials/training.ipynb
@@ -432,7 +432,7 @@
       "source": [
         "The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results.\n",
         "\n",
-        "For classification tasks, the `output` column contains a list of probabilities that each class occurs, and each list sums to 1 (for more details, please see documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html)). Note that the order of the classes in the list depends on the `classes` attribute of the DeepRankDataset instances. For classification tasks, if `classes` is not specified (as in this example case), it is defaulted to [0, 1].\n",
+        "For classification tasks, the `output` column contains a list of probabilities that each class occurs, and each list sums to 1 (for more details, please see the documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html)). Note that the order of the classes in the list depends on the `classes` attribute of the DeeprankDataset instances. For classification tasks, if `classes` is not specified (as in this example case), it defaults to [0, 1].\n",
         "\n",
         "The loss across the epochs can be plotted for the training and the validation sets:\n"
       ]

From c822e1b1aa8bba59056d0e06cd5764dee89b5ca7 Mon Sep 17 00:00:00 2001
From: gcroci2 <crocioni.giulia@gmail.com>
Date: Fri, 5 Jul 2024 15:02:21 +0200
Subject: [PATCH 6/6] add capri classes condition

---
 deeprank2/dataset.py | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/deeprank2/dataset.py b/deeprank2/dataset.py
index 4e3959d9..3718f301 100644
--- a/deeprank2/dataset.py
+++ b/deeprank2/dataset.py
@@ -151,32 +151,37 @@ def _check_hdf5_files(self) -> None:
             self.hdf5_paths.remove(hdf5_path)
 
     def _check_task_and_classes(self, task: str, classes: str | None = None) -> None:
-        if self.target in [targets.IRMSD, targets.LRMSD, targets.FNAT, targets.DOCKQ]:
-            self.task = targets.REGRESS
-
-        elif self.target in [targets.BINARY, targets.CAPRI]:
-            self.task = targets.CLASSIF
-
+        # Determine the task based on the target or use the provided task
+        if task is None:
+            target_to_task_map = {
+                targets.IRMSD: targets.REGRESS,
+                targets.LRMSD: targets.REGRESS,
+                targets.FNAT: targets.REGRESS,
+                targets.DOCKQ: targets.REGRESS,
+                targets.BINARY: targets.CLASSIF,
+                targets.CAPRI: targets.CLASSIF,
+            }
+            self.task = target_to_task_map.get(self.target)
         else:
             self.task = task
 
+        # Validate the task
         if self.task not in [targets.CLASSIF, targets.REGRESS] and self.target is not None:
             msg = f"User target detected: {self.target} -> The task argument must be 'classif' or 'regress', currently set as {self.task}"
             raise ValueError(msg)
 
-        if task != self.task and task is not None:
+        # Warn if the user-set task does not match the determined task
+        if task and task != self.task:
             warnings.warn(
-                f"Target {self.target} expects {self.task}, but was set to task {task} by user.\nUser set task is ignored and {self.task} will be used.",
+                f"Target {self.target} expects {self.task}, but was set to task {task} by user. User set task is ignored and {self.task} will be used.",
             )
 
+        # Handle classification task
         if self.task == targets.CLASSIF:
             if classes is None:
-                self.classes = [0, 1]
-                _log.info(f"Target classes set to: {self.classes}")
-            else:
-                self.classes = classes
-
+                self.classes = [0, 1, 2, 3, 4, 5] if self.target == targets.CAPRI else [0, 1]
             self.classes_to_index = {class_: index for index, class_ in enumerate(self.classes)}
+            _log.info(f"Target classes set to: {self.classes}")
         else:
             self.classes = None
             self.classes_to_index = None