From ac2beb55e06fbb46edc424bae76e7f83d4bb44dc Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Mon, 1 Jul 2024 11:29:03 +0200 Subject: [PATCH 1/6] clarify column output in the docs --- docs/getstarted.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/getstarted.md b/docs/getstarted.md index 1f2eb561..6946e238 100644 --- a/docs/getstarted.md +++ b/docs/getstarted.md @@ -391,6 +391,8 @@ output_test = pd.read_hdf(os.path.join("", "output_exporter. The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results. +For classification tasks, the `output` column contains the results from a [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html). This means each entry in the `output` column is a list with one element for each class. Each element represents the probability of that class occurring. + Example for plotting training loss curves using [Plotly Express](https://plotly.com/python/plotly-express/): ```python From a3de3f30b525643f4dbdd3eaa5aa06047d6cd3a6 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Mon, 1 Jul 2024 11:29:15 +0200 Subject: [PATCH 2/6] clarify column output in the tutorial --- tutorials/training.ipynb | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/tutorials/training.ipynb b/tutorials/training.ipynb index ae8870c4..11ca92f4 100644 --- a/tutorials/training.ipynb +++ b/tutorials/training.ipynb @@ -420,12 +420,8 @@ "metadata": {}, "outputs": [], "source": [ - "output_train = pd.read_hdf(\n", - " os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"training\"\n", - ")\n", - "output_test = pd.read_hdf(\n", - " os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\"\n", - ")\n", + "output_train = pd.read_hdf(os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"training\")\n", + "output_test = 
pd.read_hdf(os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\")\n", "output_train.head()" ] }, @@ -436,7 +432,11 @@ "source": [ "The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results.\n", "\n", - "For example, the loss across the epochs can be plotted for the training and the validation sets:\n" + "For classification tasks, as in the current tutorial, the `output` column contains the results from a [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html). This means each entry in the `output` column is a list with one element for each class. Each element represents the probability of that class occurring.\n", + "\n", + "Here specifically, the `output` column contains a list with two elements, respectively representing the predicted probabilities that the data point is 0 (first element of the list, representing non-binder class) and 1 (second element of the list, representing binder class).\n", + "\n", + "The loss across the epochs can be plotted for the training and the validation sets:\n" ] }, { @@ -671,12 +671,8 @@ "metadata": {}, "outputs": [], "source": [ - "output_train = pd.read_hdf(\n", - " os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"training\"\n", - ")\n", - "output_test = pd.read_hdf(\n", - " os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\"\n", - ")\n", + "output_train = pd.read_hdf(os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"training\")\n", + "output_test = pd.read_hdf(os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\")\n", "output_train.head()" ] }, @@ -767,7 +763,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.12" }, "orig_nbformat": 4 }, From ed561dcdf8d9d3c953aebb3d12b9edf954a550e4 Mon 
Sep 17 00:00:00 2001 From: Giulia Crocioni <55382553+gcroci2@users.noreply.github.com> Date: Fri, 5 Jul 2024 14:11:35 +0200 Subject: [PATCH 3/6] Update docs/getstarted.md Co-authored-by: Dani Bodor --- docs/getstarted.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/getstarted.md b/docs/getstarted.md index 6946e238..fe54d06e 100644 --- a/docs/getstarted.md +++ b/docs/getstarted.md @@ -391,7 +391,7 @@ output_test = pd.read_hdf(os.path.join("", "output_exporter. The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results. -For classification tasks, the `output` column contains the results from a [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html). This means each entry in the `output` column is a list with one element for each class. Each element represents the probability of that class occurring. +For classification tasks, the `output` column contains a list of probabilities that each class occurs. For more details, please see documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html) Example for plotting training loss curves using [Plotly Express](https://plotly.com/python/plotly-express/): From 8f6c94036432f93900b2a2034e8d476956f2618a Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Fri, 5 Jul 2024 14:24:57 +0200 Subject: [PATCH 4/6] specify classes order --- docs/getstarted.md | 2 +- tutorials/training.ipynb | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/getstarted.md b/docs/getstarted.md index fe54d06e..a60efce3 100644 --- a/docs/getstarted.md +++ b/docs/getstarted.md @@ -391,7 +391,7 @@ output_test = pd.read_hdf(os.path.join("", "output_exporter. The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results. 
-For classification tasks, the `output` column contains a list of probabilities that each class occurs. For more details, please see documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html) +For classification tasks, the `output` column contains a list of probabilities that each class occurs, and each list sums to 1 (for more details, please see documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html)). Note that the order of the classes in the list depends on the `classes` attribute of the DeepRankDataset instances. For classification tasks, if `classes` is not specified (as in this example case), it is defaulted to [0, 1]. Example for plotting training loss curves using [Plotly Express](https://plotly.com/python/plotly-express/): diff --git a/tutorials/training.ipynb b/tutorials/training.ipynb index 11ca92f4..dd7c1bc3 100644 --- a/tutorials/training.ipynb +++ b/tutorials/training.ipynb @@ -432,9 +432,7 @@ "source": [ "The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results.\n", "\n", - "For classification tasks, as in the current tutorial, the `output` column contains the results from a [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html). This means each entry in the `output` column is a list with one element for each class. 
Each element represents the probability of that class occurring.\n", - "\n", - "Here specifically, the `output` column contains a list with two elements, respectively representing the predicted probabilities that the data point is 0 (first element of the list, representing non-binder class) and 1 (second element of the list, representing binder class).\n", + "For classification tasks, the `output` column contains a list of probabilities that each class occurs, and each list sums to 1 (for more details, please see documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html)). Note that the order of the classes in the list depends on the `classes` attribute of the DeepRankDataset instances. For classification tasks, if `classes` is not specified (as in this example case), it is defaulted to [0, 1].\n", "\n", "The loss across the epochs can be plotted for the training and the validation sets:\n" ] From 7838cf9bfbea84b5f8b5791c5d0dd92e7829f923 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Fri, 5 Jul 2024 15:02:07 +0200 Subject: [PATCH 5/6] fix class name --- docs/getstarted.md | 2 +- tutorials/training.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/getstarted.md b/docs/getstarted.md index a60efce3..ed908a98 100644 --- a/docs/getstarted.md +++ b/docs/getstarted.md @@ -391,7 +391,7 @@ output_test = pd.read_hdf(os.path.join("", "output_exporter. The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results. -For classification tasks, the `output` column contains a list of probabilities that each class occurs, and each list sums to 1 (for more details, please see documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html)). Note that the order of the classes in the list depends on the `classes` attribute of the DeepRankDataset instances. 
For classification tasks, if `classes` is not specified (as in this example case), it is defaulted to [0, 1]. +For classification tasks, the `output` column contains a list of probabilities that each class occurs, and each list sums to 1 (for more details, please see documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html)). Note that the order of the classes in the list depends on the `classes` attribute of the DeeprankDataset instances. For classification tasks, if `classes` is not specified (as in this example case), it defaults to [0, 1]. Example for plotting training loss curves using [Plotly Express](https://plotly.com/python/plotly-express/): diff --git a/tutorials/training.ipynb b/tutorials/training.ipynb index dd7c1bc3..401c3403 100644 --- a/tutorials/training.ipynb +++ b/tutorials/training.ipynb @@ -432,7 +432,7 @@ "source": [ "The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results.\n", "\n", - "For classification tasks, the `output` column contains a list of probabilities that each class occurs, and each list sums to 1 (for more details, please see documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html)). Note that the order of the classes in the list depends on the `classes` attribute of the DeepRankDataset instances. For classification tasks, if `classes` is not specified (as in this example case), it is defaulted to [0, 1].\n", + "For classification tasks, the `output` column contains a list of probabilities that each class occurs, and each list sums to 1 (for more details, please see documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html)). Note that the order of the classes in the list depends on the `classes` attribute of the DeeprankDataset instances. 
For classification tasks, if `classes` is not specified (as in this example case), it defaults to [0, 1].\n", "\n", "The loss across the epochs can be plotted for the training and the validation sets:\n" ] From c822e1b1aa8bba59056d0e06cd5764dee89b5ca7 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Fri, 5 Jul 2024 15:02:21 +0200 Subject: [PATCH 6/6] add capri classes condition --- deeprank2/dataset.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/deeprank2/dataset.py b/deeprank2/dataset.py index 4e3959d9..3718f301 100644 --- a/deeprank2/dataset.py +++ b/deeprank2/dataset.py @@ -151,32 +151,37 @@ def _check_hdf5_files(self) -> None: self.hdf5_paths.remove(hdf5_path) def _check_task_and_classes(self, task: str, classes: str | None = None) -> None: - if self.target in [targets.IRMSD, targets.LRMSD, targets.FNAT, targets.DOCKQ]: - self.task = targets.REGRESS - - elif self.target in [targets.BINARY, targets.CAPRI]: - self.task = targets.CLASSIF - + # Determine the task based on the target or use the provided task + if task is None: + target_to_task_map = { + targets.IRMSD: targets.REGRESS, + targets.LRMSD: targets.REGRESS, + targets.FNAT: targets.REGRESS, + targets.DOCKQ: targets.REGRESS, + targets.BINARY: targets.CLASSIF, + targets.CAPRI: targets.CLASSIF, + } + self.task = target_to_task_map.get(self.target) else: self.task = task + # Validate the task if self.task not in [targets.CLASSIF, targets.REGRESS] and self.target is not None: msg = f"User target detected: {self.target} -> The task argument must be 'classif' or 'regress', currently set as {self.task}" raise ValueError(msg) - if task != self.task and task is not None: + # Warn if the user-set task does not match the determined task + if task and task != self.task: warnings.warn( - f"Target {self.target} expects {self.task}, but was set to task {task} by user.\nUser set task is ignored and {self.task} will be used.", + f"Target {self.target} expects 
{self.task}, but was set to task {task} by user. User set task is ignored and {self.task} will be used.", ) + # Handle classification task if self.task == targets.CLASSIF: if classes is None: - self.classes = [0, 1] - _log.info(f"Target classes set to: {self.classes}") - else: - self.classes = classes - + self.classes = [0, 1, 2, 3, 4, 5] if self.target == targets.CAPRI else [0, 1] self.classes_to_index = {class_: index for index, class_ in enumerate(self.classes)} + _log.info(f"Target classes set to: {self.classes}") else: self.classes = None self.classes_to_index = None