From b2f66c165e526ccc68950c429551856a1b38c158 Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Fri, 5 Jan 2024 15:58:11 +0100 Subject: [PATCH 01/27] Commented-out input restrictions; --- .../org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index 43c471b..71109c3 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -182,7 +182,7 @@ public List find(IAtomContainer container, boolean clone){ bondMap = EdgeToBondMap.withSpaceFor(mol); adjList = GraphUtil.toAdjList(mol, bondMap); - checkConstraints(mol); + //checkConstraints(mol); // atom marking markAtoms(mol); @@ -777,7 +777,7 @@ private boolean checkConstraints(IAtomContainer molecule) { ConnectedComponents cc = new ConnectedComponents(adjList); if(cc.nComponents() != 1) { - throw new IllegalArgumentException("Input molecule must consist of only a single connected stucture."); + throw new IllegalArgumentException("Input molecule must consist of only a single connected structure."); } return true; From b9c3dab0970450a48372fe521b5abc065e0bec9d Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Fri, 5 Jan 2024 17:48:03 +0100 Subject: [PATCH 02/27] Moved test classes into the same folder as the tested classes; moved test resources into the respective package structure; added tests to EFGFTest for cases that formerly represented illegal inputs; --- ...lFunctionalGroupsFinderEvaluationTest.java | 4 +- .../ErtlFunctionalGroupsFinderTest.java | 159 +++++++++++++++++- ...ErtlFunctionalGroupsFinderUtilityTest.java | 4 +- .../cdk/tools}/ChEBI_lite_3star_subset.sdf | 0 
.../tools}/ChEBI_lite_3star_subset_readme.md | 0 5 files changed, 159 insertions(+), 8 deletions(-) rename src/test/java/org/openscience/cdk/tools/{test => }/ErtlFunctionalGroupsFinderEvaluationTest.java (99%) rename src/test/java/org/openscience/cdk/tools/{test => }/ErtlFunctionalGroupsFinderTest.java (70%) rename src/test/java/org/openscience/cdk/tools/{test => }/ErtlFunctionalGroupsFinderUtilityTest.java (98%) rename src/test/resources/{ => org/openscience/cdk/tools}/ChEBI_lite_3star_subset.sdf (100%) rename src/test/resources/{ => org/openscience/cdk/tools}/ChEBI_lite_3star_subset_readme.md (100%) diff --git a/src/test/java/org/openscience/cdk/tools/test/ErtlFunctionalGroupsFinderEvaluationTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java similarity index 99% rename from src/test/java/org/openscience/cdk/tools/test/ErtlFunctionalGroupsFinderEvaluationTest.java rename to src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java index c108c32..4106932 100644 --- a/src/test/java/org/openscience/cdk/tools/test/ErtlFunctionalGroupsFinderEvaluationTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java @@ -18,7 +18,7 @@ * along with this program. If not, see . 
*/ -package org.openscience.cdk.tools.test; +package org.openscience.cdk.tools; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assumptions; @@ -47,8 +47,6 @@ import org.openscience.cdk.smiles.SmiFlavor; import org.openscience.cdk.smiles.SmilesGenerator; import org.openscience.cdk.smiles.SmilesParser; -import org.openscience.cdk.tools.CDKHydrogenAdder; -import org.openscience.cdk.tools.ErtlFunctionalGroupsFinder; import org.openscience.cdk.tools.ErtlFunctionalGroupsFinder.Mode; import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; import org.openscience.cdk.tools.manipulator.AtomTypeManipulator; diff --git a/src/test/java/org/openscience/cdk/tools/test/ErtlFunctionalGroupsFinderTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java similarity index 70% rename from src/test/java/org/openscience/cdk/tools/test/ErtlFunctionalGroupsFinderTest.java rename to src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java index 28b2d9b..e6f815a 100644 --- a/src/test/java/org/openscience/cdk/tools/test/ErtlFunctionalGroupsFinderTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java @@ -18,7 +18,7 @@ * along with this program. If not, see . 
*/ -package org.openscience.cdk.tools.test; +package org.openscience.cdk.tools; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -34,6 +34,7 @@ import org.openscience.cdk.interfaces.IBond.Order; import org.openscience.cdk.interfaces.IChemObjectBuilder; import org.openscience.cdk.interfaces.IPseudoAtom; +import org.openscience.cdk.io.iterator.IteratingSDFReader; import org.openscience.cdk.isomorphism.Mappings; import org.openscience.cdk.isomorphism.Pattern; import org.openscience.cdk.isomorphism.VentoFoggia; @@ -41,12 +42,12 @@ import org.openscience.cdk.smiles.SmiFlavor; import org.openscience.cdk.smiles.SmilesGenerator; import org.openscience.cdk.smiles.SmilesParser; -import org.openscience.cdk.tools.ErtlFunctionalGroupsFinder; import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Objects; /** * Test for ErtlFunctionalGroupsFinder. @@ -232,6 +233,160 @@ public void gitHubWikiTest() throws Exception { } } + /** + * TODO: Investigate code for possible problems with charged atoms? + * + * TODO: Test carbon ions. 
+ * + * @throws Exception + */ + @Test + public void testChargedMolecules() throws Exception { + SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); + SmilesGenerator tmpSmiGen = new SmilesGenerator(SmiFlavor.Canonical | SmiFlavor.UseAromaticSymbols); + Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet()); + + IAtomContainer tmpChargedASA = tmpSmiPar.parseSmiles("CC(=O)OC1=CC=CC=C1C(=O)[O+]"); + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpChargedASA); + tmpAromaticity.apply(tmpChargedASA); + + ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.DEFAULT); + List tmpFGList = tmpEFGF.find(tmpChargedASA); + + System.out.println("Charged ASA:"); + for (IAtomContainer tmpFG : tmpFGList) { + System.out.println(tmpSmiGen.create(tmpFG)); + } + + IAtomContainer tmpNitroPhenol = tmpSmiPar.parseSmiles("C1=CC(=CC=C1[N+](=O)[O-])O"); + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpNitroPhenol); + tmpAromaticity.apply(tmpNitroPhenol); + + tmpFGList = tmpEFGF.find(tmpNitroPhenol); + + System.out.println("Nitrophenol:"); + for (IAtomContainer tmpFG : tmpFGList) { + System.out.println(tmpSmiGen.create(tmpFG)); + } + } + + /** + * TODO: Investigate code for possible problems with disconnected structures? 
+ * + * @throws Exception + */ + @Test + public void testDisconnectedMolecules() throws Exception { + SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); + SmilesGenerator tmpSmiGen = new SmilesGenerator(SmiFlavor.Canonical | SmiFlavor.UseAromaticSymbols); + Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet()); + + IAtomContainer tmpChlorhexidineDiacetate = tmpSmiPar.parseSmiles("CC(=O)O.CC(=O)O.C1=CC(=CC=C1NC(=NC(=NCCCCCCN=C(N)N=C(N)NC2=CC=C(C=C2)Cl)N)N)Cl"); + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpChlorhexidineDiacetate); + tmpAromaticity.apply(tmpChlorhexidineDiacetate); + + ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.DEFAULT); + List tmpFGList = tmpEFGF.find(tmpChlorhexidineDiacetate); + + System.out.println("Chlorhexidine Diacetate:"); + for (IAtomContainer tmpFG : tmpFGList) { + System.out.println(tmpSmiGen.create(tmpFG)); + } + + IAtomContainer tmpSodiumEdetate = tmpSmiPar.parseSmiles("C(CN(CC(=O)[O-])CC(=O)[O-])N(CC(=O)[O-])CC(=O)[O-].[Na+].[Na+].[Na+].[Na+]"); + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpSodiumEdetate); + tmpAromaticity.apply(tmpSodiumEdetate); + + tmpFGList = tmpEFGF.find(tmpSodiumEdetate); + + System.out.println("Sodium edetate:"); + for (IAtomContainer tmpFG : tmpFGList) { + System.out.println(tmpSmiGen.create(tmpFG)); + } + } + + /** + * + * + * Note: all atoms are marked as hetero atoms by EFGF that are not H or C. So, metals and metalloids get treated like + * any other hetero atom and should not cause problems. 
+ * + * @throws Exception + */ + @Test + public void testMetalsMetalloids() throws Exception { + SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); + SmilesGenerator tmpSmiGen = new SmilesGenerator(SmiFlavor.Canonical | SmiFlavor.UseAromaticSymbols); + Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet()); + + IAtomContainer tmpTetraethylOrthosilicate = tmpSmiPar.parseSmiles("CCO[Si](OCC)(OCC)OCC"); + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpTetraethylOrthosilicate); + tmpAromaticity.apply(tmpTetraethylOrthosilicate); + + ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.DEFAULT); + List tmpFGList = tmpEFGF.find(tmpTetraethylOrthosilicate); + + System.out.println("Tetraethyl Orthosilicate:"); + for (IAtomContainer tmpFG : tmpFGList) { + System.out.println(tmpSmiGen.create(tmpFG)); + } + + IAtomContainer tmpKaolin = tmpSmiPar.parseSmiles("O.O.O=[Al]O[Si](=O)O[Si](=O)O[Al]=O"); + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpKaolin); + tmpAromaticity.apply(tmpKaolin); + + tmpFGList = tmpEFGF.find(tmpKaolin); + + System.out.println("Kaolin:"); + for (IAtomContainer tmpFG : tmpFGList) { + System.out.println(tmpSmiGen.create(tmpFG)); + } + + } + + //TODO: Clean-up check constraints and add test molecules for these special cases to the testFind#() methods. + + /** + * TODO: test complete ChEBI? + * + * Note: ChEBI lite 3-star subset SDF contains 251 molecules with charges or metal/metalloid atoms or more than one + * disconnected structure (comment-in checkConstraints in EFGF.find() method to check). 
+ * + * @throws Exception + */ + @Test + public void readChebiLite3StarSubset() throws Exception { + IteratingSDFReader tmpChebiSDFReader = new IteratingSDFReader( + ErtlFunctionalGroupsFinderTest.class.getResourceAsStream("ChEBI_lite_3star_subset.sdf"), + SilentChemObjectBuilder.getInstance(), + false); + Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet()); + ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.DEFAULT); + int tmpMoleculeCouter = 0; + int tmpExceptionsCounter = 0; + while (tmpChebiSDFReader.hasNext()) { + IAtomContainer tmpMolecule = null; + tmpMoleculeCouter++; + try { + tmpMolecule = tmpChebiSDFReader.next(); + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpMolecule); + tmpAromaticity.apply(tmpMolecule); + + List tmpFGList = tmpEFGF.find(tmpMolecule); + } catch (Exception anException) { + tmpExceptionsCounter++; + if (!Objects.isNull(tmpMolecule)) { + System.out.println(tmpMolecule.getProperty("ChEBI ID") + "," + anException.toString() + "," + tmpMoleculeCouter); + } else { + System.out.println("Could not parse molecule! 
Counter: " + tmpMoleculeCouter); + } + } + + } + System.out.println(tmpMoleculeCouter); + System.out.println(tmpExceptionsCounter); + } + private void testFind(String moleculeSmiles, String[] fGStrings) throws Exception { testFind(moleculeSmiles, fGStrings, new Aromaticity(ElectronDonation.daylight(), Cycles.all())); } diff --git a/src/test/java/org/openscience/cdk/tools/test/ErtlFunctionalGroupsFinderUtilityTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java similarity index 98% rename from src/test/java/org/openscience/cdk/tools/test/ErtlFunctionalGroupsFinderUtilityTest.java rename to src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java index 9611ffb..37dd154 100644 --- a/src/test/java/org/openscience/cdk/tools/test/ErtlFunctionalGroupsFinderUtilityTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java @@ -18,7 +18,7 @@ * along with this program. If not, see . */ -package org.openscience.cdk.tools.test; +package org.openscience.cdk.tools; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -30,8 +30,6 @@ import org.openscience.cdk.smiles.SmiFlavor; import org.openscience.cdk.smiles.SmilesGenerator; import org.openscience.cdk.smiles.SmilesParser; -import org.openscience.cdk.tools.ErtlFunctionalGroupsFinder; -import org.openscience.cdk.tools.ErtlFunctionalGroupsFinderUtility; import java.util.ArrayList; import java.util.HashMap; diff --git a/src/test/resources/ChEBI_lite_3star_subset.sdf b/src/test/resources/org/openscience/cdk/tools/ChEBI_lite_3star_subset.sdf similarity index 100% rename from src/test/resources/ChEBI_lite_3star_subset.sdf rename to src/test/resources/org/openscience/cdk/tools/ChEBI_lite_3star_subset.sdf diff --git a/src/test/resources/ChEBI_lite_3star_subset_readme.md b/src/test/resources/org/openscience/cdk/tools/ChEBI_lite_3star_subset_readme.md similarity index 100% rename from 
src/test/resources/ChEBI_lite_3star_subset_readme.md rename to src/test/resources/org/openscience/cdk/tools/ChEBI_lite_3star_subset_readme.md From 82d8d0dc8aff97519fc60d946f6b4bd17fa1de20 Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Mon, 8 Jan 2024 14:43:49 +0100 Subject: [PATCH 03/27] Updated copyright year --- License-header/License-header.txt | 2 +- build.gradle | 2 +- .../org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java | 2 +- .../cdk/tools/ErtlFunctionalGroupsFinderUtility.java | 2 +- .../app/ErtlFunctionalGroupsFinderPerformanceSnapshotApp.java | 2 +- .../cdk/tools/efgf/app/ExtractFunctionalGroupsTask.java | 2 +- src/main/java/org/openscience/cdk/tools/efgf/app/Main.java | 2 +- .../cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java | 2 +- .../openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java | 2 +- .../cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/License-header/License-header.txt b/License-header/License-header.txt index d1b5969..9b2deb0 100644 --- a/License-header/License-header.txt +++ b/License-header/License-header.txt @@ -1,6 +1,6 @@ /* * ErtlFunctionalGroupsFinder for CDK - * Copyright (c) $today.year Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + * Copyright (c) 2024 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny * * Source code is available at * diff --git a/build.gradle b/build.gradle index 547e5c8..14f425e 100644 --- a/build.gradle +++ b/build.gradle @@ -1,6 +1,6 @@ /* * ErtlFunctionalGroupsFinder for CDK - * Copyright (C) 2023 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + * Copyright (C) 2024 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny * * Source code is available at * diff --git 
a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index 71109c3..b88c5e1 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -1,6 +1,6 @@ /* * ErtlFunctionalGroupsFinder for CDK - * Copyright (c) 2023 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + * Copyright (c) 2024 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny * * Source code is available at * diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java index 9ec9468..cd80434 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java @@ -1,6 +1,6 @@ /* * ErtlFunctionalGroupsFinder for CDK - * Copyright (c) 2023 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + * Copyright (c) 2024 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny * * Source code is available at * diff --git a/src/main/java/org/openscience/cdk/tools/efgf/app/ErtlFunctionalGroupsFinderPerformanceSnapshotApp.java b/src/main/java/org/openscience/cdk/tools/efgf/app/ErtlFunctionalGroupsFinderPerformanceSnapshotApp.java index 545c955..d9f137f 100644 --- a/src/main/java/org/openscience/cdk/tools/efgf/app/ErtlFunctionalGroupsFinderPerformanceSnapshotApp.java +++ b/src/main/java/org/openscience/cdk/tools/efgf/app/ErtlFunctionalGroupsFinderPerformanceSnapshotApp.java @@ -1,6 +1,6 @@ /* * ErtlFunctionalGroupsFinder for CDK - * Copyright (c) 2023 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + * Copyright (c) 2024 
Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny * * Source code is available at * diff --git a/src/main/java/org/openscience/cdk/tools/efgf/app/ExtractFunctionalGroupsTask.java b/src/main/java/org/openscience/cdk/tools/efgf/app/ExtractFunctionalGroupsTask.java index 9191158..7fc144f 100644 --- a/src/main/java/org/openscience/cdk/tools/efgf/app/ExtractFunctionalGroupsTask.java +++ b/src/main/java/org/openscience/cdk/tools/efgf/app/ExtractFunctionalGroupsTask.java @@ -1,6 +1,6 @@ /* * ErtlFunctionalGroupsFinder for CDK - * Copyright (c) 2023 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + * Copyright (c) 2024 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny * * Source code is available at * diff --git a/src/main/java/org/openscience/cdk/tools/efgf/app/Main.java b/src/main/java/org/openscience/cdk/tools/efgf/app/Main.java index 0500b9e..0ae5470 100644 --- a/src/main/java/org/openscience/cdk/tools/efgf/app/Main.java +++ b/src/main/java/org/openscience/cdk/tools/efgf/app/Main.java @@ -1,6 +1,6 @@ /* * ErtlFunctionalGroupsFinder for CDK - * Copyright (c) 2023 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + * Copyright (c) 2024 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny * * Source code is available at * diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java index 4106932..dce85ec 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java @@ -1,6 +1,6 @@ /* * ErtlFunctionalGroupsFinder for CDK - * Copyright (c) 2023 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + 
* Copyright (c) 2024 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny * * Source code is available at * diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java index e6f815a..e0d5c64 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java @@ -1,6 +1,6 @@ /* * ErtlFunctionalGroupsFinder for CDK - * Copyright (c) 2023 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + * Copyright (c) 2024 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny * * Source code is available at * diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java index 37dd154..ef35271 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java @@ -1,6 +1,6 @@ /* * ErtlFunctionalGroupsFinder for CDK - * Copyright (c) 2023 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + * Copyright (c) 2024 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny * * Source code is available at * From 565e448d00d4e4926891f7207c5e854c0b5828cf Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Mon, 8 Jan 2024 17:03:44 +0100 Subject: [PATCH 04/27] Started refactoring code according to internal conventions; --- .../cdk/tools/ErtlFunctionalGroupsFinder.java | 354 ++++++++++++------ 1 file changed, 235 insertions(+), 119 deletions(-) diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java 
b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index b88c5e1..a14d362 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -42,102 +42,197 @@ import java.util.Set; /** - * Finds and extracts a molecules's functional groups in a purely rule-based manner. + * Finds and extracts a molecule's functional groups in a purely rule-based manner. * * This class implements Peter Ertl's algorithm for the automated detection and extraction * of functional groups in organic molecules - * [Ertl P. An algorithm to identify functional groups in organic molecules. J Cheminform. 2017; 9:36.]. + * ([Ertl P. An algorithm to identify functional groups in organic molecules. J Cheminform. 2017; 9:36.]). + *

+ * Note: this implementation is not thread-safe. Each parallel thread should have its own instance of this class. + *

* * @author Sebastian Fritsch, Jonas Schaub - * @version 1.2 + * @version 1.2.1 */ public class ErtlFunctionalGroupsFinder { - private static ILoggingTool log = LoggingToolFactory.createLoggingTool(ErtlFunctionalGroupsFinder.class); - private static final String CARBONYL_C_MARKER = "Carbonyl-C"; - private final Set nonmetalAtomicNumbers; - private final Mode mode; - private EdgeToBondMap bondMap; - private int[][] adjList; - private HashSet markedAtoms; - private HashMap aromaticHeteroAtoms; // key: atom idx, value: isInGroup - private Map> environmentsMap; - /** - * Defines the working mode. + * Defines the mode for generalizing functional group environments (default) or keeping them whole. */ - public static enum Mode{ + public static enum Mode { /** * Default mode including the generalization step. */ DEFAULT, /** - * Skips the generalization step. Functional groups will keep their full "environment". + * Skips the generalization step. Functional groups will keep their full environment. */ NO_GENERALIZATION; } - - private enum EnvironmentCalCType { C_AROMATIC, C_ALIPHATIC }; - + // + /** + * Defines whether an environmental carbon atom is aromatic or aliphatic. Only for internal use for caching this + * info in the EnvironmentalC instances (see private class below). + */ + private static enum EnvironmentalCType { + /** + * Aromatic environmental carbon. + */ + C_AROMATIC, + /** + * Aliphatic environmental carbon. + */ + C_ALIPHATIC; + } + // /** * Describes one carbon atom in the environment of a marked atom. It can either be aromatic * or aliphatic and also contains a clone of its connecting bond. */ - private class EnvironmentalC{ - private EnvironmentCalCType type; + private class EnvironmentalC { + + /** + * Indicates whether carbon atom is aromatic or aliphatic. + */ + private EnvironmentalCType type; + // + /** + * Bond index of the original C atom. 
+ */ private int bondIndex; + // + /** + * Order of the bond connecting this environmental C atom to the marked functional group atom. + */ private IBond.Order bondOrder; + // + /** + * Stereo information of the bond connecting this environmental C atom to the marked functional group atom. + */ private IBond.Stereo bondStereo; + // + /** + * Flags of the bond connecting this environmental C atom to the marked functional group atom. IChemObject flags + * are properties defined by an integer value (array position) and a boolean value. + */ private boolean[] bondFlags; - - public EnvironmentalC(EnvironmentCalCType type, IBond bond, int indexInBond) { - this.type = type; - - bondIndex = indexInBond; - bondOrder = bond.getOrder(); - bondStereo = bond.getStereo(); - bondFlags = bond.getFlags(); + // + /** + * Default constructor defining all fields. Order, stereo, and flags are taken from the IBond object directly. + * + * @param aType aromatic or aliphatic + * @param aConnectingBond bond instance connecting to the marked atom + * @param anIndexInBond index of the atom in the connecting bond + */ + public EnvironmentalC(EnvironmentalCType aType, IBond aConnectingBond, int anIndexInBond) { + this.type = aType; + this.bondIndex = anIndexInBond; + this.bondOrder = aConnectingBond.getOrder(); + this.bondStereo = aConnectingBond.getStereo(); + this.bondFlags = aConnectingBond.getFlags(); } - - public EnvironmentCalCType getType() { - return type; + // + /** + * Returns the type, i.e. whether this carbon atom is aromatic or aliphatic. 
+ * + * @return EnvironmentalCType enum constant + */ + public EnvironmentalCType getType() { + return this.type; } - - public IBond createBond(IAtom targetAtom, IAtom cAtom) { - IBond bond = targetAtom.getBuilder().newInstance(IBond.class); - if(bondIndex == 0) { - bond.setAtoms(new IAtom[] {cAtom, targetAtom}); + // + /** + * Method for translating this instance back into a "real" IAtom instance when expanding the functional group + * environment, transferring all the cached properties, except the type(!). + * + * @param aTargetAtom marked functional group atom + * @param anEnvCAtom new carbon atom instance that should receive all the cached properties except the type(!); + * element, atom type "C" and implicit hydrogen count = 0 should be set already; type can later + * be set via .setIsAromatic(boolean); + * @return new bond connecting marked FG atom and environment atom in the correct order and with the cached properties + */ + public IBond createBond(IAtom aTargetAtom, IAtom anEnvCAtom) { + IBond tmpBond = aTargetAtom.getBuilder().newInstance(IBond.class); + if (this.bondIndex == 0) { + tmpBond.setAtoms(new IAtom[] {anEnvCAtom, aTargetAtom}); } else { - bond.setAtoms(new IAtom[] {targetAtom, cAtom}); + tmpBond.setAtoms(new IAtom[] {aTargetAtom, anEnvCAtom}); } - bond.setOrder(bondOrder); - bond.setStereo(bondStereo); - bond.setFlags(bondFlags); - - return bond; + tmpBond.setOrder(this.bondOrder); + tmpBond.setStereo(this.bondStereo); + tmpBond.setFlags(this.bondFlags); + return tmpBond; } } - + // + /** + * CDK logging tool instance for this class. + */ + private static final ILoggingTool LOGGING_TOOL = LoggingToolFactory.createLoggingTool(ErtlFunctionalGroupsFinder.class); + // /** - * Default constructor for ErtlFunctionalGroupsFinder. + * Property name for marking carbonyl carbon atoms via IAtom properties. 
+ */ + private static final String CARBONYL_C_MARKER = "EFGF-Carbonyl-C"; + // + /** + * Set of atomic numbers that are accepted in the input molecule if the strict input restrictions are activated + * (excludes metal and metalloid elements, only organic elements included). + */ + private static final Set NONMETAL_ATOMIC_NUMBERS = Set.of(1, 2, 6, 7, 8, 9, 10, 15, 16, 17, 18, 34, 35, 36, 53, 54, 86); + // + /** + * Environment mode setting, defining whether environments should be generalized (default) or kept as whole. + */ + private Mode envMode; + // + /** + * Map of bonds in the input molecule, cache(!). + */ + private EdgeToBondMap bondMap; + // + /** + * Adjacency list representation of input molecule, cache(!). + */ + private int[][] adjList; + // + /** + * Set for atoms marked as being part of a functional group, represented by an internal index based on the atom + * count in the input molecule, cache(!). + */ + private HashSet markedAtoms; + // + /** + * HashMap for storing aromatic hetero-atom indices and whether they have already been assigned to a larger functional + * group. If false, they form single-atom FG by themselves, cache(!). + * + * key: atom idx, value: isInGroup + */ + private HashMap aromaticHeteroAtomIndicesToIsInGroupBoolMap; + // + /** + * HashMap for storing marked atom to connected environmental carbon atom relations, cache(!). + */ + private HashMap> markedAtomToConnectedEnvCMap; + // + /** + * Default constructor for ErtlFunctionalGroupsFinder with functional group generalization turned ON. */ public ErtlFunctionalGroupsFinder() { this(Mode.DEFAULT); } - + // /** - * Constructor for ErtlFunctionalGroupsFinder. + * Constructor for ErtlFunctionalGroupsFinder that allows setting the treatment of environments in the identified + * functional groups. Default: environments will be generalized; no generalization: environments will be kept as whole. * - * @param mode working mode (see {@code ErtlFunctionalGroupsFinder.Mode}). 
+ * @param envMode mode for treating functional group environments (see {@link ErtlFunctionalGroupsFinder.Mode}). */ - public ErtlFunctionalGroupsFinder(Mode mode) { - this.mode = mode; - - // init non-metal and non-metalloid atom numbers - nonmetalAtomicNumbers = Set.of(1, 2, 6, 7, 8, 9, 10, 15, 16, 17, 18, 34, 35, 36, 53, 54, 86); //ImmutableSet.of(1, 2, 6, 7, 8, 9, 10, 15, 16, 17, 18, 34, 35, 36, 53, 54, 86); + public ErtlFunctionalGroupsFinder(Mode envMode) { + this.envMode = envMode; } - + // /** * Find all functional groups contained in a molecule. * @@ -191,10 +286,10 @@ public List find(IAtomContainer container, boolean clone){ List groups = extractGroups(mol); // handle environment - if(mode == Mode.DEFAULT) { + if(envMode == Mode.DEFAULT) { expandGeneralizedEnvironments(groups); } - else if (mode == Mode.NO_GENERALIZATION) { + else if (envMode == Mode.NO_GENERALIZATION) { expandFullEnvironments(groups); } else { @@ -205,24 +300,45 @@ else if (mode == Mode.NO_GENERALIZATION) { bondMap = null; adjList = null; markedAtoms = null; - aromaticHeteroAtoms = null; - environmentsMap = null; + aromaticHeteroAtomIndicesToIsInGroupBoolMap = null; + markedAtomToConnectedEnvCMap = null; return groups; } + /** + * TODO + */ + public void setEnvMode(Mode anEnvMode) { + + } + + /** + * TODO + */ + public void isFunctionalGroupEnvironmentGeneralized(boolean aGeneralizeEnvironment) { + + } + + /** + * TODO + */ + private void clearChache() { + + } + /** * Mark all atoms and store them in a set for further processing. * * @param molecule Molecule with atoms to mark */ private void markAtoms(IAtomContainer molecule) { - if(isDbg()) log.debug("########## Starting search for atoms to mark ... ##########"); + if(isDbg()) LOGGING_TOOL.debug("########## Starting search for atoms to mark ... 
##########"); // store marked atoms markedAtoms = new HashSet(molecule.getAtomCount()); //Sets.newHashSetWithExpectedSize(molecule.getAtomCount()); // store aromatic heteroatoms - aromaticHeteroAtoms = new HashMap<>(); + aromaticHeteroAtomIndicesToIsInGroupBoolMap = new HashMap<>(); for(int idx = 0; idx < molecule.getAtomCount(); idx++) { // skip atoms that already got marked in a previous iteration @@ -233,7 +349,7 @@ private void markAtoms(IAtomContainer molecule) { // skip aromatic atoms but add them to set if(cAtom.isAromatic()) { if(isHeteroatom(cAtom)) { - aromaticHeteroAtoms.put(idx, false); + aromaticHeteroAtomIndicesToIsInGroupBoolMap.put(idx, false); } continue; } @@ -255,19 +371,19 @@ private void markAtoms(IAtomContainer molecule) { // set the connected atom as marked if(markedAtoms.add(connectedIdx)) { String connectedAtomCondition = connectedAtom.getAtomicNumber() == 6 ? "2.1/2.2" : "1"; - if(isDbg()) log.debug(String.format("Marking Atom #%d (%s) - Met condition %s", + if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition %s", connectedIdx, connectedAtom.getSymbol(), connectedAtomCondition)); } // set the current atom as marked and break out of connected atoms - if(isDbg()) log.debug(String.format("Marking Atom #%d (%s) - Met condition 2.1/2.2", + if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 2.1/2.2", idx, cAtom.getSymbol())); isMarked = true; // but check for carbonyl-C before break if(connectedAtom.getAtomicNumber() == 8 && connectedBond.getOrder() == Order.DOUBLE && adjList[idx].length == 3) { - if(isDbg()) log.debug(" - was flagged as Carbonly-C"); + if(isDbg()) LOGGING_TOOL.debug(" - was flagged as Carbonly-C"); cAtom.setProperty(CARBONYL_C_MARKER, true); } @@ -281,7 +397,7 @@ else if((connectedAtom.getAtomicNumber() == 7 // if connected O/N/S is not aromatic... 
if(!connectedAtom.isAromatic()) { // set the connected O/N/S atom as marked - if(isDbg()) log.debug(String.format("Marking Atom #%d (%s) - Met condition 1", + if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 1", connectedIdx, connectedAtom.getSymbol())); markedAtoms.add(connectedIdx); @@ -298,7 +414,7 @@ else if((connectedAtom.getAtomicNumber() == 7 oNSCounter++; if(oNSCounter > 1 && adjList[idx].length + cAtom.getImplicitHydrogenCount() == 4) { // set as marked and break out of connected atoms - if(isDbg()) log.debug(String.format("Marking Atom #%d (%s) - Met condition 2.3", + if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 2.3", idx, cAtom.getSymbol())); isMarked = true; break; @@ -313,14 +429,14 @@ else if((connectedAtom.getAtomicNumber() == 7 IAtom connectedInSphere3Atom = molecule.getAtom(connectedInSphere3Idx); if(connectedInSphere3Atom.equals(cAtom)) { // set connected atoms as marked - if(isDbg()) log.debug(String.format("Marking Atom #%d (%s) - Met condition 2.4", + if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 2.4", connectedInSphere2Idx, connectedInSphere2Atom.getSymbol())); - if(isDbg()) log.debug(String.format("Marking Atom #%d (%s) - Met condition 2.4", + if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 2.4", connectedInSphere3Idx, connectedInSphere3Atom.getSymbol())); markedAtoms.add(connectedInSphere2Idx); markedAtoms.add(connectedInSphere3Idx); // set current atom as marked and break out of connected atoms - if(isDbg()) log.debug(String.format("Marking Atom #%d (%s) - Met condition 2.4", + if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 2.4", idx, cAtom.getSymbol())); isMarked = true; break; @@ -358,12 +474,12 @@ else if (atomicNr == 1){ } // if heteroatom... 
(CONDITION 1) else { - if(isDbg()) log.debug(String.format("Marking Atom #%d (%s) - Met condition 1", idx, cAtom.getSymbol())); + if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 1", idx, cAtom.getSymbol())); markedAtoms.add(idx); continue; } } - if(isDbg()) log.debug(String.format("########## End of search. Marked %d/%d atoms. ##########", markedAtoms.size(), molecule.getAtomCount())); + if(isDbg()) LOGGING_TOOL.debug(String.format("########## End of search. Marked %d/%d atoms. ##########", markedAtoms.size(), molecule.getAtomCount())); } /** @@ -374,9 +490,9 @@ else if (atomicNr == 1){ * @return a list of all functional groups (including "environments") extracted from the molecule */ private List extractGroups(IAtomContainer molecule) { - if(isDbg()) log.debug("########## Starting identification & extraction of functional groups... ##########"); + if(isDbg()) LOGGING_TOOL.debug("########## Starting identification & extraction of functional groups... ##########"); - environmentsMap = new HashMap>(molecule.getAtomCount());//Maps.newHashMapWithExpectedSize(molecule.getAtomCount()); + markedAtomToConnectedEnvCMap = new HashMap>(molecule.getAtomCount());//Maps.newHashMapWithExpectedSize(molecule.getAtomCount()); int[] atomIdxToFGMap = new int[molecule.getAtomCount()]; Arrays.fill(atomIdxToFGMap, -1); int fGroupIdx = -1; @@ -387,7 +503,7 @@ private List extractGroups(IAtomContainer molecule) { // get next markedAtom as the starting node for the search int beginIdx = markedAtoms.iterator().next(); - if(isDbg()) log.debug(String.format("Searching new functional group from atom #%d (%s)...", beginIdx, molecule.getAtom(beginIdx).getSymbol())); + if(isDbg()) LOGGING_TOOL.debug(String.format("Searching new functional group from atom #%d (%s)...", beginIdx, molecule.getAtom(beginIdx).getSymbol())); // do a BFS from there Queue queue = new ArrayDeque<>(); @@ -403,7 +519,7 @@ private List extractGroups(IAtomContainer molecule) { // if it 
isn't... IAtom currentAtom = molecule.getAtom(currentIdx); - if(isDbg()) log.debug(String.format(" visiting marked atom: #%d (%s)", currentIdx, currentAtom.getSymbol())); + if(isDbg()) LOGGING_TOOL.debug(String.format(" visiting marked atom: #%d (%s)", currentIdx, currentAtom.getSymbol())); // add its index to the functional group atomIdxToFGMap[currentIdx] = fGroupIdx; @@ -427,21 +543,21 @@ private List extractGroups(IAtomContainer molecule) { // add unmarked connected aromatic heteroatoms IAtom connectedAtom = molecule.getAtom(connectedIdx); if(isHeteroatom(connectedAtom) && connectedAtom.isAromatic()) { - if(isDbg()) log.debug(" added connected aromatic heteroatom " + connectedAtom.getSymbol()); + if(isDbg()) LOGGING_TOOL.debug(" added connected aromatic heteroatom " + connectedAtom.getSymbol()); atomIdxToFGMap[connectedIdx] = fGroupIdx; // note that this aromatic heteroatom has been added to a group - aromaticHeteroAtoms.put(connectedIdx, true); + aromaticHeteroAtomIndicesToIsInGroupBoolMap.put(connectedIdx, true); } // add unmarked connected atoms to current marked atom's environment IBond connectedBond = bondMap.get(currentIdx, connectedIdx); - EnvironmentCalCType type; + EnvironmentalCType type; if (connectedAtom.getAtomicNumber() == 6) { if(connectedAtom.isAromatic()) - type = EnvironmentCalCType.C_AROMATIC; + type = EnvironmentalCType.C_AROMATIC; else - type = EnvironmentCalCType.C_ALIPHATIC; + type = EnvironmentalCType.C_ALIPHATIC; } else { // aromatic heteroatom, so just ignore @@ -449,36 +565,36 @@ private List extractGroups(IAtomContainer molecule) { } currentEnvironment.add(new EnvironmentalC(type, connectedBond, connectedBond.getBegin() == connectedAtom ? 
0 : 1)); } - environmentsMap.put(currentAtom, currentEnvironment); + markedAtomToConnectedEnvCMap.put(currentAtom, currentEnvironment); // debug logging if(isDbg()) { int cAromCount = 0, cAliphCount = 0; for(EnvironmentalC comp : currentEnvironment) { - if(comp.getType() == EnvironmentCalCType.C_AROMATIC) + if(comp.getType() == EnvironmentalCType.C_AROMATIC) cAromCount++; - else if(comp.getType() == EnvironmentCalCType.C_ALIPHATIC) + else if(comp.getType() == EnvironmentalCType.C_ALIPHATIC) cAliphCount++; } - log.debug(String.format(" logged marked atom's environment: C_ar:%d, C_al:%d (and %d implicit hydrogens)", cAromCount, cAliphCount, currentAtom.getImplicitHydrogenCount())); + LOGGING_TOOL.debug(String.format(" logged marked atom's environment: C_ar:%d, C_al:%d (and %d implicit hydrogens)", cAromCount, cAliphCount, currentAtom.getImplicitHydrogenCount())); } } - if(isDbg()) log.debug(" search completed."); + if(isDbg()) LOGGING_TOOL.debug(" search completed."); } // also create FG for lone aromatic heteroatoms, not connected to a FG yet. - for(int atomIdx : aromaticHeteroAtoms.keySet()) { - if(!aromaticHeteroAtoms.get(atomIdx)) { + for(int atomIdx : aromaticHeteroAtomIndicesToIsInGroupBoolMap.keySet()) { + if(!aromaticHeteroAtomIndicesToIsInGroupBoolMap.get(atomIdx)) { fGroupIdx++; atomIdxToFGMap[atomIdx] = fGroupIdx; - if(isDbg()) log.debug("Created FG for lone aromatic heteroatom: " + molecule.getAtom(atomIdx).getSymbol()); + if(isDbg()) LOGGING_TOOL.debug("Created FG for lone aromatic heteroatom: " + molecule.getAtom(atomIdx).getSymbol()); } } List fGs = partitionIntoGroups(molecule, atomIdxToFGMap, fGroupIdx + 1); - if(isDbg()) log.debug(String.format("########## Found & extracted %d functional groups. ##########", fGroupIdx + 1)); + if(isDbg()) LOGGING_TOOL.debug(String.format("########## Found & extracted %d functional groups. 
##########", fGroupIdx + 1)); return fGs; } @@ -489,17 +605,17 @@ else if(comp.getType() == EnvironmentCalCType.C_ALIPHATIC) * @param fGroups the list of functional groups including "environments" */ private void expandGeneralizedEnvironments(List fGroups){ - if(isDbg()) log.debug("########## Starting generalization of functional groups... ##########"); + if(isDbg()) LOGGING_TOOL.debug("########## Starting generalization of functional groups... ##########"); for(IAtomContainer fGroup : fGroups) { int atomCount = fGroup.getAtomCount(); - if(isDbg()) log.debug(String.format("Generalizing functional group (%d atoms)...", atomCount)); + if(isDbg()) LOGGING_TOOL.debug(String.format("Generalizing functional group (%d atoms)...", atomCount)); // prechecking for special cases... if(fGroup.getAtomCount() == 1) { IAtom atom = fGroup.getAtom(0); - List environment = environmentsMap.get(atom); + List environment = markedAtomToConnectedEnvCMap.get(atom); if(environment != null) { int envCCount = environment.size(); @@ -507,12 +623,12 @@ private void expandGeneralizedEnvironments(List fGroups){ // for H2N-C_env & HO-C_env -> do not replace H & C_env by R! if((atom.getAtomicNumber() == 8 && envCCount == 1) || (atom.getAtomicNumber() == 7 && envCCount == 1)){ - if(isDbg()) log.debug(String.format(" - found single atomic N or O FG with one env. C. Expanding environment...", atom.getSymbol())); + if(isDbg()) LOGGING_TOOL.debug(String.format(" - found single atomic N or O FG with one env. C. 
Expanding environment...", atom.getSymbol())); expandEnvironment(atom, fGroup); int hCount = atom.getImplicitHydrogenCount(); if(hCount != 0) { - if(isDbg()) log.debug(String.format(" - adding %d hydrogens...", hCount)); + if(isDbg()) LOGGING_TOOL.debug(String.format(" - adding %d hydrogens...", hCount)); addHydrogens(atom, hCount, fGroup); atom.setImplicitHydrogenCount(0); } @@ -521,14 +637,14 @@ private void expandGeneralizedEnvironments(List fGroups){ // for HN-(C_env)-C_env & HS-C_env -> do not replace H by R! (only C_env!) if((atom.getAtomicNumber() == 7 && envCCount == 2) || (atom.getAtomicNumber() == 16 && envCCount == 1)) { - if(isDbg()) log.debug(" - found sec. amine or simple thiol"); + if(isDbg()) LOGGING_TOOL.debug(" - found sec. amine or simple thiol"); int hCount = atom.getImplicitHydrogenCount(); if(hCount != 0) { - if(isDbg()) log.debug(String.format(" - adding %d hydrogens...", hCount)); + if(isDbg()) LOGGING_TOOL.debug(String.format(" - adding %d hydrogens...", hCount)); addHydrogens(atom, hCount, fGroup); atom.setImplicitHydrogenCount(0); } - if(isDbg()) log.debug(" - expanding environment..."); + if(isDbg()) LOGGING_TOOL.debug(" - expanding environment..."); expandEnvironmentGeneralized(atom, fGroup); continue; } @@ -540,7 +656,7 @@ else if(isHeteroatom(atom)) { atom.setImplicitHydrogenCount(0); } String atomTypeName = atom.getAtomTypeName(); - if(isDbg()) log.debug(String.format(" - found single aromatic heteroatom (%s, Atomtype %s). Adding %d R-Atoms...", atom.getSymbol(), atomTypeName, rAtomCount)); + if(isDbg()) LOGGING_TOOL.debug(String.format(" - found single aromatic heteroatom (%s, Atomtype %s). Adding %d R-Atoms...", atom.getSymbol(), atomTypeName, rAtomCount)); addRAtoms(atom, rAtomCount, fGroup); continue; } @@ -552,14 +668,14 @@ else if(isHeteroatom(atom)) { // process atoms... 
for(IAtom atom : fGroupAtoms) { - List environment = environmentsMap.get(atom); + List environment = markedAtomToConnectedEnvCMap.get(atom); if(environment == null) { if(atom.getImplicitHydrogenCount() != 0) { atom.setImplicitHydrogenCount(0); } int rAtomCount = atom.getValency() - 1; - if(isDbg()) log.debug(String.format(" - found connected aromatic heteroatom (%s). Adding %d R-Atoms...", atom.getSymbol(), rAtomCount)); + if(isDbg()) LOGGING_TOOL.debug(String.format(" - found connected aromatic heteroatom (%s). Adding %d R-Atoms...", atom.getSymbol(), rAtomCount)); addRAtoms(atom, rAtomCount, fGroup); } @@ -569,25 +685,25 @@ else if(isHeteroatom(atom)) { if(atom.getImplicitHydrogenCount() != 0) { atom.setImplicitHydrogenCount(0); } - if(isDbg()) log.debug(" - ignoring environment for marked carbon atom"); + if(isDbg()) LOGGING_TOOL.debug(" - ignoring environment for marked carbon atom"); continue; } else { - if(isDbg()) log.debug(" - found carbonyl-carbon. Expanding environment..."); + if(isDbg()) LOGGING_TOOL.debug(" - found carbonyl-carbon. Expanding environment..."); expandEnvironmentGeneralized(atom, fGroup); continue; } } // processing heteroatoms... else { - if(isDbg()) log.debug(String.format(" - found heteroatom (%s). Expanding environment...", atom.getSymbol())); + if(isDbg()) LOGGING_TOOL.debug(String.format(" - found heteroatom (%s). Expanding environment...", atom.getSymbol())); expandEnvironmentGeneralized(atom, fGroup); continue; } } } - if(isDbg()) log.debug("########## Generalization of functional groups completed. ##########"); + if(isDbg()) LOGGING_TOOL.debug("########## Generalization of functional groups completed. ##########"); } /** @@ -596,35 +712,35 @@ else if(isHeteroatom(atom)) { * @param fGroups the list of functional groups including "environments" */ private void expandFullEnvironments(List fGroups) { - if(isDbg()) log.debug("########## Starting expansion of full environments for functional groups... 
##########"); + if(isDbg()) LOGGING_TOOL.debug("########## Starting expansion of full environments for functional groups... ##########"); for(IAtomContainer fGroup : fGroups) { int atomCount = fGroup.getAtomCount(); - if(isDbg()) log.debug(String.format("Expanding environment on functional group (%d atoms)...", atomCount)); + if(isDbg()) LOGGING_TOOL.debug(String.format("Expanding environment on functional group (%d atoms)...", atomCount)); for(int i = 0; i < atomCount; i++) { IAtom atom = fGroup.getAtom(i); - if(isDbg()) log.debug(String.format(" - Atom #%d - Expanding environment...", i)); + if(isDbg()) LOGGING_TOOL.debug(String.format(" - Atom #%d - Expanding environment...", i)); expandEnvironment(atom, fGroup); int hCount = atom.getImplicitHydrogenCount(); if(hCount != 0) { - if(isDbg()) log.debug(String.format(" - adding %d hydrogens...", hCount)); + if(isDbg()) LOGGING_TOOL.debug(String.format(" - adding %d hydrogens...", hCount)); addHydrogens(atom, hCount, fGroup); atom.setImplicitHydrogenCount(0); } } } - if(isDbg()) log.debug("########## Expansion of full environments for functional groups completed. ##########"); + if(isDbg()) LOGGING_TOOL.debug("########## Expansion of full environments for functional groups completed. 
##########"); } private void expandEnvironment(IAtom atom, IAtomContainer container) { - List environment = environmentsMap.get(atom); + List environment = markedAtomToConnectedEnvCMap.get(atom); if(environment == null || environment.isEmpty()) { - if(isDbg()) log.debug(" found no environment to expand."); + if(isDbg()) LOGGING_TOOL.debug(" found no environment to expand."); return; } @@ -633,7 +749,7 @@ private void expandEnvironment(IAtom atom, IAtomContainer container) { IAtom cAtom = atom.getBuilder().newInstance(IAtom.class, "C"); cAtom.setAtomTypeName("C"); cAtom.setImplicitHydrogenCount(0); - if(envC.getType() == EnvironmentCalCType.C_AROMATIC) { + if(envC.getType() == EnvironmentalCType.C_AROMATIC) { cAtom.setIsAromatic(true); cAromCount++; } @@ -647,16 +763,16 @@ private void expandEnvironment(IAtom atom, IAtomContainer container) { container.addBond(bond); } - if(isDbg()) log.debug(String.format(" expanded environment: %dx C_ar and %dx C_al", cAromCount, cAliphCount)); + if(isDbg()) LOGGING_TOOL.debug(String.format(" expanded environment: %dx C_ar and %dx C_al", cAromCount, cAliphCount)); } // only call this on marked heteroatoms / carbonyl-C's! 
private void expandEnvironmentGeneralized(IAtom atom, IAtomContainer container) { - List environment = environmentsMap.get(atom); + List environment = markedAtomToConnectedEnvCMap.get(atom); if(environment == null) { - if(isDbg()) log.debug(" found no environment to expand."); + if(isDbg()) LOGGING_TOOL.debug(" found no environment to expand."); return; } @@ -665,7 +781,7 @@ private void expandEnvironmentGeneralized(IAtom atom, IAtomContainer container) if(atom.getAtomicNumber() == 8 && atom.getImplicitHydrogenCount() == 1) { addHydrogens(atom, 1, container); atom.setImplicitHydrogenCount(0); - if(isDbg()) log.debug(" expanded hydrogen on connected OH-Group"); + if(isDbg()) LOGGING_TOOL.debug(" expanded hydrogen on connected OH-Group"); } else if(isHeteroatom(atom)) rAtomCount += atom.getImplicitHydrogenCount(); addRAtoms(atom, rAtomCount, container); @@ -674,7 +790,7 @@ private void expandEnvironmentGeneralized(IAtom atom, IAtomContainer container) atom.setImplicitHydrogenCount(0); } - if(isDbg()) log.debug(String.format(" expanded environment: %dx R-atom (incl. %d for H replacement)", rAtomCount, rAtomCount - rAtomsForCCount)); + if(isDbg()) LOGGING_TOOL.debug(String.format(" expanded environment: %dx R-atom (incl. 
%d for H replacement)", rAtomCount, rAtomCount - rAtomsForCCount)); } private static final boolean isHeteroatom(IAtom atom) { @@ -683,7 +799,7 @@ private static final boolean isHeteroatom(IAtom atom) { } private final boolean isNonmetal(IAtom atom) { - return nonmetalAtomicNumbers.contains(atom.getAtomicNumber()); + return NONMETAL_ATOMIC_NUMBERS.contains(atom.getAtomicNumber()); } private void addHydrogens(IAtom atom, int number, IAtomContainer container) { @@ -759,7 +875,7 @@ private List partitionIntoGroups(IAtomContainer sourceContainer, } private boolean isDbg() { - return log.isDebugEnabled(); + return LOGGING_TOOL.isDebugEnabled(); } private boolean checkConstraints(IAtomContainer molecule) { From 436b40a549ce3f9a07dab80feeeff171a9f3edd8 Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Tue, 9 Jan 2024 17:35:15 +0100 Subject: [PATCH 05/27] Continued refactoring; --- .../cdk/tools/ErtlFunctionalGroupsFinder.java | 248 ++++++++++-------- .../ErtlFunctionalGroupsFinderUtility.java | 4 +- 2 files changed, 136 insertions(+), 116 deletions(-) diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index a14d362..b8370f8 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -38,6 +38,7 @@ import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Queue; import java.util.Set; @@ -169,18 +170,18 @@ public IBond createBond(IAtom aTargetAtom, IAtom anEnvCAtom) { /** * CDK logging tool instance for this class. 
*/ - private static final ILoggingTool LOGGING_TOOL = LoggingToolFactory.createLoggingTool(ErtlFunctionalGroupsFinder.class); + public static final ILoggingTool LOGGING_TOOL = LoggingToolFactory.createLoggingTool(ErtlFunctionalGroupsFinder.class); // /** * Property name for marking carbonyl carbon atoms via IAtom properties. */ - private static final String CARBONYL_C_MARKER = "EFGF-Carbonyl-C"; + public static final String CARBONYL_C_MARKER = "EFGF-Carbonyl-C"; // /** * Set of atomic numbers that are accepted in the input molecule if the strict input restrictions are activated * (excludes metal and metalloid elements, only organic elements included). */ - private static final Set NONMETAL_ATOMIC_NUMBERS = Set.of(1, 2, 6, 7, 8, 9, 10, 15, 16, 17, 18, 34, 35, 36, 53, 54, 86); + public static final Set NONMETAL_ATOMIC_NUMBERS = Set.of(1, 2, 6, 7, 8, 9, 10, 15, 16, 17, 18, 34, 35, 36, 53, 54, 86); // /** * Environment mode setting, defining whether environments should be generalized (default) or kept as whole. @@ -227,141 +228,163 @@ public ErtlFunctionalGroupsFinder() { * Constructor for ErtlFunctionalGroupsFinder that allows setting the treatment of environments in the identified * functional groups. Default: environments will be generalized; no generalization: environments will be kept as whole. * - * @param envMode mode for treating functional group environments (see {@link ErtlFunctionalGroupsFinder.Mode}). + * @param anEnvMode mode for treating functional group environments (see {@link ErtlFunctionalGroupsFinder.Mode}). */ - public ErtlFunctionalGroupsFinder(Mode envMode) { - this.envMode = envMode; + public ErtlFunctionalGroupsFinder(Mode anEnvMode) { + Objects.requireNonNull(anEnvMode, "Given environment mode cannot be null."); + this.envMode = anEnvMode; } // /** - * Find all functional groups contained in a molecule. + * Allows setting the treatment of functional group environments after extraction. 
Default: environments will be + * generalized; no generalization: environments will be kept as whole. * - * NOTE: The input must consist of one connected structure and may not contain charged atoms, metals or metalloids. - * - * @param container the molecule which contains the functional groups (may not contain charged atoms, metals, - * metalloids or unconnected components!) - * @return a list with all functional groups found in the molecule. + * @param anEnvMode mode for treating functional group environments (see {@link ErtlFunctionalGroupsFinder.Mode}). */ - public List find(IAtomContainer container){ - return find(container, true); + public void setEnvMode(Mode anEnvMode) { + Objects.requireNonNull(anEnvMode, "Given environment mode cannot be null."); + this.envMode = anEnvMode; } - + // /** - * Find all functional groups contained in a molecule. + * Returns the current setting for the treatment of functional group environments after extraction. * - * NOTE: The input must consist of one connected structure and may not contain charged atoms, metals or metalloids. + * @return currently set environment mode + */ + public Mode getEnvMode() { + return this.envMode; + } + // + /** + * Find all functional groups in a molecule. The input atom container instance is cloned before processing to leave + * the input container intact. + *

+ * Note: The strict input restrictions from previous versions (no charged atoms, metals, metalloids or + * unconnected components) no longer apply by default. They can be re-enabled via the three-parameter + * variant of this method (anAreInputRestrictionsApplied = true). + *

* - * @param container the molecule which contains the functional groups (may not contain charged atoms, metals, - * metalloids or unconnected components!) - * @param clone Use 'false' to reuse the input container's bonds and atoms in the extraction of the functional - * groups. This may speed up the extraction and lower the memory consumption for processing large - * amounts of data but corrupts the original input container. - * Use 'true' to work with a clone and leave the input container intact (default). - * @return a list with all functional groups found in the molecule. + * @param aMolecule the molecule to identify functional groups in + * @throws CloneNotSupportedException if cloning is not possible + * @return a list with all functional groups found in the molecule */ - public List find(IAtomContainer container, boolean clone){ - // work with a clone? - IAtomContainer mol; - if(clone){ - try { - mol = container.clone(); - } catch (CloneNotSupportedException e) { - throw new IllegalStateException("Atom container could not be cloned"); - } - } - else{ - mol = container; - } - - // init GraphUtil & EdgeToBondMap - bondMap = EdgeToBondMap.withSpaceFor(mol); - adjList = GraphUtil.toAdjList(mol, bondMap); - - //checkConstraints(mol); - - // atom marking - markAtoms(mol); - - // extract raw groups - List groups = extractGroups(mol); - - // handle environment - if(envMode == Mode.DEFAULT) { - expandGeneralizedEnvironments(groups); - } - else if (envMode == Mode.NO_GENERALIZATION) { - expandFullEnvironments(groups); - } - else { - throw new IllegalStateException("Unknown mode."); - } - - // clear fields - bondMap = null; - adjList = null; - markedAtoms = null; - aromaticHeteroAtomIndicesToIsInGroupBoolMap = null; - markedAtomToConnectedEnvCMap = null; - - return groups; + public List find(IAtomContainer aMolecule) throws CloneNotSupportedException { + return this.find(aMolecule, true, false); } - + // /** - * TODO + * Find all functional groups in a molecule. + *

+ * Note: The strict input restrictions from previous versions (no charged atoms, metals, metalloids or + * unconnected components) no longer apply by default. They can be re-enabled via the three-parameter + * variant of this method (anAreInputRestrictionsApplied = true). + *

+ * + * @param aMolecule the molecule to identify functional groups in + * @param aShouldInputBeCloned use 'false' to reuse the input container's bonds and atoms in the extraction of the functional + * groups; this may speed up the extraction and lower the memory consumption for processing large + * amounts of data but corrupts the original input container; use 'true' to work with a clone and + * leave the input container intact + * @throws CloneNotSupportedException if cloning is not possible + * @return a list with all functional groups found in the molecule */ - public void setEnvMode(Mode anEnvMode) { - + public List find (IAtomContainer aMolecule, boolean aShouldInputBeCloned) throws CloneNotSupportedException { + return this.find(aMolecule, aShouldInputBeCloned, false); } /** - * TODO + * Find all functional groups in a molecule. + * + * @param aMolecule the molecule to identify functional groups in + * @param aShouldInputBeCloned use 'false' to reuse the input container's bonds and atoms in the extraction of the functional + * groups; this may speed up the extraction and lower the memory consumption for processing large + * amounts of data but corrupts the original input container; use 'true' to work with a clone and + * leave the input container intact + * @param anAreInputRestrictionsApplied if true, the input must consist of one connected structure and may not + * contain charged atoms, metals or metalloids; an IllegalArgumentException will + * be thrown otherwise + * @throws CloneNotSupportedException if cloning is not possible + * @throws IllegalArgumentException if input restrictions are applied and the given molecule does not fulfill them + * @return a list with all functional groups found in the molecule */ - public void isFunctionalGroupEnvironmentGeneralized(boolean aGeneralizeEnvironment) { - + public List find(IAtomContainer aMolecule, boolean aShouldInputBeCloned, boolean anAreInputRestrictionsApplied) + throws CloneNotSupportedException, 
IllegalArgumentException { + IAtomContainer tmpMolecule; + if (aShouldInputBeCloned) { + tmpMolecule = aMolecule.clone(); + } else { + tmpMolecule = aMolecule; + } + if (anAreInputRestrictionsApplied) { + this.checkConstraints(tmpMolecule); + } + for (IAtom tmpAtom : tmpMolecule.atoms()) { + if(Objects.isNull(tmpAtom.getImplicitHydrogenCount())) { + tmpAtom.setImplicitHydrogenCount(0); + } + } + this.bondMap = EdgeToBondMap.withSpaceFor(tmpMolecule); + this.adjList = GraphUtil.toAdjList(tmpMolecule, this.bondMap); + this.markAtoms(tmpMolecule); + // extract raw groups + List tmpFunctionalGroupsList = this.extractGroups(tmpMolecule); + // handle environment + if (this.envMode == Mode.DEFAULT) { + this.expandGeneralizedEnvironments(tmpFunctionalGroupsList); + } else if (this.envMode == Mode.NO_GENERALIZATION) { + this.expandFullEnvironments(tmpFunctionalGroupsList); + } else { + throw new IllegalArgumentException("Unknown mode."); + } + this.clearCache(); + return tmpFunctionalGroupsList; } /** - * TODO + * Clear caches related to the input molecule. */ - private void clearChache() { - + private void clearCache() { + this.bondMap = null; + this.adjList = null; + this.markedAtoms = null; + this.aromaticHeteroAtomIndicesToIsInGroupBoolMap = null; + this.markedAtomToConnectedEnvCMap = null; } /** * Mark all atoms and store them in a set for further processing. * - * @param molecule Molecule with atoms to mark + * @param aMolecule molecule with atoms to mark */ - private void markAtoms(IAtomContainer molecule) { - if(isDbg()) LOGGING_TOOL.debug("########## Starting search for atoms to mark ... ##########"); - + private void markAtoms(IAtomContainer aMolecule) { + if (this.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("########## Starting search for atoms to mark ... 
##########"); + } // store marked atoms - markedAtoms = new HashSet(molecule.getAtomCount()); //Sets.newHashSetWithExpectedSize(molecule.getAtomCount()); + this.markedAtoms = new HashSet<>((int) ((aMolecule.getAtomCount() / 0.75f) + 2), 0.75f); // store aromatic heteroatoms - aromaticHeteroAtomIndicesToIsInGroupBoolMap = new HashMap<>(); - - for(int idx = 0; idx < molecule.getAtomCount(); idx++) { - // skip atoms that already got marked in a previous iteration - if(markedAtoms.contains(idx)) { + this.aromaticHeteroAtomIndicesToIsInGroupBoolMap = new HashMap<>((int) ((aMolecule.getAtomCount() / 0.75f) + 2), 0.75f); + //TODO set and use a more explicit and trustworthy index? + for (int idx = 0; idx < aMolecule.getAtomCount(); idx++) { + // skip atoms that were already marked in a previous iteration + if (this.markedAtoms.contains(idx)) { continue; } - IAtom cAtom = molecule.getAtom(idx); - // skip aromatic atoms but add them to set - if(cAtom.isAromatic()) { - if(isHeteroatom(cAtom)) { - aromaticHeteroAtomIndicesToIsInGroupBoolMap.put(idx, false); + IAtom tmpAtom = aMolecule.getAtom(idx); + // skip aromatic atoms but add aromatic HETERO-atoms to map for later processing + if (tmpAtom.isAromatic()) { + if (ErtlFunctionalGroupsFinder.isHeteroatom(tmpAtom)) { + this.aromaticHeteroAtomIndicesToIsInGroupBoolMap.put(idx, false); } continue; } - - int atomicNr = cAtom.getAtomicNumber(); - + int tmpAtomicNr = tmpAtom.getAtomicNumber(); // if C... - if(atomicNr == 6) { + if (tmpAtomicNr == 6) { boolean isMarked = false; // to detect if foor loop ran with or without marking the C atom int oNSCounter = 0; // count for the number of connected O, N & S atoms for(int connectedIdx : adjList[idx]) { - IAtom connectedAtom = molecule.getAtom(connectedIdx); + IAtom connectedAtom = aMolecule.getAtom(connectedIdx); IBond connectedBond = bondMap.get(idx, connectedIdx); // if connected to Heteroatom or C in aliphatic double or triple bond... 
[CONDITIONS 2.1 & 2.2] @@ -377,14 +400,14 @@ private void markAtoms(IAtomContainer molecule) { // set the current atom as marked and break out of connected atoms if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 2.1/2.2", - idx, cAtom.getSymbol())); + idx, tmpAtom.getSymbol())); isMarked = true; // but check for carbonyl-C before break if(connectedAtom.getAtomicNumber() == 8 && connectedBond.getOrder() == Order.DOUBLE && adjList[idx].length == 3) { if(isDbg()) LOGGING_TOOL.debug(" - was flagged as Carbonly-C"); - cAtom.setProperty(CARBONYL_C_MARKER, true); + tmpAtom.setProperty(CARBONYL_C_MARKER, true); } break; @@ -412,10 +435,10 @@ else if((connectedAtom.getAtomicNumber() == 7 } if(isAllSingleBonds) { oNSCounter++; - if(oNSCounter > 1 && adjList[idx].length + cAtom.getImplicitHydrogenCount() == 4) { + if(oNSCounter > 1 && adjList[idx].length + tmpAtom.getImplicitHydrogenCount() == 4) { // set as marked and break out of connected atoms if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 2.3", - idx, cAtom.getSymbol())); + idx, tmpAtom.getSymbol())); isMarked = true; break; } @@ -423,11 +446,11 @@ else if((connectedAtom.getAtomicNumber() == 7 } // if part of oxirane, aziridine and thiirane ring... 
[CONDITION 2.4] for(int connectedInSphere2Idx : adjList[connectedIdx]) { - IAtom connectedInSphere2Atom = molecule.getAtom(connectedInSphere2Idx); + IAtom connectedInSphere2Atom = aMolecule.getAtom(connectedInSphere2Idx); if(connectedInSphere2Atom.getAtomicNumber() == 6) { for(int connectedInSphere3Idx : adjList[connectedInSphere2Idx]) { - IAtom connectedInSphere3Atom = molecule.getAtom(connectedInSphere3Idx); - if(connectedInSphere3Atom.equals(cAtom)) { + IAtom connectedInSphere3Atom = aMolecule.getAtom(connectedInSphere3Idx); + if(connectedInSphere3Atom.equals(tmpAtom)) { // set connected atoms as marked if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 2.4", connectedInSphere2Idx, connectedInSphere2Atom.getSymbol())); @@ -437,7 +460,7 @@ else if((connectedAtom.getAtomicNumber() == 7 markedAtoms.add(connectedInSphere3Idx); // set current atom as marked and break out of connected atoms if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 2.4", - idx, cAtom.getSymbol())); + idx, tmpAtom.getSymbol())); isMarked = true; break; } @@ -453,11 +476,11 @@ else if((connectedAtom.getAtomicNumber() == 7 // if none of the conditions 2.X apply, we have an unmarked C (not relevant here) } // if H... - else if (atomicNr == 1){ + else if (tmpAtomicNr == 1){ // convert to implicit H IAtom connectedAtom; try { - connectedAtom = molecule.getAtom(adjList[idx][0]); + connectedAtom = aMolecule.getAtom(adjList[idx][0]); } catch(ArrayIndexOutOfBoundsException e) { break; @@ -474,12 +497,12 @@ else if (atomicNr == 1){ } // if heteroatom... (CONDITION 1) else { - if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 1", idx, cAtom.getSymbol())); + if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 1", idx, tmpAtom.getSymbol())); markedAtoms.add(idx); continue; } } - if(isDbg()) LOGGING_TOOL.debug(String.format("########## End of search. Marked %d/%d atoms. 
##########", markedAtoms.size(), molecule.getAtomCount())); + if(isDbg()) LOGGING_TOOL.debug(String.format("########## End of search. Marked %d/%d atoms. ##########", markedAtoms.size(), aMolecule.getAtomCount())); } /** @@ -886,9 +909,6 @@ private boolean checkConstraints(IAtomContainer molecule) { if(!isNonmetal(atom)) { throw new IllegalArgumentException("Input molecule must not contain metals or metalloids."); } - if(atom.getImplicitHydrogenCount() == null) { - atom.setImplicitHydrogenCount(0); - } } ConnectedComponents cc = new ConnectedComponents(adjList); diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java index cd80434..b4ec175 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java @@ -718,7 +718,7 @@ public static IAtomContainer applyFiltersAndPreprocessing(IAtomContainer aMolecu * @throws IllegalArgumentException if the given atom container cannot be passed to ErtlFunctionalGroupsFinder; * check methods for filtering and preprocessing in this case */ - public static List findMarkedAtoms(IAtomContainer aMolecule) throws NullPointerException, IllegalArgumentException { + public static List findMarkedAtoms(IAtomContainer aMolecule) throws NullPointerException, IllegalArgumentException, CloneNotSupportedException { return ErtlFunctionalGroupsFinderUtility.findMarkedAtoms(aMolecule, true); } @@ -735,7 +735,7 @@ public static List findMarkedAtoms(IAtomContainer aMolecule) thr * @throws IllegalArgumentException if the given atom container cannot be passed to ErtlFunctionalGroupsFinder; * check methods for filtering and preprocessing in this case */ - public static List findMarkedAtoms(IAtomContainer aMolecule, boolean areSingleAtomsFiltered) throws NullPointerException, IllegalArgumentException { + public static List 
findMarkedAtoms(IAtomContainer aMolecule, boolean areSingleAtomsFiltered) throws NullPointerException, IllegalArgumentException, CloneNotSupportedException { Objects.requireNonNull(aMolecule, "Given molecule is null."); if (aMolecule.isEmpty()) { return new ArrayList(0); From b1652660e0c105cc3c5a8374b419951f24da3b9c Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Wed, 10 Jan 2024 11:27:36 +0100 Subject: [PATCH 06/27] Refactoring of markAtoms method --- .../cdk/tools/ErtlFunctionalGroupsFinder.java | 215 ++++++++++-------- .../ErtlFunctionalGroupsFinderUtility.java | 2 + 2 files changed, 126 insertions(+), 91 deletions(-) diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index b8370f8..4fa3f2e 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -381,130 +381,163 @@ private void markAtoms(IAtomContainer aMolecule) { int tmpAtomicNr = tmpAtom.getAtomicNumber(); // if C... if (tmpAtomicNr == 6) { - boolean isMarked = false; // to detect if foor loop ran with or without marking the C atom - int oNSCounter = 0; // count for the number of connected O, N & S atoms - for(int connectedIdx : adjList[idx]) { - IAtom connectedAtom = aMolecule.getAtom(connectedIdx); - IBond connectedBond = bondMap.get(idx, connectedIdx); - - // if connected to Heteroatom or C in aliphatic double or triple bond... [CONDITIONS 2.1 & 2.2] - if(connectedAtom.getAtomicNumber() != 1 && ((connectedBond.getOrder() == Order.DOUBLE - || connectedBond.getOrder() == Order.TRIPLE) && !connectedBond.isAromatic())) { - - // set the connected atom as marked - if(markedAtoms.add(connectedIdx)) { - String connectedAtomCondition = connectedAtom.getAtomicNumber() == 6 ? 
"2.1/2.2" : "1"; - if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition %s", - connectedIdx, connectedAtom.getSymbol(), connectedAtomCondition)); + // to detect if for loop ran with or without marking the C atom + boolean tmpIsMarked = false; + // count for the number of connected O, N & S atoms to detect acetal carbons + int tmpConnectedONSatomsCounter = 0; + for (int tmpConnectedIdx : this.adjList[idx]) { + IAtom tmpConnectedAtom = aMolecule.getAtom(tmpConnectedIdx); + IBond tmpConnectedBond = this.bondMap.get(idx, tmpConnectedIdx); + + // if connected to heteroatom or C in aliphatic double or triple bond... [CONDITIONS 2.1 & 2.2] + if (tmpConnectedAtom.getAtomicNumber() != 1 + && ((tmpConnectedBond.getOrder() == Order.DOUBLE || tmpConnectedBond.getOrder() == Order.TRIPLE) + && !tmpConnectedBond.isAromatic())) { + + // set the *connected* atom as marked (add() true if this set did not already contain the specified element) + if (this.markedAtoms.add(tmpConnectedIdx)) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Marking Atom #%d (%s) - Met condition %s", + tmpConnectedIdx, + tmpConnectedAtom.getSymbol(), + tmpConnectedAtom.getAtomicNumber() == 6 ? 
"2.1/2.2" : "1")); + } + } + // set the *current* atom as marked and break out of connected atoms + tmpIsMarked = true; + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Marking Atom #%d (%s) - Met condition 2.1/2.2", + idx, + tmpAtom.getSymbol())); } - - // set the current atom as marked and break out of connected atoms - if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 2.1/2.2", - idx, tmpAtom.getSymbol())); - isMarked = true; - // but check for carbonyl-C before break - if(connectedAtom.getAtomicNumber() == 8 && connectedBond.getOrder() == Order.DOUBLE - && adjList[idx].length == 3) { - if(isDbg()) LOGGING_TOOL.debug(" - was flagged as Carbonly-C"); + if (tmpConnectedAtom.getAtomicNumber() == 8 + && tmpConnectedBond.getOrder() == Order.DOUBLE + && this.adjList[idx].length == 3) { tmpAtom.setProperty(CARBONYL_C_MARKER, true); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("- was flagged as Carbonly-C"); + } } - + // break out of connected atoms break; - } - // if connected to O/N/S in single bond... - else if((connectedAtom.getAtomicNumber() == 7 - || connectedAtom.getAtomicNumber() == 8 - || connectedAtom.getAtomicNumber() == 16) - && connectedBond.getOrder() == Order.SINGLE){ + } else if ((tmpConnectedAtom.getAtomicNumber() == 7 + || tmpConnectedAtom.getAtomicNumber() == 8 + || tmpConnectedAtom.getAtomicNumber() == 16) + && tmpConnectedBond.getOrder() == Order.SINGLE) { + // if connected to O/N/S in single bond... // if connected O/N/S is not aromatic... 
- if(!connectedAtom.isAromatic()) { + if (!tmpConnectedAtom.isAromatic()) { // set the connected O/N/S atom as marked - if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 1", - connectedIdx, connectedAtom.getSymbol())); - markedAtoms.add(connectedIdx); - + this.markedAtoms.add(tmpConnectedIdx); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Marking Atom #%d (%s) - Met condition 1", + tmpConnectedIdx, + tmpConnectedAtom.getSymbol())); + } // if "acetal C" (2+ O/N/S in single bonds connected to sp3-C)... [CONDITION 2.3] - boolean isAllSingleBonds = true; - for(int connectedInSphere2Idx : adjList[connectedIdx]) { - IBond sphere2Bond = bondMap.get(connectedIdx, connectedInSphere2Idx); - if(sphere2Bond.getOrder() != Order.SINGLE) { - isAllSingleBonds = false; + boolean tmpIsAllSingleBonds = true; + for (int tmpConnectedInSphere2Idx : this.adjList[tmpConnectedIdx]) { + IBond tmpSphere2Bond = this.bondMap.get(tmpConnectedIdx, tmpConnectedInSphere2Idx); + if (tmpSphere2Bond.getOrder() != Order.SINGLE) { + tmpIsAllSingleBonds = false; break; } } - if(isAllSingleBonds) { - oNSCounter++; - if(oNSCounter > 1 && adjList[idx].length + tmpAtom.getImplicitHydrogenCount() == 4) { + if (tmpIsAllSingleBonds) { + tmpConnectedONSatomsCounter++; + if (tmpConnectedONSatomsCounter > 1 && this.adjList[idx].length + tmpAtom.getImplicitHydrogenCount() == 4) { // set as marked and break out of connected atoms - if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 2.3", - idx, tmpAtom.getSymbol())); - isMarked = true; + tmpIsMarked = true; + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Marking Atom #%d (%s) - Met condition 2.3", + idx, + tmpAtom.getSymbol())); + } break; } } } - // if part of oxirane, aziridine and thiirane ring... 
[CONDITION 2.4] - for(int connectedInSphere2Idx : adjList[connectedIdx]) { - IAtom connectedInSphere2Atom = aMolecule.getAtom(connectedInSphere2Idx); - if(connectedInSphere2Atom.getAtomicNumber() == 6) { - for(int connectedInSphere3Idx : adjList[connectedInSphere2Idx]) { - IAtom connectedInSphere3Atom = aMolecule.getAtom(connectedInSphere3Idx); - if(connectedInSphere3Atom.equals(tmpAtom)) { + // if part of oxirane, aziridine, or thiirane ring... [CONDITION 2.4] + for (int tmpConnectedInSphere2Idx : this.adjList[tmpConnectedIdx]) { + IAtom tmpConnectedInSphere2Atom = aMolecule.getAtom(tmpConnectedInSphere2Idx); + if (tmpConnectedInSphere2Atom.getAtomicNumber() == 6) { + for (int tmpConnectedInSphere3Idx : this.adjList[tmpConnectedInSphere2Idx]) { + IAtom tmpConnectedInSphere3Atom = aMolecule.getAtom(tmpConnectedInSphere3Idx); + if (tmpConnectedInSphere3Atom.equals(tmpAtom)) { // set connected atoms as marked - if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 2.4", - connectedInSphere2Idx, connectedInSphere2Atom.getSymbol())); - if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 2.4", - connectedInSphere3Idx, connectedInSphere3Atom.getSymbol())); - markedAtoms.add(connectedInSphere2Idx); - markedAtoms.add(connectedInSphere3Idx); + this.markedAtoms.add(tmpConnectedInSphere2Idx); + this.markedAtoms.add(tmpConnectedInSphere3Idx); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Marking Atom #%d (%s) - Met condition 2.4", + tmpConnectedInSphere2Idx, + tmpConnectedInSphere2Atom.getSymbol())); + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Marking Atom #%d (%s) - Met condition 2.4", + tmpConnectedInSphere3Idx, + tmpConnectedInSphere3Atom.getSymbol())); + } // set current atom as marked and break out of connected atoms - if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 2.4", - idx, 
tmpAtom.getSymbol())); - isMarked = true; + tmpIsMarked = true; + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Marking Atom #%d (%s) - Met condition 2.4", + idx, + tmpAtom.getSymbol())); + } break; } } } - } - } - } - if(isMarked) { - markedAtoms.add(idx); + } //end of for loop iterating over second sphere atoms + } // end of else if connected to O/N/S in single bond + } //end of for loop that iterates over all connected atoms of the carbon atom + if (tmpIsMarked) { + this.markedAtoms.add(idx); continue; } // if none of the conditions 2.X apply, we have an unmarked C (not relevant here) - } - // if H... - else if (tmpAtomicNr == 1){ + } else if (tmpAtomicNr == 1){ + // if H... // convert to implicit H - IAtom connectedAtom; + IAtom tmpConnectedAtom; try { - connectedAtom = aMolecule.getAtom(adjList[idx][0]); - } - catch(ArrayIndexOutOfBoundsException e) { + tmpConnectedAtom = aMolecule.getAtom(this.adjList[idx][0]); + } catch(ArrayIndexOutOfBoundsException anException) { + //TODO: do sth here? break; } - - - if(connectedAtom.getImplicitHydrogenCount() == null) { - connectedAtom.setImplicitHydrogenCount(1); - } - else { - connectedAtom.setImplicitHydrogenCount(connectedAtom.getImplicitHydrogenCount() + 1); + if (Objects.isNull(tmpConnectedAtom.getImplicitHydrogenCount())) { + tmpConnectedAtom.setImplicitHydrogenCount(1); + } else { + tmpConnectedAtom.setImplicitHydrogenCount(tmpConnectedAtom.getImplicitHydrogenCount() + 1); } continue; - } - // if heteroatom... (CONDITION 1) - else { - if(isDbg()) LOGGING_TOOL.debug(String.format("Marking Atom #%d (%s) - Met condition 1", idx, tmpAtom.getSymbol())); - markedAtoms.add(idx); + } else { + // if heteroatom... 
(CONDITION 1) + this.markedAtoms.add(idx); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Marking Atom #%d (%s) - Met condition 1", + idx, + tmpAtom.getSymbol())); + } continue; } + } //end of for loop that iterates over all atoms in the mol + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "########## End of search. Marked %d/%d atoms. ##########", + this.markedAtoms.size(), + aMolecule.getAtomCount())); } - if(isDbg()) LOGGING_TOOL.debug(String.format("########## End of search. Marked %d/%d atoms. ##########", markedAtoms.size(), aMolecule.getAtomCount())); } - + // /** * Searches the molecule for groups of connected marked atoms and extracts each as a new functional group. * The extraction process includes marked atom's "environments". Connected H's are captured implicitly. @@ -897,8 +930,8 @@ private List partitionIntoGroups(IAtomContainer sourceContainer, return groups; } - private boolean isDbg() { - return LOGGING_TOOL.isDebugEnabled(); + private static boolean isDbg() { + return ErtlFunctionalGroupsFinder.LOGGING_TOOL.isDebugEnabled(); } private boolean checkConstraints(IAtomContainer molecule) { diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java index b4ec175..0c6cbe9 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java @@ -716,6 +716,7 @@ public static IAtomContainer applyFiltersAndPreprocessing(IAtomContainer aMolecu * @return List of IAtomContainer objects representing the detected functional groups * @throws NullPointerException if the given atom container is null * @throws IllegalArgumentException if the given atom container cannot be passed to ErtlFunctionalGroupsFinder; + * @throws 
CloneNotSupportedException if cloning is not possible * check methods for filtering and preprocessing in this case */ public static List findMarkedAtoms(IAtomContainer aMolecule) throws NullPointerException, IllegalArgumentException, CloneNotSupportedException { @@ -733,6 +734,7 @@ public static List findMarkedAtoms(IAtomContainer aMolecule) thr * @return List of IAtomContainer objects representing the detected functional groups * @throws NullPointerException if the given atom container is null * @throws IllegalArgumentException if the given atom container cannot be passed to ErtlFunctionalGroupsFinder; + * @throws CloneNotSupportedException if cloning is not possible * check methods for filtering and preprocessing in this case */ public static List findMarkedAtoms(IAtomContainer aMolecule, boolean areSingleAtomsFiltered) throws NullPointerException, IllegalArgumentException, CloneNotSupportedException { From 6b222c5c5dd4b50f1125bd56a4f10085ba527807 Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Wed, 10 Jan 2024 16:49:52 +0100 Subject: [PATCH 07/27] Overhaul of extractGroups --- .../cdk/tools/ErtlFunctionalGroupsFinder.java | 174 ++++++++++-------- 1 file changed, 94 insertions(+), 80 deletions(-) diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index 4fa3f2e..b802ca3 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -31,6 +31,7 @@ import org.openscience.cdk.interfaces.IPseudoAtom; import org.openscience.cdk.interfaces.ISingleElectron; +import javax.security.auth.kerberos.KerberosTicket; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Arrays; @@ -540,120 +541,133 @@ private void markAtoms(IAtomContainer aMolecule) { // /** * Searches the molecule for groups of 
connected marked atoms and extracts each as a new functional group. - * The extraction process includes marked atom's "environments". Connected H's are captured implicitly. + * The extraction process includes marked atoms' "environments". Connected H's are captured implicitly. * - * @param molecule the molecule which contains the functional groups + * @param aMolecule the molecule which contains the functional groups * @return a list of all functional groups (including "environments") extracted from the molecule */ - private List extractGroups(IAtomContainer molecule) { - if(isDbg()) LOGGING_TOOL.debug("########## Starting identification & extraction of functional groups... ##########"); - - markedAtomToConnectedEnvCMap = new HashMap>(molecule.getAtomCount());//Maps.newHashMapWithExpectedSize(molecule.getAtomCount()); - int[] atomIdxToFGMap = new int[molecule.getAtomCount()]; - Arrays.fill(atomIdxToFGMap, -1); - int fGroupIdx = -1; - - while(!markedAtoms.isEmpty()) { + private List extractGroups(IAtomContainer aMolecule) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("########## Starting identification & extraction of functional groups... 
##########"); + } + this.markedAtomToConnectedEnvCMap = new HashMap<>((int) ((aMolecule.getAtomCount() / 0.75f) + 2), 0.75f); + int[] tmpAtomIdxToFGMap = new int[aMolecule.getAtomCount()]; + Arrays.fill(tmpAtomIdxToFGMap, -1); + int tmpFunctionalGroupIdx = -1; + while(!this.markedAtoms.isEmpty()) { // search for another functional group - fGroupIdx++; - + tmpFunctionalGroupIdx++; // get next markedAtom as the starting node for the search - int beginIdx = markedAtoms.iterator().next(); - if(isDbg()) LOGGING_TOOL.debug(String.format("Searching new functional group from atom #%d (%s)...", beginIdx, molecule.getAtom(beginIdx).getSymbol())); - + int tmpBeginIdx = this.markedAtoms.iterator().next(); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Searching new functional group from atom #%d (%s)...", + tmpBeginIdx, + aMolecule.getAtom(tmpBeginIdx).getSymbol())); + } // do a BFS from there - Queue queue = new ArrayDeque<>(); - queue.add(beginIdx); - - while(!queue.isEmpty()) { - int currentIdx = queue.poll(); - + Queue tmpQueue = new ArrayDeque<>(); + tmpQueue.add(tmpBeginIdx); + while(!tmpQueue.isEmpty()) { + int tmpCurrentQueueIdx = tmpQueue.poll(); // we are only interested in marked atoms that are not yet included in a group - if(!markedAtoms.contains(currentIdx)){ + if(!this.markedAtoms.contains(tmpCurrentQueueIdx)){ continue; } - // if it isn't... 
- IAtom currentAtom = molecule.getAtom(currentIdx); - if(isDbg()) LOGGING_TOOL.debug(String.format(" visiting marked atom: #%d (%s)", currentIdx, currentAtom.getSymbol())); - + IAtom tmpCurrentAtom = aMolecule.getAtom(tmpCurrentQueueIdx); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format(" visiting marked atom: #%d (%s)", + tmpCurrentQueueIdx, + tmpCurrentAtom.getSymbol())); + } // add its index to the functional group - atomIdxToFGMap[currentIdx] = fGroupIdx; + tmpAtomIdxToFGMap[tmpCurrentQueueIdx] = tmpFunctionalGroupIdx; // also scratch the index from markedAtoms - markedAtoms.remove(currentIdx); - + this. markedAtoms.remove(tmpCurrentQueueIdx); // and take look at the connected atoms - List currentEnvironment = new ArrayList<>(); - for(int connectedIdx : adjList[currentIdx]) { + List tmpCurrentEnvironment = new ArrayList<>(); + for (int tmpConnectedIdx : this.adjList[tmpCurrentQueueIdx]) { // add connected marked atoms to queue - if(markedAtoms.contains(connectedIdx)) { - queue.add(connectedIdx); + if(this.markedAtoms.contains(tmpConnectedIdx)) { + tmpQueue.add(tmpConnectedIdx); continue; } - // ignore already handled connected atoms - if(atomIdxToFGMap[connectedIdx] >= 0){ + if (tmpAtomIdxToFGMap[tmpConnectedIdx] >= 0) { continue; } - // add unmarked connected aromatic heteroatoms - IAtom connectedAtom = molecule.getAtom(connectedIdx); - if(isHeteroatom(connectedAtom) && connectedAtom.isAromatic()) { - if(isDbg()) LOGGING_TOOL.debug(" added connected aromatic heteroatom " + connectedAtom.getSymbol()); - atomIdxToFGMap[connectedIdx] = fGroupIdx; + IAtom tmpConnectedAtom = aMolecule.getAtom(tmpConnectedIdx); + if (ErtlFunctionalGroupsFinder.isHeteroatom(tmpConnectedAtom) && tmpConnectedAtom.isAromatic()) { + tmpAtomIdxToFGMap[tmpConnectedIdx] = tmpFunctionalGroupIdx; // note that this aromatic heteroatom has been added to a group - aromaticHeteroAtomIndicesToIsInGroupBoolMap.put(connectedIdx, true); + 
this.aromaticHeteroAtomIndicesToIsInGroupBoolMap.put(tmpConnectedIdx, true); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(" added connected aromatic heteroatom " + + tmpConnectedAtom.getSymbol()); + } } - // add unmarked connected atoms to current marked atom's environment - IBond connectedBond = bondMap.get(currentIdx, connectedIdx); - - EnvironmentalCType type; - if (connectedAtom.getAtomicNumber() == 6) { - if(connectedAtom.isAromatic()) - type = EnvironmentalCType.C_AROMATIC; - else - type = EnvironmentalCType.C_ALIPHATIC; + IBond tmpConnectedBond = this.bondMap.get(tmpCurrentQueueIdx, tmpConnectedIdx); + EnvironmentalCType tmpEnvironmentalCType; + if (tmpConnectedAtom.getAtomicNumber() == 6) { + if (tmpConnectedAtom.isAromatic()) { + tmpEnvironmentalCType = EnvironmentalCType.C_AROMATIC; + } else { + tmpEnvironmentalCType = EnvironmentalCType.C_ALIPHATIC; + } } else { // aromatic heteroatom, so just ignore continue; } - currentEnvironment.add(new EnvironmentalC(type, connectedBond, connectedBond.getBegin() == connectedAtom ? 0 : 1)); + tmpCurrentEnvironment.add(new EnvironmentalC( + tmpEnvironmentalCType, + tmpConnectedBond, + tmpConnectedBond.getBegin().equals(tmpConnectedAtom) ? 
0 : 1)); } - markedAtomToConnectedEnvCMap.put(currentAtom, currentEnvironment); - + this.markedAtomToConnectedEnvCMap.put(tmpCurrentAtom, tmpCurrentEnvironment); // debug logging - if(isDbg()) { - int cAromCount = 0, cAliphCount = 0; - for(EnvironmentalC comp : currentEnvironment) { - if(comp.getType() == EnvironmentalCType.C_AROMATIC) - cAromCount++; - else if(comp.getType() == EnvironmentalCType.C_ALIPHATIC) - cAliphCount++; + if (ErtlFunctionalGroupsFinder.isDbg()) { + int tmpCAromCount = 0, tmpCAliphCount = 0; + for(EnvironmentalC tmpEnvC : tmpCurrentEnvironment) { + if (tmpEnvC.getType() == EnvironmentalCType.C_AROMATIC) { + tmpCAromCount++; + } else if (tmpEnvC.getType() == EnvironmentalCType.C_ALIPHATIC) { + tmpCAliphCount++; + } } - LOGGING_TOOL.debug(String.format(" logged marked atom's environment: C_ar:%d, C_al:%d (and %d implicit hydrogens)", cAromCount, cAliphCount, currentAtom.getImplicitHydrogenCount())); + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + " logged marked atom's environment: C_ar:%d, C_al:%d (and %d implicit hydrogens)", + tmpCAromCount, + tmpCAliphCount, + tmpCurrentAtom.getImplicitHydrogenCount())); } } - - if(isDbg()) LOGGING_TOOL.debug(" search completed."); - } - - // also create FG for lone aromatic heteroatoms, not connected to a FG yet. - for(int atomIdx : aromaticHeteroAtomIndicesToIsInGroupBoolMap.keySet()) { - if(!aromaticHeteroAtomIndicesToIsInGroupBoolMap.get(atomIdx)) { - fGroupIdx++; - atomIdxToFGMap[atomIdx] = fGroupIdx; - if(isDbg()) LOGGING_TOOL.debug("Created FG for lone aromatic heteroatom: " + molecule.getAtom(atomIdx).getSymbol()); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(" search completed."); + } + } //markedAtoms is empty now + // also create FG for lone aromatic heteroatoms, not connected to an FG yet. 
+ for (int tmpAtomIdx : this.aromaticHeteroAtomIndicesToIsInGroupBoolMap.keySet()) { + if (!this.aromaticHeteroAtomIndicesToIsInGroupBoolMap.get(tmpAtomIdx)) { + tmpFunctionalGroupIdx++; + tmpAtomIdxToFGMap[tmpAtomIdx] = tmpFunctionalGroupIdx; + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("Created FG for lone aromatic heteroatom: " + + aMolecule.getAtom(tmpAtomIdx).getSymbol()); + } } } - - List fGs = partitionIntoGroups(molecule, atomIdxToFGMap, fGroupIdx + 1); - - if(isDbg()) LOGGING_TOOL.debug(String.format("########## Found & extracted %d functional groups. ##########", fGroupIdx + 1)); - return fGs; + List tmpFunctionalGroupsList = this.partitionIntoGroups(aMolecule, tmpAtomIdxToFGMap, tmpFunctionalGroupIdx + 1); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format("########## Found & extracted %d functional groups. ##########", + tmpFunctionalGroupIdx + 1)); + } + return tmpFunctionalGroupsList; } - + // /** * Generalizes the full environments of functional groups, providing a good balance between preserving * meaningful detail and generalization. @@ -849,12 +863,12 @@ private void expandEnvironmentGeneralized(IAtom atom, IAtomContainer container) if(isDbg()) LOGGING_TOOL.debug(String.format(" expanded environment: %dx R-atom (incl. 
%d for H replacement)", rAtomCount, rAtomCount - rAtomsForCCount)); } - private static final boolean isHeteroatom(IAtom atom) { + private static boolean isHeteroatom(IAtom atom) { int atomicNr = atom.getAtomicNumber(); return atomicNr != 1 && atomicNr != 6; } - private final boolean isNonmetal(IAtom atom) { + private boolean isNonmetal(IAtom atom) { return NONMETAL_ATOMIC_NUMBERS.contains(atom.getAtomicNumber()); } From 9699ef18061df7b64274c8eedb3f1bf967a57213 Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Thu, 11 Jan 2024 16:51:49 +0100 Subject: [PATCH 08/27] Overhaul of expandGeneralizedEnvironments, WIP --- .../cdk/tools/ErtlFunctionalGroupsFinder.java | 158 ++++++++++-------- 1 file changed, 86 insertions(+), 72 deletions(-) diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index b802ca3..3df74ec 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -551,10 +551,10 @@ private List extractGroups(IAtomContainer aMolecule) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("########## Starting identification & extraction of functional groups... 
##########"); } this.markedAtomToConnectedEnvCMap = new HashMap<>((int) ((aMolecule.getAtomCount() / 0.75f) + 2), 0.75f); - int[] tmpAtomIdxToFGMap = new int[aMolecule.getAtomCount()]; - Arrays.fill(tmpAtomIdxToFGMap, -1); + int[] tmpAtomIdxToFGArray = new int[aMolecule.getAtomCount()]; + Arrays.fill(tmpAtomIdxToFGArray, -1); int tmpFunctionalGroupIdx = -1; - while(!this.markedAtoms.isEmpty()) { + while (!this.markedAtoms.isEmpty()) { // search for another functional group tmpFunctionalGroupIdx++; // get next markedAtom as the starting node for the search @@ -568,10 +568,10 @@ private List extractGroups(IAtomContainer aMolecule) { // do a BFS from there Queue tmpQueue = new ArrayDeque<>(); tmpQueue.add(tmpBeginIdx); - while(!tmpQueue.isEmpty()) { + while (!tmpQueue.isEmpty()) { int tmpCurrentQueueIdx = tmpQueue.poll(); // we are only interested in marked atoms that are not yet included in a group - if(!this.markedAtoms.contains(tmpCurrentQueueIdx)){ + if (!this.markedAtoms.contains(tmpCurrentQueueIdx)) { continue; } // if it isn't... @@ -582,25 +582,25 @@ private List extractGroups(IAtomContainer aMolecule) { tmpCurrentAtom.getSymbol())); } // add its index to the functional group - tmpAtomIdxToFGMap[tmpCurrentQueueIdx] = tmpFunctionalGroupIdx; + tmpAtomIdxToFGArray[tmpCurrentQueueIdx] = tmpFunctionalGroupIdx; // also scratch the index from markedAtoms - this. 
markedAtoms.remove(tmpCurrentQueueIdx); - // and take look at the connected atoms + this.markedAtoms.remove(tmpCurrentQueueIdx); + // and take a look at the connected atoms List tmpCurrentEnvironment = new ArrayList<>(); for (int tmpConnectedIdx : this.adjList[tmpCurrentQueueIdx]) { // add connected marked atoms to queue - if(this.markedAtoms.contains(tmpConnectedIdx)) { + if (this.markedAtoms.contains(tmpConnectedIdx)) { tmpQueue.add(tmpConnectedIdx); continue; } // ignore already handled connected atoms - if (tmpAtomIdxToFGMap[tmpConnectedIdx] >= 0) { + if (tmpAtomIdxToFGArray[tmpConnectedIdx] >= 0) { continue; } // add unmarked connected aromatic heteroatoms IAtom tmpConnectedAtom = aMolecule.getAtom(tmpConnectedIdx); if (ErtlFunctionalGroupsFinder.isHeteroatom(tmpConnectedAtom) && tmpConnectedAtom.isAromatic()) { - tmpAtomIdxToFGMap[tmpConnectedIdx] = tmpFunctionalGroupIdx; + tmpAtomIdxToFGArray[tmpConnectedIdx] = tmpFunctionalGroupIdx; // note that this aromatic heteroatom has been added to a group this.aromaticHeteroAtomIndicesToIsInGroupBoolMap.put(tmpConnectedIdx, true); if (ErtlFunctionalGroupsFinder.isDbg()) { @@ -626,7 +626,7 @@ private List extractGroups(IAtomContainer aMolecule) { tmpEnvironmentalCType, tmpConnectedBond, tmpConnectedBond.getBegin().equals(tmpConnectedAtom) ? 
0 : 1)); - } + } //end of loop of connected atoms this.markedAtomToConnectedEnvCMap.put(tmpCurrentAtom, tmpCurrentEnvironment); // debug logging if (ErtlFunctionalGroupsFinder.isDbg()) { @@ -644,7 +644,7 @@ private List extractGroups(IAtomContainer aMolecule) { tmpCAliphCount, tmpCurrentAtom.getImplicitHydrogenCount())); } - } + } // end of BFS if (ErtlFunctionalGroupsFinder.isDbg()) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(" search completed."); } @@ -653,14 +653,14 @@ private List extractGroups(IAtomContainer aMolecule) { for (int tmpAtomIdx : this.aromaticHeteroAtomIndicesToIsInGroupBoolMap.keySet()) { if (!this.aromaticHeteroAtomIndicesToIsInGroupBoolMap.get(tmpAtomIdx)) { tmpFunctionalGroupIdx++; - tmpAtomIdxToFGMap[tmpAtomIdx] = tmpFunctionalGroupIdx; + tmpAtomIdxToFGArray[tmpAtomIdx] = tmpFunctionalGroupIdx; if (ErtlFunctionalGroupsFinder.isDbg()) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("Created FG for lone aromatic heteroatom: " + aMolecule.getAtom(tmpAtomIdx).getSymbol()); } } } - List tmpFunctionalGroupsList = this.partitionIntoGroups(aMolecule, tmpAtomIdxToFGMap, tmpFunctionalGroupIdx + 1); + List tmpFunctionalGroupsList = this.partitionIntoGroups(aMolecule, tmpAtomIdxToFGArray, tmpFunctionalGroupIdx + 1); if (ErtlFunctionalGroupsFinder.isDbg()) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format("########## Found & extracted %d functional groups. ##########", tmpFunctionalGroupIdx + 1)); @@ -669,72 +669,85 @@ private List extractGroups(IAtomContainer aMolecule) { } // /** - * Generalizes the full environments of functional groups, providing a good balance between preserving - * meaningful detail and generalization. + * Generalizes the full environments of functional groups, according to the Ertl generalization algorithm, providing + * a good balance between preserving meaningful detail and generalization. 
* - * @param fGroups the list of functional groups including "environments" + * @param aFunctionalGroupsList the list of functional groups including "environments" */ - private void expandGeneralizedEnvironments(List fGroups){ - if(isDbg()) LOGGING_TOOL.debug("########## Starting generalization of functional groups... ##########"); - - for(IAtomContainer fGroup : fGroups) { - int atomCount = fGroup.getAtomCount(); - - if(isDbg()) LOGGING_TOOL.debug(String.format("Generalizing functional group (%d atoms)...", atomCount)); - - // prechecking for special cases... - if(fGroup.getAtomCount() == 1) { - IAtom atom = fGroup.getAtom(0); - List environment = markedAtomToConnectedEnvCMap.get(atom); - - if(environment != null) { - int envCCount = environment.size(); - - // for H2N-C_env & HO-C_env -> do not replace H & C_env by R! - if((atom.getAtomicNumber() == 8 && envCCount == 1) - || (atom.getAtomicNumber() == 7 && envCCount == 1)){ - if(isDbg()) LOGGING_TOOL.debug(String.format(" - found single atomic N or O FG with one env. C. Expanding environment...", atom.getSymbol())); - expandEnvironment(atom, fGroup); - - int hCount = atom.getImplicitHydrogenCount(); - if(hCount != 0) { - if(isDbg()) LOGGING_TOOL.debug(String.format(" - adding %d hydrogens...", hCount)); - addHydrogens(atom, hCount, fGroup); - atom.setImplicitHydrogenCount(0); + private void expandGeneralizedEnvironments(List aFunctionalGroupsList) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("########## Starting generalization of functional groups... ##########"); + } + for (IAtomContainer tmpFunctionalGroup : aFunctionalGroupsList) { + int tmpAtomCount = tmpFunctionalGroup.getAtomCount(); + if(ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format("Generalizing functional group (%d atoms)...", tmpAtomCount)); + } + // pre-checking for special cases... 
+ if (tmpFunctionalGroup.getAtomCount() == 1) { + IAtom tmpAtom = tmpFunctionalGroup.getAtom(0); + List tmpEnvironment = this.markedAtomToConnectedEnvCMap.get(tmpAtom); + + if (!Objects.isNull(tmpEnvironment)) { + int tmpEnvCCount = tmpEnvironment.size(); + // for H2N-C_env & HO-C_env -> do not replace H & C_env by R to differentiate primary/secondary/tertiary amine and alcohol vs. phenol + if ((tmpAtom.getAtomicNumber() == 8 && tmpEnvCCount == 1) + || (tmpAtom.getAtomicNumber() == 7 && tmpEnvCCount == 1)) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + " - found single atomic N or O FG with one env. C. Expanding environment...", + tmpAtom.getSymbol())); + } + this.expandEnvironment(tmpAtom, tmpFunctionalGroup); + int tmpAtomImplicitHydrogenCount = tmpAtom.getImplicitHydrogenCount(); + if (tmpAtomImplicitHydrogenCount != 0) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + " - adding %d hydrogens...", tmpAtomImplicitHydrogenCount)); } - continue; + this.addHydrogens(tmpAtom, tmpAtomImplicitHydrogenCount, tmpFunctionalGroup); + tmpAtom.setImplicitHydrogenCount(0); + } + continue; } // for HN-(C_env)-C_env & HS-C_env -> do not replace H by R! (only C_env!) - if((atom.getAtomicNumber() == 7 && envCCount == 2) - || (atom.getAtomicNumber() == 16 && envCCount == 1)) { - if(isDbg()) LOGGING_TOOL.debug(" - found sec. amine or simple thiol"); - int hCount = atom.getImplicitHydrogenCount(); - if(hCount != 0) { - if(isDbg()) LOGGING_TOOL.debug(String.format(" - adding %d hydrogens...", hCount)); - addHydrogens(atom, hCount, fGroup); - atom.setImplicitHydrogenCount(0); + if ((tmpAtom.getAtomicNumber() == 7 && tmpEnvCCount == 2) + || (tmpAtom.getAtomicNumber() == 16 && tmpEnvCCount == 1)) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(" - found sec. 
amine or simple thiol"); + } + int tmpAtomImplicitHydrogenCount = tmpAtom.getImplicitHydrogenCount(); + if (tmpAtomImplicitHydrogenCount != 0) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format(" - adding %d hydrogens...", + tmpAtomImplicitHydrogenCount)); + } + this.addHydrogens(tmpAtom, tmpAtomImplicitHydrogenCount, tmpFunctionalGroup); + tmpAtom.setImplicitHydrogenCount(0); + } + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(" - expanding environment..."); } - if(isDbg()) LOGGING_TOOL.debug(" - expanding environment..."); - expandEnvironmentGeneralized(atom, fGroup); + this.expandEnvironmentGeneralized(tmpAtom, tmpFunctionalGroup); continue; } - } - else if(isHeteroatom(atom)) { - int rAtomCount = atom.getValency(); - Integer hCount = atom.getImplicitHydrogenCount(); + } else if (ErtlFunctionalGroupsFinder.isHeteroatom(tmpAtom)) { + // env is null and marked atoms is a hetero atom -> single aromatic heteroatom + int rAtomCount = tmpAtom.getValency(); + Integer hCount = tmpAtom.getImplicitHydrogenCount(); if(hCount != null && hCount != 0) { - atom.setImplicitHydrogenCount(0); + tmpAtom.setImplicitHydrogenCount(0); } - String atomTypeName = atom.getAtomTypeName(); - if(isDbg()) LOGGING_TOOL.debug(String.format(" - found single aromatic heteroatom (%s, Atomtype %s). Adding %d R-Atoms...", atom.getSymbol(), atomTypeName, rAtomCount)); - addRAtoms(atom, rAtomCount, fGroup); + String atomTypeName = tmpAtom.getAtomTypeName(); + if(isDbg()) LOGGING_TOOL.debug(String.format(" - found single aromatic heteroatom (%s, Atomtype %s). 
Adding %d R-Atoms...", tmpAtom.getSymbol(), atomTypeName, rAtomCount)); + addRAtoms(tmpAtom, rAtomCount, tmpFunctionalGroup); continue; } } // get atoms to process - List fGroupAtoms = new ArrayList(fGroup.getAtomCount());//Lists.newArrayList(fGroup.atoms()); - fGroup.atoms().forEach(fGroupAtoms::add); + List fGroupAtoms = new ArrayList(tmpFunctionalGroup.getAtomCount());//Lists.newArrayList(fGroup.atoms()); + tmpFunctionalGroup.atoms().forEach(fGroupAtoms::add); // process atoms... for(IAtom atom : fGroupAtoms) { @@ -746,7 +759,7 @@ else if(isHeteroatom(atom)) { } int rAtomCount = atom.getValency() - 1; if(isDbg()) LOGGING_TOOL.debug(String.format(" - found connected aromatic heteroatom (%s). Adding %d R-Atoms...", atom.getSymbol(), rAtomCount)); - addRAtoms(atom, rAtomCount, fGroup); + addRAtoms(atom, rAtomCount, tmpFunctionalGroup); } // processing carbons... @@ -760,22 +773,23 @@ else if(isHeteroatom(atom)) { } else { if(isDbg()) LOGGING_TOOL.debug(" - found carbonyl-carbon. Expanding environment..."); - expandEnvironmentGeneralized(atom, fGroup); + expandEnvironmentGeneralized(atom, tmpFunctionalGroup); continue; } } // processing heteroatoms... else { if(isDbg()) LOGGING_TOOL.debug(String.format(" - found heteroatom (%s). Expanding environment...", atom.getSymbol())); - expandEnvironmentGeneralized(atom, fGroup); + expandEnvironmentGeneralized(atom, tmpFunctionalGroup); continue; } } + } //end of loop over given functional groups list + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("########## Generalization of functional groups completed. ##########"); } - - if(isDbg()) LOGGING_TOOL.debug("########## Generalization of functional groups completed. ##########"); } - + // /** * Expands the full environments of functional groups, converted into atoms and bonds. 
* From d6f9004662c10a5edf7859ecfc6e8ca25847de78 Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Fri, 12 Jan 2024 10:05:56 +0100 Subject: [PATCH 09/27] Continued refactoring --- .../cdk/tools/ErtlFunctionalGroupsFinder.java | 150 +++++++++++------- 1 file changed, 91 insertions(+), 59 deletions(-) diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index 3df74ec..3329830 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -169,7 +169,8 @@ public IBond createBond(IAtom aTargetAtom, IAtom anEnvCAtom) { } // /** - * CDK logging tool instance for this class. + * CDK logging tool instance for this class. Use ErtlFunctionalGroupsFinder.LOGGING_TOOL.setLevel(ILoggingTool.DEBUG); + * to activate debug messages. */ public static final ILoggingTool LOGGING_TOOL = LoggingToolFactory.createLoggingTool(ErtlFunctionalGroupsFinder.class); // @@ -733,54 +734,65 @@ private void expandGeneralizedEnvironments(List aFunctionalGroup } } else if (ErtlFunctionalGroupsFinder.isHeteroatom(tmpAtom)) { // env is null and marked atoms is a hetero atom -> single aromatic heteroatom - int rAtomCount = tmpAtom.getValency(); - Integer hCount = tmpAtom.getImplicitHydrogenCount(); - if(hCount != null && hCount != 0) { + int tmpRAtomCount = tmpAtom.getValency(); + Integer tmpAtomImplicitHydrogenCount = tmpAtom.getImplicitHydrogenCount(); + if (tmpAtomImplicitHydrogenCount != null && tmpAtomImplicitHydrogenCount != 0) { tmpAtom.setImplicitHydrogenCount(0); } - String atomTypeName = tmpAtom.getAtomTypeName(); - if(isDbg()) LOGGING_TOOL.debug(String.format(" - found single aromatic heteroatom (%s, Atomtype %s). 
Adding %d R-Atoms...", tmpAtom.getSymbol(), atomTypeName, rAtomCount)); - addRAtoms(tmpAtom, rAtomCount, tmpFunctionalGroup); + String tmpAtomTypeName = tmpAtom.getAtomTypeName(); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + " - found single aromatic heteroatom (%s, Atomtype %s). Adding %d R-Atoms...", + tmpAtom.getSymbol(), + tmpAtomTypeName, + tmpRAtomCount)); + } + this.addRAtoms(tmpAtom, tmpRAtomCount, tmpFunctionalGroup); continue; } - } - + } // end of pre-check for special one-atom FG cases // get atoms to process - List fGroupAtoms = new ArrayList(tmpFunctionalGroup.getAtomCount());//Lists.newArrayList(fGroup.atoms()); - tmpFunctionalGroup.atoms().forEach(fGroupAtoms::add); - - // process atoms... - for(IAtom atom : fGroupAtoms) { - List environment = markedAtomToConnectedEnvCMap.get(atom); - - if(environment == null) { - if(atom.getImplicitHydrogenCount() != 0) { - atom.setImplicitHydrogenCount(0); + List tmpFunctionalGroupAtoms = new ArrayList<>(tmpFunctionalGroup.getAtomCount()); + tmpFunctionalGroup.atoms().forEach(tmpFunctionalGroupAtoms::add); + // process individual functional group atoms... + for (IAtom tmpFunctionalGroupAtom : tmpFunctionalGroupAtoms) { + List tmpFGenvCs = this.markedAtomToConnectedEnvCMap.get(tmpFunctionalGroupAtom); + if (tmpFGenvCs == null) { + if (tmpFunctionalGroupAtom.getImplicitHydrogenCount() != 0) { + tmpFunctionalGroupAtom.setImplicitHydrogenCount(0); + } + int tmpRAtomCount = tmpFunctionalGroupAtom.getValency() - 1; + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + " - found connected aromatic heteroatom (%s). Adding %d R-Atoms...", + tmpFunctionalGroupAtom.getSymbol(), + tmpRAtomCount)); } - int rAtomCount = atom.getValency() - 1; - if(isDbg()) LOGGING_TOOL.debug(String.format(" - found connected aromatic heteroatom (%s). 
Adding %d R-Atoms...", atom.getSymbol(), rAtomCount)); - addRAtoms(atom, rAtomCount, tmpFunctionalGroup); + this.addRAtoms(tmpFunctionalGroupAtom, tmpRAtomCount, tmpFunctionalGroup); } - // processing carbons... - if(atom.getAtomicNumber() == 6) { - if(atom.getProperty(CARBONYL_C_MARKER) == null) { - if(atom.getImplicitHydrogenCount() != 0) { - atom.setImplicitHydrogenCount(0); + if (tmpFunctionalGroupAtom.getAtomicNumber() == 6) { + if (Objects.isNull(tmpFunctionalGroupAtom.getProperty(ErtlFunctionalGroupsFinder.CARBONYL_C_MARKER))) { + if (tmpFunctionalGroupAtom.getImplicitHydrogenCount() != 0) { + tmpFunctionalGroupAtom.setImplicitHydrogenCount(0); + } + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(" - ignoring environment for marked carbon atom"); } - if(isDbg()) LOGGING_TOOL.debug(" - ignoring environment for marked carbon atom"); continue; - } - else { - if(isDbg()) LOGGING_TOOL.debug(" - found carbonyl-carbon. Expanding environment..."); - expandEnvironmentGeneralized(atom, tmpFunctionalGroup); + } else { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(" - found carbonyl-carbon. Expanding environment..."); + } + this.expandEnvironmentGeneralized(tmpFunctionalGroupAtom, tmpFunctionalGroup); continue; } - } - // processing heteroatoms... - else { - if(isDbg()) LOGGING_TOOL.debug(String.format(" - found heteroatom (%s). Expanding environment...", atom.getSymbol())); - expandEnvironmentGeneralized(atom, tmpFunctionalGroup); + } else { // processing heteroatoms... + if (ErtlFunctionalGroupsFinder.isDbg()) { + LOGGING_TOOL.debug(String.format(" - found heteroatom (%s). 
Expanding environment...", + tmpFunctionalGroupAtom.getSymbol())); + } + this.expandEnvironmentGeneralized(tmpFunctionalGroupAtom, tmpFunctionalGroup); continue; } } @@ -793,33 +805,46 @@ private void expandGeneralizedEnvironments(List aFunctionalGroup /** * Expands the full environments of functional groups, converted into atoms and bonds. * - * @param fGroups the list of functional groups including "environments" + * @param aFunctionalGroupsList the list of functional groups including their "environments" */ - private void expandFullEnvironments(List fGroups) { - if(isDbg()) LOGGING_TOOL.debug("########## Starting expansion of full environments for functional groups... ##########"); - - for(IAtomContainer fGroup : fGroups) { - int atomCount = fGroup.getAtomCount(); - if(isDbg()) LOGGING_TOOL.debug(String.format("Expanding environment on functional group (%d atoms)...", atomCount)); - - for(int i = 0; i < atomCount; i++) { - IAtom atom = fGroup.getAtom(i); - - if(isDbg()) LOGGING_TOOL.debug(String.format(" - Atom #%d - Expanding environment...", i)); - expandEnvironment(atom, fGroup); - - int hCount = atom.getImplicitHydrogenCount(); - if(hCount != 0) { - if(isDbg()) LOGGING_TOOL.debug(String.format(" - adding %d hydrogens...", hCount)); - addHydrogens(atom, hCount, fGroup); - atom.setImplicitHydrogenCount(0); + private void expandFullEnvironments(List aFunctionalGroupsList) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("########## Starting expansion of full environments for functional groups... 
##########"); + } + for (IAtomContainer tmpFunctionalGroup : aFunctionalGroupsList) { + int tmpAtomCount = tmpFunctionalGroup.getAtomCount(); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Expanding environment on functional group (%d atoms)...", tmpAtomCount)); + } + for (int i = 0; i < tmpAtomCount; i++) { + IAtom tmpFunctionalGroupAtom = tmpFunctionalGroup.getAtom(i); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + " - Atom #%d - Expanding environment...", i)); + } + this.expandEnvironment(tmpFunctionalGroupAtom, tmpFunctionalGroup); + int tmpImplicitHydrogenCount = tmpFunctionalGroupAtom.getImplicitHydrogenCount(); + if (tmpImplicitHydrogenCount != 0) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + " - adding %d hydrogens...", tmpImplicitHydrogenCount)); + } + this.addHydrogens(tmpFunctionalGroupAtom, tmpImplicitHydrogenCount, tmpFunctionalGroup); + tmpFunctionalGroupAtom.setImplicitHydrogenCount(0); } } } - - if(isDbg()) LOGGING_TOOL.debug("########## Expansion of full environments for functional groups completed. ##########"); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("########## Expansion of full environments for functional groups completed. ##########"); + } } - + // + /** + * TODO + * @param atom + * @param container + */ private void expandEnvironment(IAtom atom, IAtomContainer container) { List environment = markedAtomToConnectedEnvCMap.get(atom); @@ -957,7 +982,14 @@ private List partitionIntoGroups(IAtomContainer sourceContainer, return groups; } + // + /** + * + * Use ErtlFunctionalGroupsFinder.LOGGING_TOOL.setLevel(ILoggingTool.DEBUG); to activate debug messages. 
+ * + * @return + */ private static boolean isDbg() { return ErtlFunctionalGroupsFinder.LOGGING_TOOL.isDebugEnabled(); } From 9d8edf625d6768cdab6b674a02929765e1d1c319 Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Fri, 12 Jan 2024 17:02:02 +0100 Subject: [PATCH 10/27] Overhaul of more methods, WIP --- .../cdk/tools/ErtlFunctionalGroupsFinder.java | 210 +++++++++++------- 1 file changed, 132 insertions(+), 78 deletions(-) diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index 3329830..1f015d2 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -789,7 +789,7 @@ private void expandGeneralizedEnvironments(List aFunctionalGroup } } else { // processing heteroatoms... if (ErtlFunctionalGroupsFinder.isDbg()) { - LOGGING_TOOL.debug(String.format(" - found heteroatom (%s). Expanding environment...", + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format(" - found heteroatom (%s). Expanding environment...", tmpFunctionalGroupAtom.getSymbol())); } this.expandEnvironmentGeneralized(tmpFunctionalGroupAtom, tmpFunctionalGroup); @@ -841,98 +841,147 @@ private void expandFullEnvironments(List aFunctionalGroupsList) } // /** - * TODO - * @param atom - * @param container + * Expand the environment of one atom in a functional group. Takes all environmental C atoms cached earlier and + * re-adds them to the atom as environment. 
+ * + * @param aFunctionalGroupAtom the atom whose environment to expand + * @param aFunctionalGroup the functional group container that the atom is part of */ - private void expandEnvironment(IAtom atom, IAtomContainer container) { - List environment = markedAtomToConnectedEnvCMap.get(atom); + private void expandEnvironment(IAtom aFunctionalGroupAtom, IAtomContainer aFunctionalGroup) { + List tmpEnvCAtomsList = this.markedAtomToConnectedEnvCMap.get(aFunctionalGroupAtom); - if(environment == null || environment.isEmpty()) { - if(isDbg()) LOGGING_TOOL.debug(" found no environment to expand."); + if (Objects.isNull(tmpEnvCAtomsList) || tmpEnvCAtomsList.isEmpty()) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(" found no environment to expand."); + } return; } - - int cAromCount = 0, cAliphCount = 0; - for(EnvironmentalC envC : environment) { - IAtom cAtom = atom.getBuilder().newInstance(IAtom.class, "C"); - cAtom.setAtomTypeName("C"); - cAtom.setImplicitHydrogenCount(0); - if(envC.getType() == EnvironmentalCType.C_AROMATIC) { - cAtom.setIsAromatic(true); - cAromCount++; - } - else { - cAliphCount++; + int tmpAromaticCAtomCount = 0, tmpAliphaticCAtomCount = 0; + for (EnvironmentalC tmpEnvCAtom : tmpEnvCAtomsList) { + IAtom tmpCAtom = aFunctionalGroupAtom.getBuilder().newInstance(IAtom.class, "C"); + tmpCAtom.setAtomTypeName("C"); + tmpCAtom.setImplicitHydrogenCount(0); + if (tmpEnvCAtom.getType() == EnvironmentalCType.C_AROMATIC) { + tmpCAtom.setIsAromatic(true); + tmpAromaticCAtomCount++; + } else { + tmpAliphaticCAtomCount++; } - - IBond bond = envC.createBond(atom, cAtom); - - container.addAtom(cAtom); - container.addBond(bond); + IBond tmpBond = tmpEnvCAtom.createBond(aFunctionalGroupAtom, tmpCAtom); + aFunctionalGroup.addAtom(tmpCAtom); + aFunctionalGroup.addBond(tmpBond); + } + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + " expanded environment: %dx C_ar 
and %dx C_al", + tmpAromaticCAtomCount, + tmpAliphaticCAtomCount)); } - - if(isDbg()) LOGGING_TOOL.debug(String.format(" expanded environment: %dx C_ar and %dx C_al", cAromCount, cAliphCount)); } - - // only call this on marked heteroatoms / carbonyl-C's! - private void expandEnvironmentGeneralized(IAtom atom, IAtomContainer container) { - - List environment = markedAtomToConnectedEnvCMap.get(atom); - - if(environment == null) { - if(isDbg()) LOGGING_TOOL.debug(" found no environment to expand."); + // + /** + * Expand the generalized environment of marked heteroatoms and carbonyl-Cs in a functional group. + * Takes all environmental C atoms cached earlier and re-adds them to the atom as environment. + * Note: only call this on marked heteroatoms / carbonyl-C's! + * + * @param aFunctionalGroupAtom the atom whose environment to expand + * @param aFunctionalGroup the functional group container that the atom is part of + */ + private void expandEnvironmentGeneralized(IAtom aFunctionalGroupAtom, IAtomContainer aFunctionalGroup) { + List tmpEnvironment = this.markedAtomToConnectedEnvCMap.get(aFunctionalGroupAtom); + if (Objects.isNull(tmpEnvironment)) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(" found no environment to expand."); + } return; } - - int rAtomCount = environment.size(); - int rAtomsForCCount = rAtomCount; - if(atom.getAtomicNumber() == 8 && atom.getImplicitHydrogenCount() == 1) { - addHydrogens(atom, 1, container); - atom.setImplicitHydrogenCount(0); - if(isDbg()) LOGGING_TOOL.debug(" expanded hydrogen on connected OH-Group"); - } - else if(isHeteroatom(atom)) rAtomCount += atom.getImplicitHydrogenCount(); - addRAtoms(atom, rAtomCount, container); - - if(atom.getImplicitHydrogenCount() != 0) { - atom.setImplicitHydrogenCount(0); + int tmpRAtomCount = tmpEnvironment.size(); + int tmpRAtomsForCCount = tmpRAtomCount; + if (aFunctionalGroupAtom.getAtomicNumber() == 8 && 
aFunctionalGroupAtom.getImplicitHydrogenCount() == 1) { + this.addHydrogens(aFunctionalGroupAtom, 1, aFunctionalGroup); + aFunctionalGroupAtom.setImplicitHydrogenCount(0); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(" expanded hydrogen on connected OH-Group"); + } + } else if (this.isHeteroatom(aFunctionalGroupAtom)) { + tmpRAtomCount += aFunctionalGroupAtom.getImplicitHydrogenCount(); + } + this.addRAtoms(aFunctionalGroupAtom, tmpRAtomCount, aFunctionalGroup); + if (aFunctionalGroupAtom.getImplicitHydrogenCount() != 0) { + aFunctionalGroupAtom.setImplicitHydrogenCount(0); + } + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + " expanded environment: %dx R-atom (incl. %d for H replacement)", + tmpRAtomCount, + tmpRAtomCount - tmpRAtomsForCCount)); } - - if(isDbg()) LOGGING_TOOL.debug(String.format(" expanded environment: %dx R-atom (incl. %d for H replacement)", rAtomCount, rAtomCount - rAtomsForCCount)); } - - private static boolean isHeteroatom(IAtom atom) { - int atomicNr = atom.getAtomicNumber(); - return atomicNr != 1 && atomicNr != 6; + // + /** + * Checks whether the given atom is a hetero-atom (i.e. non-carbon and non-hydrogen, judged by atomic number). + * + * @param anAtom the atom to test + * @return true if the given atom is neither a carbon nor a hydrogen atom + */ + private static boolean isHeteroatom(IAtom anAtom) { + int tmpAtomicNr = anAtom.getAtomicNumber(); + return tmpAtomicNr != 1 && tmpAtomicNr != 6; } - - private boolean isNonmetal(IAtom atom) { - return NONMETAL_ATOMIC_NUMBERS.contains(atom.getAtomicNumber()); + // + /** + * Checks whether the given atom is from an element in the organic subset, i.e. not a metal or metalloid atom. + * See the public constant set of non-metal atomic numbers declared in this class. 
+ * + * @param anAtom + * @return true if the given atom is organic and not a metal or metalloid atom + */ + private static boolean isNonmetal(IAtom anAtom) { + return ErtlFunctionalGroupsFinder.NONMETAL_ATOMIC_NUMBERS.contains(anAtom.getAtomicNumber()); } - - private void addHydrogens(IAtom atom, int number, IAtomContainer container) { - for(int i = 0; i < number; i++) { - IAtom hydrogen = atom.getBuilder().newInstance(IAtom.class, "H"); - hydrogen.setAtomTypeName("H"); - hydrogen.setImplicitHydrogenCount(0); - - container.addAtom(hydrogen); - container.addBond(atom.getBuilder().newInstance(IBond.class, atom, hydrogen, Order.SINGLE)); + // + /** + * Add explicit hydrogen atoms to an atom in a molecule. + * + * @param anAtom the atom to add the explicit hydrogen atoms to + * @param aNrOfHydrogenAtoms the number of explicit hydrogens atoms to add + * @param aMolecule the molecule the atom belongs to + */ + private static void addHydrogens(IAtom anAtom, int aNrOfHydrogenAtoms, IAtomContainer aMolecule) { + for (int i = 0; i < aNrOfHydrogenAtoms; i++) { + IAtom tmpHydrogenAtom = anAtom.getBuilder().newInstance(IAtom.class, "H"); + tmpHydrogenAtom.setAtomTypeName("H"); + tmpHydrogenAtom.setImplicitHydrogenCount(0); + aMolecule.addAtom(tmpHydrogenAtom); + aMolecule.addBond(anAtom.getBuilder().newInstance(IBond.class, anAtom, tmpHydrogenAtom, Order.SINGLE)); } } - - private void addRAtoms(IAtom atom, int number, IAtomContainer container) { - for(int i = 0; i < number; i++) { - IPseudoAtom rAtom = atom.getBuilder().newInstance(IPseudoAtom.class, "R"); - rAtom.setAttachPointNum(1); - rAtom.setImplicitHydrogenCount(0); - - container.addAtom(rAtom); - container.addBond(atom.getBuilder().newInstance(IBond.class, atom, rAtom, Order.SINGLE)); + // + /** + * Add pseudo ("R") atoms to an atom in a molecule. 
+ * + * @param anAtom the atom to add the pseudo atoms to + * @param aNrOfRAtoms the number of pseudo atoms to add + * @param aMolecule the molecule the atom belongs to + */ + private void addRAtoms(IAtom anAtom, int aNrOfRAtoms, IAtomContainer aMolecule) { + for (int i = 0; i < aNrOfRAtoms; i++) { + IPseudoAtom tmpRAtom = anAtom.getBuilder().newInstance(IPseudoAtom.class, "R"); + tmpRAtom.setAttachPointNum(1); + tmpRAtom.setImplicitHydrogenCount(0); + aMolecule.addAtom(tmpRAtom); + aMolecule.addBond(anAtom.getBuilder().newInstance(IBond.class, anAtom, tmpRAtom, Order.SINGLE)); } } - + // + /** + * + * + * @param sourceContainer + * @param atomIdxToFGMap + * @param fGroupCount + * @return + */ private List partitionIntoGroups(IAtomContainer sourceContainer, int[] atomIdxToFGMap, int fGroupCount) { List groups = new ArrayList<>(fGroupCount); for(int i = 0; i < fGroupCount; i++) { @@ -983,8 +1032,8 @@ private List partitionIntoGroups(IAtomContainer sourceContainer, return groups; } // - /** + * * * Use ErtlFunctionalGroupsFinder.LOGGING_TOOL.setLevel(ILoggingTool.DEBUG); to activate debug messages. 
* @@ -993,7 +1042,12 @@ private List partitionIntoGroups(IAtomContainer sourceContainer, private static boolean isDbg() { return ErtlFunctionalGroupsFinder.LOGGING_TOOL.isDebugEnabled(); } - + // + /** + * + * @param molecule + * @return + */ private boolean checkConstraints(IAtomContainer molecule) { for(IAtom atom : molecule.atoms()) { if(atom.getFormalCharge() != null && atom.getFormalCharge() != 0) { From b13a14cafeabd458ae87f0c5d878a93ef895d4b2 Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Mon, 15 Jan 2024 11:26:32 +0100 Subject: [PATCH 11/27] Overhaul of partitionIntoGroups() --- .../cdk/tools/ErtlFunctionalGroupsFinder.java | 80 +++++++++---------- 1 file changed, 38 insertions(+), 42 deletions(-) diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index 1f015d2..5c4acc4 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -31,7 +31,6 @@ import org.openscience.cdk.interfaces.IPseudoAtom; import org.openscience.cdk.interfaces.ISingleElectron; -import javax.security.auth.kerberos.KerberosTicket; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Arrays; @@ -43,6 +42,7 @@ import java.util.Queue; import java.util.Set; + /** * Finds and extracts a molecule's functional groups in a purely rule-based manner. * @@ -975,61 +975,57 @@ private void addRAtoms(IAtom anAtom, int aNrOfRAtoms, IAtomContainer aMolecule) } // /** + * Partitions the marked atoms and their processed environments into separate functional groups and builds atom containers + * for them as final step before returning them. Transfers the respective atoms, bonds, single electrons, and lone + * pairs from the source atom container to the new functional group atom containers. 
* - * - * @param sourceContainer - * @param atomIdxToFGMap - * @param fGroupCount - * @return + * @param aSourceContainer molecule atom container to take atoms, bonds, and electron objects from + * @param anAtomIdxToFGIdxMap array that maps atom indices (array positions) to functional group indices that the atoms belong to + * @param aFunctionalGroupCount maximum functional group index (+1) to know how many functional group atom containers to build + * @return list of partitioned functional group atom containers */ - private List partitionIntoGroups(IAtomContainer sourceContainer, int[] atomIdxToFGMap, int fGroupCount) { - List groups = new ArrayList<>(fGroupCount); - for(int i = 0; i < fGroupCount; i++) { - groups.add(sourceContainer.getBuilder().newInstance(IAtomContainer.class)); + private List partitionIntoGroups(IAtomContainer aSourceContainer, int[] anAtomIdxToFGIdxMap, int aFunctionalGroupCount) { + List tmpFunctionalGroups = new ArrayList<>(aFunctionalGroupCount); + for (int i = 0; i < aFunctionalGroupCount; i++) { + tmpFunctionalGroups.add(aSourceContainer.getBuilder().newInstance(IAtomContainer.class)); } - - Map atomtoFGMap = new HashMap(sourceContainer.getAtomCount());//Maps.newHashMapWithExpectedSize(sourceContainer.getAtomCount()); - + Map tmpAtomtoFGMap = new HashMap<>((int) ((aSourceContainer.getAtomCount() / 0.75f) + 2), 0.75f); // atoms - for(int atomIdx = 0; atomIdx < sourceContainer.getAtomCount(); atomIdx++) { - int fGroupId = atomIdxToFGMap[atomIdx]; - - if(fGroupId == -1) { + for (int tmpAtomIdx = 0; tmpAtomIdx < aSourceContainer.getAtomCount(); tmpAtomIdx++) { + int tmpFGroupIdx = anAtomIdxToFGIdxMap[tmpAtomIdx]; + if (tmpFGroupIdx == -1) { continue; } - - IAtom atom = sourceContainer.getAtom(atomIdx); - IAtomContainer myGroup = groups.get(fGroupId); - myGroup.addAtom(atom); - atomtoFGMap.put(atom, myGroup); + IAtom tmpAtom = aSourceContainer.getAtom(tmpAtomIdx); + IAtomContainer tmpPartitionedFunctionalGroup = 
tmpFunctionalGroups.get(tmpFGroupIdx); + tmpPartitionedFunctionalGroup.addAtom(tmpAtom); + tmpAtomtoFGMap.put(tmpAtom, tmpPartitionedFunctionalGroup); } - // bonds - for(IBond bond : sourceContainer.bonds()) { - IAtomContainer beginGroup = atomtoFGMap.get(bond.getBegin()); - IAtomContainer endGroup = atomtoFGMap.get(bond.getEnd()); - - if(beginGroup == null || endGroup == null || beginGroup != endGroup) + for (IBond tmpBond : aSourceContainer.bonds()) { + // check whether begin and end atom of the bond have been correctly assigned to the same FG + IAtomContainer tmpFGofBeginAtom = tmpAtomtoFGMap.get(tmpBond.getBegin()); + IAtomContainer tmpFGofEndAtom = tmpAtomtoFGMap.get(tmpBond.getEnd()); + if (Objects.isNull(tmpFGofBeginAtom) || Objects.isNull(tmpFGofEndAtom) || tmpFGofBeginAtom != tmpFGofEndAtom) { continue; - - beginGroup.addBond(bond); + } + tmpFGofBeginAtom.addBond(tmpBond); } - // single electrons - for (ISingleElectron electron : sourceContainer.singleElectrons()) { - IAtomContainer group = atomtoFGMap.get(electron.getAtom()); - if(group != null) - group.addSingleElectron(electron); + for (ISingleElectron tmpSingleElectron : aSourceContainer.singleElectrons()) { + IAtomContainer tmpFunctionalGroup = tmpAtomtoFGMap.get(tmpSingleElectron.getAtom()); + if (!Objects.isNull(tmpFunctionalGroup)) { + tmpFunctionalGroup.addSingleElectron(tmpSingleElectron); + } } - // lone pairs - for (ILonePair lonePair : sourceContainer.lonePairs()) { - IAtomContainer group = atomtoFGMap.get(lonePair.getAtom()); - if(group != null) - group.addLonePair(lonePair); + for (ILonePair tmpLonePair : aSourceContainer.lonePairs()) { + IAtomContainer tmpFunctionalGroup = tmpAtomtoFGMap.get(tmpLonePair.getAtom()); + if (!Objects.isNull(tmpFunctionalGroup)) { + tmpFunctionalGroup.addLonePair(tmpLonePair); + } } - - return groups; + return tmpFunctionalGroups; } // /** From a241b13a9385d22003503ae4fdba3f2bf12f4303 Mon Sep 17 00:00:00 2001 From: Jonas Schaub 
<44881147+JonasSchaub@users.noreply.github.com> Date: Mon, 15 Jan 2024 14:00:39 +0100 Subject: [PATCH 12/27] Finished refactoring of EFGF for now; --- .../cdk/tools/ErtlFunctionalGroupsFinder.java | 234 +++++++++--------- 1 file changed, 120 insertions(+), 114 deletions(-) diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index 5c4acc4..928a81e 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -45,7 +45,6 @@ /** * Finds and extracts a molecule's functional groups in a purely rule-based manner. - * * This class implements Peter Ertl's algorithm for the automated detection and extraction * of functional groups in organic molecules * ([Ertl P. An algorithm to identify functional groups in organic molecules. J Cheminform. 2017; 9:36.]). @@ -96,28 +95,28 @@ private class EnvironmentalC { /** * Indicates whether carbon atom is aromatic or aliphatic. */ - private EnvironmentalCType type; + private final EnvironmentalCType type; // /** * Bond index of the original C atom. */ - private int bondIndex; + private final int bondIndex; // /** * Order of the bond connecting this environmental C atom to the marked functional group atom. */ - private IBond.Order bondOrder; + private final IBond.Order bondOrder; // /** * Stereo information of the bond connecting this environmental C atom to the marked functional group atom. */ - private IBond.Stereo bondStereo; + private final IBond.Stereo bondStereo; // /** * Flags of the bond connecting this environmental C atom to the marked functional group atom. IChemObjecflags * are properties defined by an integer value (array position) and a boolean value. */ - private boolean[] bondFlags; + private final boolean[] bondFlags; // /** * Default constructor defining all fields. 
Order, stereo, and flags are taken from the IBond object directly. @@ -193,31 +192,30 @@ public IBond createBond(IAtom aTargetAtom, IAtom anEnvCAtom) { /** * Map of bonds in the input molecule, cache(!). */ - private EdgeToBondMap bondMap; + private EdgeToBondMap bondMapCache; // /** * Adjacency list representation of input molecule, cache(!). */ - private int[][] adjList; + private int[][] adjListCache; // /** * Set for atoms marked as being part of a functional group, represented by an internal index based on the atom * count in the input molecule, cache(!). */ - private HashSet markedAtoms; + private HashSet markedAtomsCache; // /** * HashMap for storing aromatic hetero-atom indices and whether they have already been assigned to a larger functional * group. If false, they form single-atom FG by themselves, cache(!). - * * key: atom idx, value: isInGroup */ - private HashMap aromaticHeteroAtomIndicesToIsInGroupBoolMap; + private HashMap aromaticHeteroAtomIndicesToIsInGroupBoolMapCache; // /** * HashMap for storing marked atom to connected environmental carbon atom relations, cache(!). */ - private HashMap> markedAtomToConnectedEnvCMap; + private HashMap> markedAtomToConnectedEnvCMapCache; // /** * Default constructor for ErtlFunctionalGroupsFinder with functional group generalization turned ON. 
@@ -311,22 +309,25 @@ public List find (IAtomContainer aMolecule, boolean aShouldInput */ public List find(IAtomContainer aMolecule, boolean aShouldInputBeCloned, boolean anAreInputRestrictionsApplied) throws CloneNotSupportedException, IllegalArgumentException { + this.clearCache(); IAtomContainer tmpMolecule; if (aShouldInputBeCloned) { tmpMolecule = aMolecule.clone(); } else { tmpMolecule = aMolecule; } - if (anAreInputRestrictionsApplied) { - this.checkConstraints(tmpMolecule); - } for (IAtom tmpAtom : tmpMolecule.atoms()) { if(Objects.isNull(tmpAtom.getImplicitHydrogenCount())) { tmpAtom.setImplicitHydrogenCount(0); } } - this.bondMap = EdgeToBondMap.withSpaceFor(tmpMolecule); - this.adjList = GraphUtil.toAdjList(tmpMolecule, this.bondMap); + this.bondMapCache = EdgeToBondMap.withSpaceFor(tmpMolecule); + this.adjListCache = GraphUtil.toAdjList(tmpMolecule, this.bondMapCache); + if (anAreInputRestrictionsApplied) { + // throws IllegalArgumentException if constraints are not met + // only done now because adjacency list cache is needed in the method + this.checkConstraints(tmpMolecule); + } this.markAtoms(tmpMolecule); // extract raw groups List tmpFunctionalGroupsList = this.extractGroups(tmpMolecule); @@ -343,14 +344,16 @@ public List find(IAtomContainer aMolecule, boolean aShouldInputB } /** - * Clear caches related to the input molecule. + * Clear caches related to the input molecule. Note, these are not proper caches, there are no results cached. Here, + * only data taken from the input molecule is saved for only one execution of the find() method, to facilitate + * communication between the private methods involved. 
*/ private void clearCache() { - this.bondMap = null; - this.adjList = null; - this.markedAtoms = null; - this.aromaticHeteroAtomIndicesToIsInGroupBoolMap = null; - this.markedAtomToConnectedEnvCMap = null; + this.bondMapCache = null; + this.adjListCache = null; + this.markedAtomsCache = null; + this.aromaticHeteroAtomIndicesToIsInGroupBoolMapCache = null; + this.markedAtomToConnectedEnvCMapCache = null; } /** @@ -359,24 +362,23 @@ private void clearCache() { * @param aMolecule molecule with atoms to mark */ private void markAtoms(IAtomContainer aMolecule) { - if (this.isDbg()) { + if (ErtlFunctionalGroupsFinder.isDbg()) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("########## Starting search for atoms to mark ... ##########"); } // store marked atoms - this.markedAtoms = new HashSet<>((int) ((aMolecule.getAtomCount() / 0.75f) + 2), 0.75f); + this.markedAtomsCache = new HashSet<>((int) ((aMolecule.getAtomCount() / 0.75f) + 2), 0.75f); // store aromatic heteroatoms - this.aromaticHeteroAtomIndicesToIsInGroupBoolMap = new HashMap<>((int) ((aMolecule.getAtomCount() / 0.75f) + 2), 0.75f); - //TODO set and use a more explicit and trustworthy index? 
+ this.aromaticHeteroAtomIndicesToIsInGroupBoolMapCache = new HashMap<>((int) ((aMolecule.getAtomCount() / 0.75f) + 2), 0.75f); for (int idx = 0; idx < aMolecule.getAtomCount(); idx++) { // skip atoms that were already marked in a previous iteration - if (this.markedAtoms.contains(idx)) { + if (this.markedAtomsCache.contains(idx)) { continue; } IAtom tmpAtom = aMolecule.getAtom(idx); // skip aromatic atoms but add aromatic HETERO-atoms to map for later processing if (tmpAtom.isAromatic()) { - if (ErtlFunctionalGroupsFinder.isHeteroatom(tmpAtom)) { - this.aromaticHeteroAtomIndicesToIsInGroupBoolMap.put(idx, false); + if (this.isHeteroatom(tmpAtom)) { + this.aromaticHeteroAtomIndicesToIsInGroupBoolMapCache.put(idx, false); } continue; } @@ -387,9 +389,9 @@ private void markAtoms(IAtomContainer aMolecule) { boolean tmpIsMarked = false; // count for the number of connected O, N & S atoms to detect acetal carbons int tmpConnectedONSatomsCounter = 0; - for (int tmpConnectedIdx : this.adjList[idx]) { + for (int tmpConnectedIdx : this.adjListCache[idx]) { IAtom tmpConnectedAtom = aMolecule.getAtom(tmpConnectedIdx); - IBond tmpConnectedBond = this.bondMap.get(idx, tmpConnectedIdx); + IBond tmpConnectedBond = this.bondMapCache.get(idx, tmpConnectedIdx); // if connected to heteroatom or C in aliphatic double or triple bond... 
[CONDITIONS 2.1 & 2.2] if (tmpConnectedAtom.getAtomicNumber() != 1 @@ -397,7 +399,7 @@ private void markAtoms(IAtomContainer aMolecule) { && !tmpConnectedBond.isAromatic())) { // set the *connected* atom as marked (add() true if this set did not already contain the specified element) - if (this.markedAtoms.add(tmpConnectedIdx)) { + if (this.markedAtomsCache.add(tmpConnectedIdx)) { if (ErtlFunctionalGroupsFinder.isDbg()) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( "Marking Atom #%d (%s) - Met condition %s", @@ -417,7 +419,7 @@ private void markAtoms(IAtomContainer aMolecule) { // but check for carbonyl-C before break if (tmpConnectedAtom.getAtomicNumber() == 8 && tmpConnectedBond.getOrder() == Order.DOUBLE - && this.adjList[idx].length == 3) { + && this.adjListCache[idx].length == 3) { tmpAtom.setProperty(CARBONYL_C_MARKER, true); if (ErtlFunctionalGroupsFinder.isDbg()) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("- was flagged as Carbonly-C"); @@ -433,7 +435,7 @@ private void markAtoms(IAtomContainer aMolecule) { // if connected O/N/S is not aromatic... if (!tmpConnectedAtom.isAromatic()) { // set the connected O/N/S atom as marked - this.markedAtoms.add(tmpConnectedIdx); + this.markedAtomsCache.add(tmpConnectedIdx); if (ErtlFunctionalGroupsFinder.isDbg()) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( "Marking Atom #%d (%s) - Met condition 1", @@ -442,8 +444,8 @@ private void markAtoms(IAtomContainer aMolecule) { } // if "acetal C" (2+ O/N/S in single bonds connected to sp3-C)... 
[CONDITION 2.3] boolean tmpIsAllSingleBonds = true; - for (int tmpConnectedInSphere2Idx : this.adjList[tmpConnectedIdx]) { - IBond tmpSphere2Bond = this.bondMap.get(tmpConnectedIdx, tmpConnectedInSphere2Idx); + for (int tmpConnectedInSphere2Idx : this.adjListCache[tmpConnectedIdx]) { + IBond tmpSphere2Bond = this.bondMapCache.get(tmpConnectedIdx, tmpConnectedInSphere2Idx); if (tmpSphere2Bond.getOrder() != Order.SINGLE) { tmpIsAllSingleBonds = false; break; @@ -451,7 +453,7 @@ private void markAtoms(IAtomContainer aMolecule) { } if (tmpIsAllSingleBonds) { tmpConnectedONSatomsCounter++; - if (tmpConnectedONSatomsCounter > 1 && this.adjList[idx].length + tmpAtom.getImplicitHydrogenCount() == 4) { + if (tmpConnectedONSatomsCounter > 1 && this.adjListCache[idx].length + tmpAtom.getImplicitHydrogenCount() == 4) { // set as marked and break out of connected atoms tmpIsMarked = true; if (ErtlFunctionalGroupsFinder.isDbg()) { @@ -465,15 +467,15 @@ private void markAtoms(IAtomContainer aMolecule) { } } // if part of oxirane, aziridine, or thiirane ring... 
[CONDITION 2.4] - for (int tmpConnectedInSphere2Idx : this.adjList[tmpConnectedIdx]) { + for (int tmpConnectedInSphere2Idx : this.adjListCache[tmpConnectedIdx]) { IAtom tmpConnectedInSphere2Atom = aMolecule.getAtom(tmpConnectedInSphere2Idx); if (tmpConnectedInSphere2Atom.getAtomicNumber() == 6) { - for (int tmpConnectedInSphere3Idx : this.adjList[tmpConnectedInSphere2Idx]) { + for (int tmpConnectedInSphere3Idx : this.adjListCache[tmpConnectedInSphere2Idx]) { IAtom tmpConnectedInSphere3Atom = aMolecule.getAtom(tmpConnectedInSphere3Idx); if (tmpConnectedInSphere3Atom.equals(tmpAtom)) { // set connected atoms as marked - this.markedAtoms.add(tmpConnectedInSphere2Idx); - this.markedAtoms.add(tmpConnectedInSphere3Idx); + this.markedAtomsCache.add(tmpConnectedInSphere2Idx); + this.markedAtomsCache.add(tmpConnectedInSphere3Idx); if (ErtlFunctionalGroupsFinder.isDbg()) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( "Marking Atom #%d (%s) - Met condition 2.4", @@ -500,7 +502,7 @@ private void markAtoms(IAtomContainer aMolecule) { } // end of else if connected to O/N/S in single bond } //end of for loop that iterates over all connected atoms of the carbon atom if (tmpIsMarked) { - this.markedAtoms.add(idx); + this.markedAtomsCache.add(idx); continue; } // if none of the conditions 2.X apply, we have an unmarked C (not relevant here) @@ -509,9 +511,9 @@ private void markAtoms(IAtomContainer aMolecule) { // convert to implicit H IAtom tmpConnectedAtom; try { - tmpConnectedAtom = aMolecule.getAtom(this.adjList[idx][0]); + tmpConnectedAtom = aMolecule.getAtom(this.adjListCache[idx][0]); } catch(ArrayIndexOutOfBoundsException anException) { - //TODO: do sth here? 
+ ErtlFunctionalGroupsFinder.LOGGING_TOOL.warn("Explicit H was included in atom count but not correctly in adjacency list"); break; } if (Objects.isNull(tmpConnectedAtom.getImplicitHydrogenCount())) { @@ -522,7 +524,7 @@ private void markAtoms(IAtomContainer aMolecule) { continue; } else { // if heteroatom... (CONDITION 1) - this.markedAtoms.add(idx); + this.markedAtomsCache.add(idx); if (ErtlFunctionalGroupsFinder.isDbg()) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( "Marking Atom #%d (%s) - Met condition 1", @@ -535,7 +537,7 @@ private void markAtoms(IAtomContainer aMolecule) { if (ErtlFunctionalGroupsFinder.isDbg()) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( "########## End of search. Marked %d/%d atoms. ##########", - this.markedAtoms.size(), + this.markedAtomsCache.size(), aMolecule.getAtomCount())); } } @@ -551,15 +553,15 @@ private List extractGroups(IAtomContainer aMolecule) { if (ErtlFunctionalGroupsFinder.isDbg()) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("########## Starting identification & extraction of functional groups... 
##########"); } - this.markedAtomToConnectedEnvCMap = new HashMap<>((int) ((aMolecule.getAtomCount() / 0.75f) + 2), 0.75f); + this.markedAtomToConnectedEnvCMapCache = new HashMap<>((int) ((aMolecule.getAtomCount() / 0.75f) + 2), 0.75f); int[] tmpAtomIdxToFGArray = new int[aMolecule.getAtomCount()]; Arrays.fill(tmpAtomIdxToFGArray, -1); int tmpFunctionalGroupIdx = -1; - while (!this.markedAtoms.isEmpty()) { + while (!this.markedAtomsCache.isEmpty()) { // search for another functional group tmpFunctionalGroupIdx++; // get next markedAtom as the starting node for the search - int tmpBeginIdx = this.markedAtoms.iterator().next(); + int tmpBeginIdx = this.markedAtomsCache.iterator().next(); if (ErtlFunctionalGroupsFinder.isDbg()) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( "Searching new functional group from atom #%d (%s)...", @@ -572,25 +574,25 @@ private List extractGroups(IAtomContainer aMolecule) { while (!tmpQueue.isEmpty()) { int tmpCurrentQueueIdx = tmpQueue.poll(); // we are only interested in marked atoms that are not yet included in a group - if (!this.markedAtoms.contains(tmpCurrentQueueIdx)) { + if (!this.markedAtomsCache.contains(tmpCurrentQueueIdx)) { continue; } // if it isn't... 
IAtom tmpCurrentAtom = aMolecule.getAtom(tmpCurrentQueueIdx); if (ErtlFunctionalGroupsFinder.isDbg()) { - ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format(" visiting marked atom: #%d (%s)", + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format("\tvisiting marked atom: #%d (%s)", tmpCurrentQueueIdx, tmpCurrentAtom.getSymbol())); } // add its index to the functional group tmpAtomIdxToFGArray[tmpCurrentQueueIdx] = tmpFunctionalGroupIdx; // also scratch the index from markedAtoms - this.markedAtoms.remove(tmpCurrentQueueIdx); + this.markedAtomsCache.remove(tmpCurrentQueueIdx); // and take a look at the connected atoms List tmpCurrentEnvironment = new ArrayList<>(); - for (int tmpConnectedIdx : this.adjList[tmpCurrentQueueIdx]) { + for (int tmpConnectedIdx : this.adjListCache[tmpCurrentQueueIdx]) { // add connected marked atoms to queue - if (this.markedAtoms.contains(tmpConnectedIdx)) { + if (this.markedAtomsCache.contains(tmpConnectedIdx)) { tmpQueue.add(tmpConnectedIdx); continue; } @@ -600,17 +602,17 @@ private List extractGroups(IAtomContainer aMolecule) { } // add unmarked connected aromatic heteroatoms IAtom tmpConnectedAtom = aMolecule.getAtom(tmpConnectedIdx); - if (ErtlFunctionalGroupsFinder.isHeteroatom(tmpConnectedAtom) && tmpConnectedAtom.isAromatic()) { + if (this.isHeteroatom(tmpConnectedAtom) && tmpConnectedAtom.isAromatic()) { tmpAtomIdxToFGArray[tmpConnectedIdx] = tmpFunctionalGroupIdx; // note that this aromatic heteroatom has been added to a group - this.aromaticHeteroAtomIndicesToIsInGroupBoolMap.put(tmpConnectedIdx, true); + this.aromaticHeteroAtomIndicesToIsInGroupBoolMapCache.put(tmpConnectedIdx, true); if (ErtlFunctionalGroupsFinder.isDbg()) { - ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(" added connected aromatic heteroatom " + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("\t\tadded connected aromatic heteroatom " + tmpConnectedAtom.getSymbol()); } } // add unmarked connected atoms to current marked atom's environment - 
IBond tmpConnectedBond = this.bondMap.get(tmpCurrentQueueIdx, tmpConnectedIdx); + IBond tmpConnectedBond = this.bondMapCache.get(tmpCurrentQueueIdx, tmpConnectedIdx); EnvironmentalCType tmpEnvironmentalCType; if (tmpConnectedAtom.getAtomicNumber() == 6) { if (tmpConnectedAtom.isAromatic()) { @@ -628,10 +630,11 @@ private List extractGroups(IAtomContainer aMolecule) { tmpConnectedBond, tmpConnectedBond.getBegin().equals(tmpConnectedAtom) ? 0 : 1)); } //end of loop of connected atoms - this.markedAtomToConnectedEnvCMap.put(tmpCurrentAtom, tmpCurrentEnvironment); + this.markedAtomToConnectedEnvCMapCache.put(tmpCurrentAtom, tmpCurrentEnvironment); // debug logging if (ErtlFunctionalGroupsFinder.isDbg()) { - int tmpCAromCount = 0, tmpCAliphCount = 0; + int tmpCAromCount = 0; + int tmpCAliphCount = 0; for(EnvironmentalC tmpEnvC : tmpCurrentEnvironment) { if (tmpEnvC.getType() == EnvironmentalCType.C_AROMATIC) { tmpCAromCount++; @@ -640,19 +643,19 @@ private List extractGroups(IAtomContainer aMolecule) { } } ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( - " logged marked atom's environment: C_ar:%d, C_al:%d (and %d implicit hydrogens)", + "\t\tlogged marked atom's environment: C_ar:%d, C_al:%d (and %d implicit hydrogens)", tmpCAromCount, tmpCAliphCount, tmpCurrentAtom.getImplicitHydrogenCount())); } } // end of BFS if (ErtlFunctionalGroupsFinder.isDbg()) { - ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(" search completed."); + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("\tsearch completed."); } } //markedAtoms is empty now // also create FG for lone aromatic heteroatoms, not connected to an FG yet. 
- for (int tmpAtomIdx : this.aromaticHeteroAtomIndicesToIsInGroupBoolMap.keySet()) { - if (!this.aromaticHeteroAtomIndicesToIsInGroupBoolMap.get(tmpAtomIdx)) { + for (int tmpAtomIdx : this.aromaticHeteroAtomIndicesToIsInGroupBoolMapCache.keySet()) { + if (!this.aromaticHeteroAtomIndicesToIsInGroupBoolMapCache.get(tmpAtomIdx).booleanValue()) { tmpFunctionalGroupIdx++; tmpAtomIdxToFGArray[tmpAtomIdx] = tmpFunctionalGroupIdx; if (ErtlFunctionalGroupsFinder.isDbg()) { @@ -687,7 +690,7 @@ private void expandGeneralizedEnvironments(List aFunctionalGroup // pre-checking for special cases... if (tmpFunctionalGroup.getAtomCount() == 1) { IAtom tmpAtom = tmpFunctionalGroup.getAtom(0); - List tmpEnvironment = this.markedAtomToConnectedEnvCMap.get(tmpAtom); + List tmpEnvironment = this.markedAtomToConnectedEnvCMapCache.get(tmpAtom); if (!Objects.isNull(tmpEnvironment)) { int tmpEnvCCount = tmpEnvironment.size(); @@ -696,7 +699,7 @@ private void expandGeneralizedEnvironments(List aFunctionalGroup || (tmpAtom.getAtomicNumber() == 7 && tmpEnvCCount == 1)) { if (ErtlFunctionalGroupsFinder.isDbg()) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( - " - found single atomic N or O FG with one env. C. Expanding environment...", + "\t- found single atomic %s FG with one env. C. 
Expanding environment...", tmpAtom.getSymbol())); } this.expandEnvironment(tmpAtom, tmpFunctionalGroup); @@ -704,7 +707,7 @@ private void expandGeneralizedEnvironments(List aFunctionalGroup if (tmpAtomImplicitHydrogenCount != 0) { if (ErtlFunctionalGroupsFinder.isDbg()) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( - " - adding %d hydrogens...", tmpAtomImplicitHydrogenCount)); + "\t- adding %d hydrogens...", tmpAtomImplicitHydrogenCount)); } this.addHydrogens(tmpAtom, tmpAtomImplicitHydrogenCount, tmpFunctionalGroup); tmpAtom.setImplicitHydrogenCount(0); @@ -715,24 +718,24 @@ private void expandGeneralizedEnvironments(List aFunctionalGroup if ((tmpAtom.getAtomicNumber() == 7 && tmpEnvCCount == 2) || (tmpAtom.getAtomicNumber() == 16 && tmpEnvCCount == 1)) { if (ErtlFunctionalGroupsFinder.isDbg()) { - ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(" - found sec. amine or simple thiol"); + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("\t- found sec. amine or simple thiol"); } int tmpAtomImplicitHydrogenCount = tmpAtom.getImplicitHydrogenCount(); if (tmpAtomImplicitHydrogenCount != 0) { if (ErtlFunctionalGroupsFinder.isDbg()) { - ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format(" - adding %d hydrogens...", + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format("\t- adding %d hydrogens...", tmpAtomImplicitHydrogenCount)); } this.addHydrogens(tmpAtom, tmpAtomImplicitHydrogenCount, tmpFunctionalGroup); tmpAtom.setImplicitHydrogenCount(0); } if (ErtlFunctionalGroupsFinder.isDbg()) { - ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(" - expanding environment..."); + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("\t- expanding environment..."); } this.expandEnvironmentGeneralized(tmpAtom, tmpFunctionalGroup); continue; } - } else if (ErtlFunctionalGroupsFinder.isHeteroatom(tmpAtom)) { + } else if (this.isHeteroatom(tmpAtom)) { // env is null and marked atoms is a hetero atom -> single aromatic heteroatom int tmpRAtomCount = 
tmpAtom.getValency(); Integer tmpAtomImplicitHydrogenCount = tmpAtom.getImplicitHydrogenCount(); @@ -742,7 +745,7 @@ private void expandGeneralizedEnvironments(List aFunctionalGroup String tmpAtomTypeName = tmpAtom.getAtomTypeName(); if (ErtlFunctionalGroupsFinder.isDbg()) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( - " - found single aromatic heteroatom (%s, Atomtype %s). Adding %d R-Atoms...", + "\t- found single aromatic heteroatom (%s, Atomtype %s). Adding %d R-Atoms...", tmpAtom.getSymbol(), tmpAtomTypeName, tmpRAtomCount)); @@ -756,7 +759,7 @@ private void expandGeneralizedEnvironments(List aFunctionalGroup tmpFunctionalGroup.atoms().forEach(tmpFunctionalGroupAtoms::add); // process individual functional group atoms... for (IAtom tmpFunctionalGroupAtom : tmpFunctionalGroupAtoms) { - List tmpFGenvCs = this.markedAtomToConnectedEnvCMap.get(tmpFunctionalGroupAtom); + List tmpFGenvCs = this.markedAtomToConnectedEnvCMapCache.get(tmpFunctionalGroupAtom); if (tmpFGenvCs == null) { if (tmpFunctionalGroupAtom.getImplicitHydrogenCount() != 0) { tmpFunctionalGroupAtom.setImplicitHydrogenCount(0); @@ -764,7 +767,7 @@ private void expandGeneralizedEnvironments(List aFunctionalGroup int tmpRAtomCount = tmpFunctionalGroupAtom.getValency() - 1; if (ErtlFunctionalGroupsFinder.isDbg()) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( - " - found connected aromatic heteroatom (%s). Adding %d R-Atoms...", + "\t- found connected aromatic heteroatom (%s). 
Adding %d R-Atoms...", tmpFunctionalGroupAtom.getSymbol(), tmpRAtomCount)); } @@ -777,19 +780,19 @@ private void expandGeneralizedEnvironments(List aFunctionalGroup tmpFunctionalGroupAtom.setImplicitHydrogenCount(0); } if (ErtlFunctionalGroupsFinder.isDbg()) { - ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(" - ignoring environment for marked carbon atom"); + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("\t- ignoring environment for marked carbon atom"); } continue; } else { if (ErtlFunctionalGroupsFinder.isDbg()) { - ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(" - found carbonyl-carbon. Expanding environment..."); + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("\t- found carbonyl-carbon. Expanding environment..."); } this.expandEnvironmentGeneralized(tmpFunctionalGroupAtom, tmpFunctionalGroup); continue; } } else { // processing heteroatoms... if (ErtlFunctionalGroupsFinder.isDbg()) { - ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format(" - found heteroatom (%s). Expanding environment...", + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format("\t- found heteroatom (%s). 
Expanding environment...", tmpFunctionalGroupAtom.getSymbol())); } this.expandEnvironmentGeneralized(tmpFunctionalGroupAtom, tmpFunctionalGroup); @@ -828,7 +831,7 @@ private void expandFullEnvironments(List aFunctionalGroupsList) if (tmpImplicitHydrogenCount != 0) { if (ErtlFunctionalGroupsFinder.isDbg()) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( - " - adding %d hydrogens...", tmpImplicitHydrogenCount)); + "\t- adding %d hydrogens...", tmpImplicitHydrogenCount)); } this.addHydrogens(tmpFunctionalGroupAtom, tmpImplicitHydrogenCount, tmpFunctionalGroup); tmpFunctionalGroupAtom.setImplicitHydrogenCount(0); @@ -848,15 +851,16 @@ private void expandFullEnvironments(List aFunctionalGroupsList) * @param aFunctionalGroup the functional group container that the atom is part of */ private void expandEnvironment(IAtom aFunctionalGroupAtom, IAtomContainer aFunctionalGroup) { - List tmpEnvCAtomsList = this.markedAtomToConnectedEnvCMap.get(aFunctionalGroupAtom); + List tmpEnvCAtomsList = this.markedAtomToConnectedEnvCMapCache.get(aFunctionalGroupAtom); if (Objects.isNull(tmpEnvCAtomsList) || tmpEnvCAtomsList.isEmpty()) { if (ErtlFunctionalGroupsFinder.isDbg()) { - ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(" found no environment to expand."); + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("\t\tfound no environment to expand."); } return; } - int tmpAromaticCAtomCount = 0, tmpAliphaticCAtomCount = 0; + int tmpAromaticCAtomCount = 0; + int tmpAliphaticCAtomCount = 0; for (EnvironmentalC tmpEnvCAtom : tmpEnvCAtomsList) { IAtom tmpCAtom = aFunctionalGroupAtom.getBuilder().newInstance(IAtom.class, "C"); tmpCAtom.setAtomTypeName("C"); @@ -873,7 +877,7 @@ private void expandEnvironment(IAtom aFunctionalGroupAtom, IAtomContainer aFunct } if (ErtlFunctionalGroupsFinder.isDbg()) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( - " expanded environment: %dx C_ar and %dx C_al", + "\t\texpanded environment: %dx C_ar and %dx C_al", tmpAromaticCAtomCount, 
tmpAliphaticCAtomCount)); } @@ -888,10 +892,10 @@ private void expandEnvironment(IAtom aFunctionalGroupAtom, IAtomContainer aFunct * @param aFunctionalGroup the functional group container that the atom is part of */ private void expandEnvironmentGeneralized(IAtom aFunctionalGroupAtom, IAtomContainer aFunctionalGroup) { - List tmpEnvironment = this.markedAtomToConnectedEnvCMap.get(aFunctionalGroupAtom); + List tmpEnvironment = this.markedAtomToConnectedEnvCMapCache.get(aFunctionalGroupAtom); if (Objects.isNull(tmpEnvironment)) { if (ErtlFunctionalGroupsFinder.isDbg()) { - ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(" found no environment to expand."); + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("\t\tfound no environment to expand."); } return; } @@ -901,7 +905,7 @@ private void expandEnvironmentGeneralized(IAtom aFunctionalGroupAtom, IAtomConta this.addHydrogens(aFunctionalGroupAtom, 1, aFunctionalGroup); aFunctionalGroupAtom.setImplicitHydrogenCount(0); if (ErtlFunctionalGroupsFinder.isDbg()) { - ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(" expanded hydrogen on connected OH-Group"); + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("\t\texpanded hydrogen on connected OH-Group"); } } else if (this.isHeteroatom(aFunctionalGroupAtom)) { tmpRAtomCount += aFunctionalGroupAtom.getImplicitHydrogenCount(); @@ -912,7 +916,7 @@ private void expandEnvironmentGeneralized(IAtom aFunctionalGroupAtom, IAtomConta } if (ErtlFunctionalGroupsFinder.isDbg()) { ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( - " expanded environment: %dx R-atom (incl. %d for H replacement)", + "\t\texpanded environment: %dx R-atom (incl. 
%d for H replacement)", tmpRAtomCount, tmpRAtomCount - tmpRAtomsForCCount)); } @@ -924,7 +928,7 @@ private void expandEnvironmentGeneralized(IAtom aFunctionalGroupAtom, IAtomConta * @param anAtom the atom to test * @return true if the given atom is neither a carbon nor a hydrogen atom */ - private static boolean isHeteroatom(IAtom anAtom) { + private boolean isHeteroatom(IAtom anAtom) { int tmpAtomicNr = anAtom.getAtomicNumber(); return tmpAtomicNr != 1 && tmpAtomicNr != 6; } @@ -933,10 +937,10 @@ private static boolean isHeteroatom(IAtom anAtom) { * Checks whether the given atom is from an element in the organic subset, i.e. not a metal or metalloid atom. * See the public constant set of non-metal atomic numbers declared in this class. * - * @param anAtom + * @param anAtom atom to check * @return true if the given atom is organic and not a metal or metalloid atom */ - private static boolean isNonmetal(IAtom anAtom) { + private boolean isNonmetal(IAtom anAtom) { return ErtlFunctionalGroupsFinder.NONMETAL_ATOMIC_NUMBERS.contains(anAtom.getAtomicNumber()); } // @@ -947,7 +951,7 @@ private static boolean isNonmetal(IAtom anAtom) { * @param aNrOfHydrogenAtoms the number of explicit hydrogens atoms to add * @param aMolecule the molecule the atom belongs to */ - private static void addHydrogens(IAtom anAtom, int aNrOfHydrogenAtoms, IAtomContainer aMolecule) { + private void addHydrogens(IAtom anAtom, int aNrOfHydrogenAtoms, IAtomContainer aMolecule) { for (int i = 0; i < aNrOfHydrogenAtoms; i++) { IAtom tmpHydrogenAtom = anAtom.getBuilder().newInstance(IAtom.class, "H"); tmpHydrogenAtom.setAtomTypeName("H"); @@ -1029,36 +1033,38 @@ private List partitionIntoGroups(IAtomContainer aSourceContainer } // /** + * Checks input molecule for charged atoms, metal or metalloid atoms, and whether it consists of more than one unconnected structures. + * If one of the cases applies, an IllegalArgumentException is thrown with a specific error message. 
NOTE: adjacency + * list cache must already be set-up! * - * - * Use ErtlFunctionalGroupsFinder.LOGGING_TOOL.setLevel(ILoggingTool.DEBUG); to activate debug messages. - * - * @return - */ - private static boolean isDbg() { - return ErtlFunctionalGroupsFinder.LOGGING_TOOL.isDebugEnabled(); - } - // - /** - * - * @param molecule - * @return + * @param aMolecule the molecule to check + * @throws IllegalArgumentException if one of the constraints is not met */ - private boolean checkConstraints(IAtomContainer molecule) { - for(IAtom atom : molecule.atoms()) { - if(atom.getFormalCharge() != null && atom.getFormalCharge() != 0) { + private void checkConstraints(IAtomContainer aMolecule) throws IllegalArgumentException { + for (IAtom tmpAtom : aMolecule.atoms()) { + if (!Objects.isNull(tmpAtom.getFormalCharge()) && tmpAtom.getFormalCharge() != 0) { throw new IllegalArgumentException("Input molecule must not contain any charges."); } - if(!isNonmetal(atom)) { - throw new IllegalArgumentException("Input molecule must not contain metals or metalloids."); + if (!this.isNonmetal(tmpAtom)) { + throw new IllegalArgumentException("Input molecule must not contain metal or metalloid atoms."); } } - - ConnectedComponents cc = new ConnectedComponents(adjList); - if(cc.nComponents() != 1) { + Objects.requireNonNull(this.adjListCache, "Adjacency list cache must already be set-up for this check!"); + ConnectedComponents tmpConnectedComponents = new ConnectedComponents(this.adjListCache); + if (tmpConnectedComponents.nComponents() != 1) { throw new IllegalArgumentException("Input molecule must consist of only a single connected structure."); } - - return true; + } + // + /** + * Returns whether the CDK logging tool of this class (logger) is currently configured to log debug messages. + *

+ * Use ErtlFunctionalGroupsFinder.LOGGING_TOOL.setLevel(ILoggingTool.DEBUG); to activate debug messages. + *

+ * + * @return true if debug messages are enabled + */ + private static boolean isDbg() { + return ErtlFunctionalGroupsFinder.LOGGING_TOOL.isDebugEnabled(); } } From 2531267b307239b9eb6895ffed994ef813bc7500 Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Mon, 15 Jan 2024 16:10:20 +0100 Subject: [PATCH 13/27] Added new environment mode to only extract marked atoms; added factory methods; added convenience method for accessing valid atomic numbers --- .../cdk/tools/ErtlFunctionalGroupsFinder.java | 64 +++++++++- .../ErtlFunctionalGroupsFinderUtility.java | 115 +----------------- .../tools/ErtlFunctionalGroupsFinderTest.java | 36 +++++- ...ErtlFunctionalGroupsFinderUtilityTest.java | 36 +----- 4 files changed, 97 insertions(+), 154 deletions(-) diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index 928a81e..8e60722 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -56,9 +56,8 @@ * @version 1.2.1 */ public class ErtlFunctionalGroupsFinder { - /** - * Defines the mode for generalizing functional group environments (default) or keeping them whole. + * Defines the mode for generalizing functional group environments (default), keeping them whole, or only extracting marked atoms. */ public static enum Mode { /** @@ -68,7 +67,12 @@ public static enum Mode { /** * Skips the generalization step. Functional groups will keep their full environment. */ - NO_GENERALIZATION; + NO_GENERALIZATION, + /** + * Functional groups will only consist of atoms marked according to the conditions defined by Ertl, environments + * will be completely ignored. 
+ */ + ONLY_MARKED_ATOMS; } // /** @@ -226,7 +230,8 @@ public ErtlFunctionalGroupsFinder() { // /** * Constructor for ErtlFunctionalGroupsFinder that allows setting the treatment of environments in the identified - * functional groups. Default: environments will be generalized; no generalization: environments will be kept as whole. + * functional groups. Default: environments will be generalized; no generalization: environments will be kept as whole; + * only marked atoms: no environmental atoms whatsoever will be attached to the extracted functional groups. * * @param anEnvMode mode for treating functional group environments (see {@link ErtlFunctionalGroupsFinder.Mode}). */ @@ -235,9 +240,42 @@ public ErtlFunctionalGroupsFinder(Mode anEnvMode) { this.envMode = anEnvMode; } // + /** + * Constructs a new ErtlFunctionalGroupsFinder instance with generalization of returned functional groups turned ON. + * + * @return new ErtlFunctionalGroupsFinder instance that generalizes returned functional groups + */ + public static ErtlFunctionalGroupsFinder newErtlFunctionalGroupsFinderGeneralizingMode() { + ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.DEFAULT); + return tmpEFGF; + } + // + /** + * Constructs a new ErtlFunctionalGroupsFinder instance with generalization of returned functional groups turned OFF. + * The FG will have their full environments. + * + * @return new ErtlFunctionalGroupsFinder instance that does NOT generalize returned functional groups + */ + public static ErtlFunctionalGroupsFinder newErtlFunctionalGroupsFinderFullEnvironmentMode() { + ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.NO_GENERALIZATION); + return tmpEFGF; + } + // + /** + * Constructs a new ErtlFunctionalGroupsFinder instance that extracts only the marked atoms of the functional groups, + * no attached environmental atoms. 
+ * + * @return new ErtlFunctionalGroupsFinder instance that extracts only marked atoms + */ + public static ErtlFunctionalGroupsFinder newErtlFunctionalGroupsFinderOnlyMarkedAtomsMode() { + ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.ONLY_MARKED_ATOMS); + return tmpEFGF; + } + // /** * Allows setting the treatment of functional group environments after extraction. Default: environments will be - * generalized; no generalization: environments will be kept as whole. + * generalized; no generalization: environments will be kept as whole; only marked atoms: no environmental atoms + * whatsoever will be attached to the extracted functional groups. * * @param anEnvMode mode for treating functional group environments (see {@link ErtlFunctionalGroupsFinder.Mode}). */ @@ -255,6 +293,18 @@ public Mode getEnvMode() { return this.envMode; } // + /** + * Returns the unmodifiable set containing all atomic numbers that can be passed on to ErtlFunctionalGroupsFinder.find() + * if input restrictions are enabled(!). + * All other atomic numbers are invalid because they represent metal, metalloid or pseudo ('R') atoms. + *
Analogous to using ErtlFunctionalGroupsFinder.NONMETAL_ATOMIC_NUMBERS. + * + * @return all valid atomic numbers for ErtlFunctionalGroupsFinder.find() if input restrictions are activated + */ + public static Set getValidAtomicNumbers() { + return ErtlFunctionalGroupsFinder.NONMETAL_ATOMIC_NUMBERS; + } + // /** * Find all functional groups in a molecule. The input atom container instance is cloned before processing to leave * the input container intact. @@ -336,7 +386,9 @@ public List find(IAtomContainer aMolecule, boolean aShouldInputB this.expandGeneralizedEnvironments(tmpFunctionalGroupsList); } else if (this.envMode == Mode.NO_GENERALIZATION) { this.expandFullEnvironments(tmpFunctionalGroupsList); - } else { + } else if (this.envMode == Mode.ONLY_MARKED_ATOMS) { + //do nothing + }else { throw new IllegalArgumentException("Unknown mode."); } this.clearCache(); diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java index 0c6cbe9..30d1e48 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java @@ -35,15 +35,12 @@ import org.openscience.cdk.interfaces.IAtomContainerSet; import org.openscience.cdk.interfaces.IAtomType; import org.openscience.cdk.interfaces.IChemObjectBuilder; -import org.openscience.cdk.interfaces.IPseudoAtom; import org.openscience.cdk.smiles.SmiFlavor; import org.openscience.cdk.smiles.SmilesGenerator; import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; import org.openscience.cdk.tools.manipulator.AtomTypeManipulator; import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Objects; @@ -124,17 +121,6 @@ private ErtlFunctionalGroupsFinderUtility() { // // // - /** - * Returns an integer array containing all 
atomic numbers that can be passed on to ErtlFunctionalGroupsFinder.find(). - * All other atomic numbers are invalid because they represent metal, metalloid or pseudo ('R') atoms. - * - * @return all valid atomic numbers for ErtlFunctionalGroupsFinder.find() - */ - public static int[] getValidAtomicNumbers() { - return Arrays.copyOf(ErtlFunctionalGroupsFinderUtility.VALID_ATOMIC_NUMBERS, - ErtlFunctionalGroupsFinderUtility.VALID_ATOMIC_NUMBERS.length); - } - /** * Constructs a CDK MoleculeHashGenerator that is configured to count frequencies of the functional groups * returned by ErtlFunctionalGroupsFinder. It takes elements, bond order sum, and aromaticity of the atoms in @@ -156,27 +142,6 @@ public static MoleculeHashGenerator getFunctionalGroupHashGenerator() { .molecular(); return tmpHashGenerator; } - - /** - * Constructs a new ErtlFunctionalGroupsFinder object with generalization of returned functional groups turned ON. - * - * @return new ErtlFunctionalGroupsFinder object that generalizes returned functional groups - */ - public static ErtlFunctionalGroupsFinder getErtlFunctionalGroupsFinderGeneralizingMode() { - ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.DEFAULT); - return tmpEFGF; - } - - /** - * Constructs a new ErtlFunctionalGroupsFinder object with generalization of returned functional groups turned OFF. - * The FG will contain their full environments. - * - * @return new ErtlFunctionalGroupsFinder object that does NOT generalize returned functional groups - */ - public static ErtlFunctionalGroupsFinder getErtlFunctionalGroupsFinderNotGeneralizingMode() { - ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.NO_GENERALIZATION); - return tmpEFGF; - } // // // @@ -197,7 +162,7 @@ public static boolean isStructureUnconnected(IAtomContainer aMolecule) throws Nu /** * Checks whether the atom count or bond count of the given molecule is zero. 
The ErtlFunctionalGroupsFinder.find() - * method would still accept these molecules but it is not recommended to pass them on (simply makes not much sense). + * method would still accept these molecules, but it is not recommended to pass them on (simply makes not much sense). * * @param aMolecule the molecule to check * @return true, if the atom or bond count of the molecule is zero @@ -707,84 +672,6 @@ public static IAtomContainer applyFiltersAndPreprocessing(IAtomContainer aMolecu // // // - /** - * Extracts functional groups from the given molecule, using the Ertl algorithm / ErtlFunctionalGroupsFinder, but - * only the marked atoms of every functional group are returned. They do not contain their environment (i.e. connected, - * unmarked carbon atoms) and are also not generalized. - * - * @param aMolecule the molecule to extracts functional groups from; it is not cloned in this method! - * @return List of IAtomContainer objects representing the detected functional groups - * @throws NullPointerException if the given atom container is null - * @throws IllegalArgumentException if the given atom container cannot be passed to ErtlFunctionalGroupsFinder; - * @throws CloneNotSupportedException if cloning is not possible - * check methods for filtering and preprocessing in this case - */ - public static List findMarkedAtoms(IAtomContainer aMolecule) throws NullPointerException, IllegalArgumentException, CloneNotSupportedException { - return ErtlFunctionalGroupsFinderUtility.findMarkedAtoms(aMolecule, true); - } - - /** - * Extracts functional groups from the given molecule, using the Ertl algorithm / ErtlFunctionalGroupsFinder, but - * only the marked atoms of every functional group are returned. They do not contain their environment (i.e. connected, - * unmarked carbon atoms) and are also not generalized. - * - * @param aMolecule the molecule to extracts functional groups from; it is not cloned in this method! 
- * @param areSingleAtomsFiltered if false, molecules with bond count 0 but atom count 1 will be processed and not raise - * an IllegalArgumentException - * @return List of IAtomContainer objects representing the detected functional groups - * @throws NullPointerException if the given atom container is null - * @throws IllegalArgumentException if the given atom container cannot be passed to ErtlFunctionalGroupsFinder; - * @throws CloneNotSupportedException if cloning is not possible - * check methods for filtering and preprocessing in this case - */ - public static List findMarkedAtoms(IAtomContainer aMolecule, boolean areSingleAtomsFiltered) throws NullPointerException, IllegalArgumentException, CloneNotSupportedException { - Objects.requireNonNull(aMolecule, "Given molecule is null."); - if (aMolecule.isEmpty()) { - return new ArrayList(0); - } - boolean tmpCanBeFragmented = ErtlFunctionalGroupsFinderUtility.isValidArgumentForFindMethod(aMolecule, areSingleAtomsFiltered); - if (!tmpCanBeFragmented) { - throw new IllegalArgumentException("Given molecule cannot be fragmented but needs to be filtered or preprocessed."); - } - HashMap tmpIdToAtomMap = new HashMap<>(aMolecule.getAtomCount() + 1, 1); - for (int i = 0; i < aMolecule.getAtomCount(); i++) { - IAtom tmpAtom = aMolecule.getAtom(i); - tmpAtom.setProperty("EFGFUtility.INDEX", i); - tmpIdToAtomMap.put(i, tmpAtom); - } - ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.DEFAULT); - List tmpFunctionalGroups = tmpEFGF.find(aMolecule, false); - if (tmpFunctionalGroups.isEmpty()) { - return tmpFunctionalGroups; - } - for (IAtomContainer tmpFunctionalGroup : tmpFunctionalGroups) { - for (int i = 0; i < tmpFunctionalGroup.getAtomCount(); i++) { - IAtom tmpAtom = tmpFunctionalGroup.getAtom(i); - if (Objects.isNull(tmpAtom.getProperty("EFGFUtility.INDEX"))) { - if (tmpAtom instanceof IPseudoAtom && "R".equals(((IPseudoAtom)tmpAtom).getLabel())) { - //atom is a pseudo 
atom added by the EFGF in generalization - tmpFunctionalGroup.removeAtom(tmpAtom); - i = i - 1; - continue; - } else if (tmpAtom.getSymbol().equals("C")){ - //atom is an environmental C added by the EFGF - tmpFunctionalGroup.removeAtom(tmpAtom); - i = i - 1; - continue; - } else if (tmpAtom.getSymbol().equals("H")) { - //atom is an explicit H added by the EFGF - tmpFunctionalGroup.removeAtom(tmpAtom); - i = i - 1; - continue; - } else { - //unknown atom - throw new IllegalArgumentException("Something went wrong, identified unknown added atom."); - } - } - } - } - return tmpFunctionalGroups; - } /** * Replaces the environmental carbon or pseudo-atoms (new IAtom objects) inserted by the EFGF in an identified diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java index e0d5c64..387303c 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java @@ -323,7 +323,7 @@ public void testMetalsMetalloids() throws Exception { AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpTetraethylOrthosilicate); tmpAromaticity.apply(tmpTetraethylOrthosilicate); - ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.DEFAULT); + ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.NO_GENERALIZATION); List tmpFGList = tmpEFGF.find(tmpTetraethylOrthosilicate); System.out.println("Tetraethyl Orthosilicate:"); @@ -341,7 +341,6 @@ public void testMetalsMetalloids() throws Exception { for (IAtomContainer tmpFG : tmpFGList) { System.out.println(tmpSmiGen.create(tmpFG)); } - } //TODO: Clean-up check constraints and add test molecules for these special cases to the testFind#() methods. 
@@ -387,6 +386,39 @@ public void readChebiLite3StarSubset() throws Exception { System.out.println(tmpExceptionsCounter); } + /** + * TODO: more testing necessary + */ + @Test + public void testOnlyMarkedAtoms() throws Exception { + SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); + SmilesGenerator tmpSmiGen = new SmilesGenerator(SmiFlavor.Canonical | SmiFlavor.UseAromaticSymbols); + Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet()); + + IAtomContainer tmpTetraethylOrthosilicate = tmpSmiPar.parseSmiles("CCO[Si](OCC)(OCC)OCC"); + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpTetraethylOrthosilicate); + tmpAromaticity.apply(tmpTetraethylOrthosilicate); + + ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.ONLY_MARKED_ATOMS); + List tmpFGList = tmpEFGF.find(tmpTetraethylOrthosilicate); + + System.out.println("Tetraethyl Orthosilicate:"); + for (IAtomContainer tmpFG : tmpFGList) { + System.out.println(tmpSmiGen.create(tmpFG)); + } + + IAtomContainer tmpCHEMBL1201736 = tmpSmiPar.parseSmiles("CO/N=C(\\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[N+]3(C)CCCC3)CS[C@H]12)c1csc(N)n1.Cl"); + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpCHEMBL1201736); + tmpAromaticity.apply(tmpCHEMBL1201736); + + tmpFGList = tmpEFGF.find(tmpCHEMBL1201736); + + System.out.println("CHEMBL1201736:"); + for (IAtomContainer tmpFG : tmpFGList) { + System.out.println(tmpSmiGen.create(tmpFG)); + } + } + private void testFind(String moleculeSmiles, String[] fGStrings) throws Exception { testFind(moleculeSmiles, fGStrings, new Aromaticity(ElectronDonation.daylight(), Cycles.all())); } diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java index ef35271..724a23f 100644 --- 
a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java @@ -31,7 +31,6 @@ import org.openscience.cdk.smiles.SmilesGenerator; import org.openscience.cdk.smiles.SmilesParser; -import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedList; import java.util.List; @@ -81,7 +80,7 @@ public void testPseudoSmilesGeneration() throws Exception { @Test public void testMoleculeHashGeneratorSettings() throws Exception { SmilesParser tmpSmilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); - ErtlFunctionalGroupsFinder tmpGeneralizingEFGF = ErtlFunctionalGroupsFinderUtility.getErtlFunctionalGroupsFinderGeneralizingMode(); + ErtlFunctionalGroupsFinder tmpGeneralizingEFGF = ErtlFunctionalGroupsFinder.newErtlFunctionalGroupsFinderGeneralizingMode(); MoleculeHashGenerator tmpHashGenerator = ErtlFunctionalGroupsFinderUtility.getFunctionalGroupHashGenerator(); /*Chebi70986, Chebi16238 and Chebi57692 all contain the same functional group with pseudo SMILES code "O=C1N=C(C(=NR)C(=O)N1R)N(R)R", but different hybridizations in the resulting atom containers. 
But their hash @@ -174,7 +173,7 @@ public void testRestorationOfEnvironmentalCarbons() throws Exception { SmilesGenerator tmpSmiGen = new SmilesGenerator(SmiFlavor.Unique); //Adenophostin B, COCONUT ID CNP0214672 IAtomContainer tmpMolecule = tmpSmiPar.parseSmiles("O=C(OCC1OC(OC2C(OC(N3C=NC=4C(=NC=NC43)N)C2OP(=O)(O)O)CO)C(O)C(OP(=O)(O)O)C1OP(=O)(O)O)C"); - ErtlFunctionalGroupsFinder tmpEFGFFullEnv = ErtlFunctionalGroupsFinderUtility.getErtlFunctionalGroupsFinderNotGeneralizingMode(); + ErtlFunctionalGroupsFinder tmpEFGFFullEnv = ErtlFunctionalGroupsFinder.newErtlFunctionalGroupsFinderFullEnvironmentMode(); tmpMolecule = ErtlFunctionalGroupsFinderUtility.applyFiltersAndPreprocessing(tmpMolecule, Aromaticity.cdkLegacy()); List tmpFGList = tmpEFGFFullEnv.find(tmpMolecule, false); System.out.println("FGs with full environment returned by EFGF:"); @@ -192,7 +191,7 @@ public void testRestorationOfEnvironmentalCarbons() throws Exception { System.out.println(tmpSmiGen.create(tmpFG)); } tmpMolecule = tmpSmiPar.parseSmiles("O=C(OCC1OC(OC2C(OC(N3C=NC=4C(=NC=NC43)N)C2OP(=O)(O)O)CO)C(O)C(OP(=O)(O)O)C1OP(=O)(O)O)C"); - ErtlFunctionalGroupsFinder tmpEFGFgeneralized = ErtlFunctionalGroupsFinderUtility.getErtlFunctionalGroupsFinderGeneralizingMode(); + ErtlFunctionalGroupsFinder tmpEFGFgeneralized = ErtlFunctionalGroupsFinder.newErtlFunctionalGroupsFinderGeneralizingMode(); tmpMolecule = ErtlFunctionalGroupsFinderUtility.applyFiltersAndPreprocessing(tmpMolecule, Aromaticity.cdkLegacy()); tmpFGList = tmpEFGFgeneralized.find(tmpMolecule, false); System.out.println("FGs with generalized environment returned by EFGF:"); @@ -235,37 +234,10 @@ public void testOnMolecule() throws Exception { ErtlFunctionalGroupsFinderUtility.perceiveAtomTypesAndConfigureAtoms(tmpMolecule); ErtlFunctionalGroupsFinderUtility.applyAromaticityDetection(tmpMolecule, Aromaticity.cdkLegacy()); Assertions.assertTrue(ErtlFunctionalGroupsFinderUtility.isValidArgumentForFindMethod(tmpMolecule)); - 
ErtlFunctionalGroupsFinder tmpEFGF = ErtlFunctionalGroupsFinderUtility.getErtlFunctionalGroupsFinderGeneralizingMode(); + ErtlFunctionalGroupsFinder tmpEFGF = ErtlFunctionalGroupsFinder.newErtlFunctionalGroupsFinderGeneralizingMode(); List tmpFGList = tmpEFGF.find(tmpMolecule); for (IAtomContainer tmpFG : tmpFGList) { System.out.println(ErtlFunctionalGroupsFinderUtility.createPseudoSmilesCode(tmpFG)); } } - - /** - * Tests the extraction of only atoms marked by the Ertl algorithm as functional groups, implemented in - * ErtlFunctionalGroupsFinderUtility as a third option to "full environment" / "generalized environment". - * - * @throws Exception if anything goes wrong - */ - @Test - public void testFindMarkedAtoms() throws Exception { - SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); - //CHEMBL1201736 - IAtomContainer tmpMolecule = tmpSmiPar.parseSmiles("CO/N=C(\\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[N+]3(C)CCCC3)CS[C@H]12)c1csc(N)n1.Cl"); - tmpMolecule = ErtlFunctionalGroupsFinderUtility.applyFiltersAndPreprocessing(tmpMolecule, Aromaticity.cdkLegacy()); - List tmpFGList = ErtlFunctionalGroupsFinderUtility.findMarkedAtoms(tmpMolecule); - List tmpPseudoSmilesList = new ArrayList<>(6); - for (IAtomContainer tmpFG : tmpFGList) { - String tmpPseudoSmiles = ErtlFunctionalGroupsFinderUtility.createPseudoSmilesCode(tmpFG); - System.out.println(tmpPseudoSmiles); - tmpPseudoSmilesList.add(tmpPseudoSmiles); - } - Assertions.assertTrue(tmpPseudoSmilesList.contains("[N]C(=O)[C]=N[O]")); - Assertions.assertTrue(tmpPseudoSmilesList.contains("[C]=C(C(=O)[O])N([C]=O)[C][S]")); - Assertions.assertTrue(tmpPseudoSmilesList.contains("[N]")); - Assertions.assertTrue(tmpPseudoSmilesList.contains("[S*]")); - Assertions.assertTrue(tmpPseudoSmilesList.contains("[N*]")); - Assertions.assertTrue(tmpPseudoSmilesList.size() == 6); - } } From c755e3f1773aa85ebdc13c166d6dfa9b2eccfa78 Mon Sep 17 00:00:00 2001 From: Jonas Schaub 
<44881147+JonasSchaub@users.noreply.github.com> Date: Mon, 15 Jan 2024 18:23:59 +0100 Subject: [PATCH 14/27] Transferred some utility methods for input testing and preprocessing from utility class to EFGF; --- .../cdk/tools/ErtlFunctionalGroupsFinder.java | 209 +++++++++++++++--- .../ErtlFunctionalGroupsFinderUtility.java | 184 ++------------- ...ErtlFunctionalGroupsFinderUtilityTest.java | 6 +- 3 files changed, 202 insertions(+), 197 deletions(-) diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index 8e60722..edef7bf 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -20,7 +20,9 @@ package org.openscience.cdk.tools; +import org.openscience.cdk.aromaticity.Aromaticity; import org.openscience.cdk.graph.ConnectedComponents; +import org.openscience.cdk.graph.ConnectivityChecker; import org.openscience.cdk.graph.GraphUtil; import org.openscience.cdk.graph.GraphUtil.EdgeToBondMap; import org.openscience.cdk.interfaces.IAtom; @@ -30,6 +32,7 @@ import org.openscience.cdk.interfaces.ILonePair; import org.openscience.cdk.interfaces.IPseudoAtom; import org.openscience.cdk.interfaces.ISingleElectron; +import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; import java.util.ArrayDeque; import java.util.ArrayList; @@ -95,7 +98,6 @@ private static enum EnvironmentalCType { * or aliphatic and also contains a clone of its connecting bond. */ private class EnvironmentalC { - /** * Indicates whether carbon atom is aromatic or aliphatic. 
*/ @@ -183,8 +185,10 @@ public IBond createBond(IAtom aTargetAtom, IAtom anEnvCAtom) { public static final String CARBONYL_C_MARKER = "EFGF-Carbonyl-C"; // /** - * Set of atomic numbers that are accepted in the input molecule if the strict input restrictions are activated - * (excludes metal and metalloid elements, only organic elements included). + * Set of atomic numbers of nonmetal elements, namely hydrogen, carbon, nitrogen, oxygen, phosphorus, sulfur, selenium, + * halogens (fluorine, chlorine, bromine, iodine), and noble gases (helium, neon, argon, krypton, xenon, radon). + * Atoms of these elements are exclusively accepted in the input molecule if(!) the strict input restrictions are + * activated (turned off by default). */ public static final Set NONMETAL_ATOMIC_NUMBERS = Set.of(1, 2, 6, 7, 8, 9, 10, 15, 16, 17, 18, 34, 35, 36, 53, 54, 86); // @@ -293,18 +297,6 @@ public Mode getEnvMode() { return this.envMode; } // - /** - * Returns the unmodifiable set containing all atomic numbers that can be passed on to ErtlFunctionalGroupsFinder.find() - * if input restrictions are enabled(!). - * All other atomic numbers are invalid because they represent metal, metalloid or pseudo ('R') atoms. - *
Analogous to using ErtlFunctionalGroupsFinder.NONMETAL_ATOMIC_NUMBERS. - * - * @return all valid atomic numbers for ErtlFunctionalGroupsFinder.find() if input restrictions are activated - */ - public static Set getValidAtomicNumbers() { - return ErtlFunctionalGroupsFinder.NONMETAL_ATOMIC_NUMBERS; - } - // /** * Find all functional groups in a molecule. The input atom container instance is cloned before processing to leave * the input container intact. @@ -341,7 +333,7 @@ public List find(IAtomContainer aMolecule) throws CloneNotSuppor public List find (IAtomContainer aMolecule, boolean aShouldInputBeCloned) throws CloneNotSupportedException { return this.find(aMolecule, aShouldInputBeCloned, false); } - + // /** * Find all functional groups in a molecule. * @@ -352,7 +344,8 @@ public List find (IAtomContainer aMolecule, boolean aShouldInput * leave the input container intact * @param anAreInputRestrictionsApplied if true, the input must consist of one connected structure and may not * contain charged atoms, metals or metalloids; an IllegalArgumentException will - * be thrown otherwise + * be thrown otherwise; see convenience methods in this class for detecting + * illegal input structures for this case * @throws CloneNotSupportedException if cloning is not possible * @throws IllegalArgumentException if input restrictions are applied and the given molecule does not fulfill them * @return a list with all functional groups found in the molecule @@ -394,7 +387,168 @@ public List find(IAtomContainer aMolecule, boolean aShouldInputB this.clearCache(); return tmpFunctionalGroupsList; } - + // + /** + * Returns the unmodifiable set containing the atomic numbers that can be passed on to ErtlFunctionalGroupsFinder.find() + * if(!) input restrictions are enabled (turned off by default). 
These nonmetal elements include + * hydrogen, carbon, nitrogen, oxygen, phosphorus, sulfur, selenium, halogens (fluorine, chlorine, bromine, iodine), + * and noble gases (helium, neon, argon, krypton, xenon, radon). + * All other atomic numbers represent metal, metalloid, or pseudo ('R') atoms. + *
Convenience method analogous to using ErtlFunctionalGroupsFinder.NONMETAL_ATOMIC_NUMBERS directly. + * + * @return all valid atomic numbers for ErtlFunctionalGroupsFinder.find() if input restrictions are activated + */ + public static Set getNonmetalAtomicNumbers() { + return ErtlFunctionalGroupsFinder.NONMETAL_ATOMIC_NUMBERS; + } + // + /** + * Checks whether a given atom is a metal, metalloid, or pseudo atom judging by its atomic number. These atoms + * cannot be passed on to ErtlFunctionalGroupsFinder.find() + * if(!) input restrictions are enabled (turned off by default). + * + * @param anAtom the atom to check + * @return true, if the atomic number is not in the set of nonmetal atomic numbers + * @throws NullPointerException if the given atom or its atomic number is 'null' + */ + public static boolean isMetalMetalloidOrPseudoAtom(IAtom anAtom) throws NullPointerException { + Objects.requireNonNull(anAtom, "Given atom is 'null'."); + Objects.requireNonNull(anAtom.getAtomicNumber(), "Atomic number is 'null'."); + return !ErtlFunctionalGroupsFinder.isNonmetal(anAtom); + } + // + /** + * Iterates through all atoms in the given molecule and checks them for metal, metalloid, and pseudo ("R") atoms. If this + * method returns 'true', the molecule cannot be passed on to ErtlFunctionalGroupsFinder.find() + * if(!) input restrictions are enabled (turned off by default). + *
This method scales linearly with O(n) with n: number of atoms in the given + * molecule. + * + * @param aMolecule the molecule to check + * @return true, if the molecule contains one or more metal, metalloid, or pseudo ("R") atoms + * @throws NullPointerException if the given molecule (or one of its atoms) is 'null' + */ + public static boolean containsMetalMetalloidOrPseudoAtom(IAtomContainer aMolecule) throws NullPointerException { + Objects.requireNonNull(aMolecule, "Given molecule is 'null'."); + boolean tmpIsAtomicNumberInvalid; + for (IAtom tmpAtom : aMolecule.atoms()) { + // throws NullPointerException if tmpAtom or its atomic number is 'null' + tmpIsAtomicNumberInvalid = ErtlFunctionalGroupsFinder.isMetalMetalloidOrPseudoAtom(tmpAtom); + if (tmpIsAtomicNumberInvalid) { + return true; + } + } + return false; + } + // + /** + * Checks whether a given atom is charged. These atoms cannot be passed on to ErtlFunctionalGroupsFinder.find() + * if(!) input restrictions are enabled (turned off by default). + * + * @param anAtom the atom to check + * @return true, if the atom is charged + * @throws NullPointerException if the given atom or its formal charge is 'null' + */ + public static boolean isCharged(IAtom anAtom) throws NullPointerException { + Objects.requireNonNull(anAtom, "Given atom is 'null'."); + Integer tmpFormalCharge = anAtom.getFormalCharge(); + Objects.requireNonNull(tmpFormalCharge, "Formal charge is 'null'."); + return (tmpFormalCharge.intValue() != 0); + } + // + /** + * Iterates through all atoms in the given molecule and checks whether they are charged. If this + * method returns 'true', the molecule cannot be passed on to ErtlFunctionalGroupsFinder.find() + * if(!) input restrictions are enabled (turned off by default). + *
This method scales linearly with O(n) with n: number of atoms in the given + * molecule. + * + * @param aMolecule the molecule to check + * @return true, if the molecule contains one or more charged atoms + * @throws NullPointerException if the given molecule (or one of its atoms) is 'null' + */ + public static boolean containsChargedAtom(IAtomContainer aMolecule) throws NullPointerException { + Objects.requireNonNull(aMolecule, "Given molecule is 'null'."); + int tmpAtomCount = aMolecule.getAtomCount(); + boolean tmpIsAtomCharged; + for (IAtom tmpAtom : aMolecule.atoms()) { + //Throws NullPointerException if tmpAtom is 'null' + tmpIsAtomCharged = ErtlFunctionalGroupsFinder.isCharged(tmpAtom); + if (tmpIsAtomCharged) { + return true; + } + } + return false; + } + // + /** + * Checks whether the given molecule consists of two or more unconnected structures, e.g. ion and counter-ion. This + * would make it unfit to be passed to ErtlFunctionalGroupsFinder.find() if(!) the input restrictions are turned on (turned off by default). + * Note: this is a convenience method basically applying ConnectivityChecker.isConnected(aMolecule);. + * + * @param aMolecule the molecule to check + * @return true, if the molecule consists of two or more unconnected structures + * @throws NullPointerException if the given molecule is 'null' + */ + public static boolean isStructureUnconnected(IAtomContainer aMolecule) throws NullPointerException { + //Developer's note: the private checkConstraints() method is not used here because it is intertwined with the + // find() method for speed-up; but it basically does the same. + Objects.requireNonNull(aMolecule, "Given molecule is 'null'"); + boolean tmpIsConnected = ConnectivityChecker.isConnected(aMolecule); + return (!tmpIsConnected); + } + // + /** + * Checks whether the given molecule represented by an atom container can be passed on to the + * ErtlFunctionalGroupsFinder.find() method without problems. + *
This method will return false if the molecule contains any metal, metalloid, pseudo, or charged atoms or consists of + * multiple unconnected parts. + * + * @param aMolecule the molecule to check + * @return true if the given molecule is a valid parameter for ErtlFunctionalGroupsFinder.find() method if(!) the input restrictions are turned on (turned off by default) + * @throws NullPointerException if parameter is 'null' + * @throws IllegalArgumentException if the input molecule causes any other type of exception while processing + */ + public static boolean isValidInputMoleculeWithRestrictionsTurnedOn(IAtomContainer aMolecule) throws NullPointerException, IllegalArgumentException { + Objects.requireNonNull(aMolecule, "Given molecule is null."); + boolean tmpIsValid; + try { + tmpIsValid = !(ErtlFunctionalGroupsFinder.containsMetalMetalloidOrPseudoAtom(aMolecule) + || ErtlFunctionalGroupsFinder.containsChargedAtom(aMolecule) + || ErtlFunctionalGroupsFinder.isStructureUnconnected(aMolecule)); + } catch (Exception anException) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.warn(anException); + throw new IllegalArgumentException(anException); + } + return tmpIsValid; + } + // + /** + * Applies the necessary preprocessing for functional group detection. Atom types are set and aromaticity detected + * in the input molecule. + *
NOTE: This changes properties and flags in the given atom container instance. If you + * want to retain your object unchanged for future calculations, use the IAtomContainer's + * clone() method. + * + * @param aMolecule the molecule to process + * @param anAromaticityModel the aromaticity model to apply to the molecule in preprocessing; Note: The chosen + * ElectronDonation model can massively influence the extracted functional groups of a molecule + * when using ErtlFunctionalGroupsFinder! + * @throws NullPointerException if any parameter is null + * @throws IllegalArgumentException if the input molecule causes any other type of exception while processing + */ + public static void applyPreprocessing(IAtomContainer aMolecule, Aromaticity anAromaticityModel) throws NullPointerException, IllegalArgumentException { + Objects.requireNonNull(aMolecule, "Given atom container is 'null'."); + Objects.requireNonNull(anAromaticityModel, "Given aromaticity model is 'null'."); + try { + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(aMolecule); + anAromaticityModel.apply(aMolecule); + } catch (Exception anException) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.warn(anException); + throw new IllegalArgumentException(anException); + } + } + // /** * Clear caches related to the input molecule. Note, these are not proper caches, there are no results cached. Here, * only data taken from the input molecule is saved for only one execution of the find() method, to facilitate @@ -407,7 +561,7 @@ private void clearCache() { this.aromaticHeteroAtomIndicesToIsInGroupBoolMapCache = null; this.markedAtomToConnectedEnvCMapCache = null; } - + // /** * Mark all atoms and store them in a set for further processing. * @@ -987,13 +1141,16 @@ private boolean isHeteroatom(IAtom anAtom) { // /** * Checks whether the given atom is from an element in the organic subset, i.e. not a metal or metalloid atom. - * See the public constant set of non-metal atomic numbers declared in this class.
+ * See the public constant set of non-metal atomic numbers declared in this class. Given as static here because it is + * used by public static utility methods. * * @param anAtom atom to check * @return true if the given atom is organic and not a metal or metalloid atom */ - private boolean isNonmetal(IAtom anAtom) { - return ErtlFunctionalGroupsFinder.NONMETAL_ATOMIC_NUMBERS.contains(anAtom.getAtomicNumber()); + private static boolean isNonmetal(IAtom anAtom) { + Integer tmpAtomicNumber = anAtom.getAtomicNumber(); + int tmpAtomicNumberInt = tmpAtomicNumber.intValue(); + return ErtlFunctionalGroupsFinder.NONMETAL_ATOMIC_NUMBERS.contains(tmpAtomicNumberInt); } // /** @@ -1094,14 +1251,16 @@ private List partitionIntoGroups(IAtomContainer aSourceContainer */ private void checkConstraints(IAtomContainer aMolecule) throws IllegalArgumentException { for (IAtom tmpAtom : aMolecule.atoms()) { - if (!Objects.isNull(tmpAtom.getFormalCharge()) && tmpAtom.getFormalCharge() != 0) { + if (ErtlFunctionalGroupsFinder.isCharged(tmpAtom)) { throw new IllegalArgumentException("Input molecule must not contain any charges."); } - if (!this.isNonmetal(tmpAtom)) { - throw new IllegalArgumentException("Input molecule must not contain metal or metalloid atoms."); + if (!ErtlFunctionalGroupsFinder.isNonmetal(tmpAtom)) { + throw new IllegalArgumentException("Input molecule must not contain metal, metalloid, or pseudo atoms."); } } Objects.requireNonNull(this.adjListCache, "Adjacency list cache must already be set-up for this check!"); + //Developer's note: this method does not use the public isStructureUnconnected() method because it is intertwined with the + // find() method for speed-up; but it basically does the same.
ConnectedComponents tmpConnectedComponents = new ConnectedComponents(this.adjListCache); if (tmpConnectedComponents.nComponents() != 1) { throw new IllegalArgumentException("Input molecule must consist of only a single connected structure."); diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java index 30d1e48..60b5b27 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java @@ -41,7 +41,6 @@ import org.openscience.cdk.tools.manipulator.AtomTypeManipulator; import java.util.ArrayList; -import java.util.HashSet; import java.util.List; import java.util.Objects; import java.util.logging.Level; @@ -49,7 +48,7 @@ /** * This class gives utility methods for using ErtlFunctionalGroupsFinder, - * a CDK-based implementation, published here, of the + * a CDK-based implementation, published here of the * Ertl algorithm for automated functional groups detection. * The methods of this class are basically public static re-implementations of the routines used for testing and * evaluating the ErtlFunctionalGroupsFinder, as described in the publication. @@ -87,34 +86,12 @@ public int encode(IAtom anAtom, IAtomContainer aContainer) { //
// // - /** - * Atomic numbers that ErtlFunctionalGroupsFinder accepts, see getValidAtomicNumbers() - */ - private static final int[] VALID_ATOMIC_NUMBERS = new int[] {1,2,6,7,8,9,10,15,16,17,18,34,35,36,53,54,86}; - - /** - * Atomic numbers that ErtlFunctionalGroupsFinder accepts, loaded into a hash set for quick determination; set is - * filled in static initializer (see below) - */ - private static final HashSet VALID_ATOMIC_NUMBERS_SET = new HashSet<>(20, 1); - /** * Logger of this class */ private static final Logger LOGGER = Logger.getLogger(ErtlFunctionalGroupsFinderUtility.class.getName()); // // - // - /** - * Static initializer that sets up hash maps/sets used by static methods. - */ - static { - for (int i : ErtlFunctionalGroupsFinderUtility.VALID_ATOMIC_NUMBERS) { - ErtlFunctionalGroupsFinderUtility.VALID_ATOMIC_NUMBERS_SET.add(i); - } - } - // - // private ErtlFunctionalGroupsFinderUtility() { } @@ -145,21 +122,6 @@ public static MoleculeHashGenerator getFunctionalGroupHashGenerator() { //
// // - /** - * Checks whether the given molecule consists of two or more unconnected structures, e.g. ion and counter-ion. This - * would make it unfit to be passed to ErtlFunctionalGroupsFinder.find(). This can be fixed by preprocessing, see - * selectBiggestUnconnectedComponent() below. - * - * @param aMolecule the molecule to check - * @return true, if the molecule consists of two or more unconnected structures - * @throws NullPointerException if the given molecule is 'null' - */ - public static boolean isStructureUnconnected(IAtomContainer aMolecule) throws NullPointerException { - Objects.requireNonNull(aMolecule, "Given molecule is 'null'"); - boolean tmpIsConnected = ConnectivityChecker.isConnected(aMolecule); - return (!tmpIsConnected); - } - /** * Checks whether the atom count or bond count of the given molecule is zero. The ErtlFunctionalGroupsFinder.find() * method would still accept these molecules, but it is not recommended to pass them on (simply makes not much sense). @@ -175,96 +137,6 @@ public static boolean isAtomOrBondCountZero(IAtomContainer aMolecule) throws Nul return (tmpAtomCount == 0 || tmpBondCount == 0); } - /** - * Iterates through all atoms in the given molecule and checks whether they are charged. If this method returns - * 'true', the molecule cannot be passed on to ErtlFunctionalGroupsFinder.find() but should be filtered or the - * charges neutralized (see neutralizeCharges() below). - *
If no charged atoms are found, this method scales linearly with O(n) with n: number of atoms in the given - * molecule. - * - * @param aMolecule the molecule to check - * @return true, if the molecule contains one or more charged atoms - * @throws NullPointerException if the given molecule (or one of its atoms) is 'null' - */ - public static boolean isMoleculeCharged(IAtomContainer aMolecule) throws NullPointerException { - Objects.requireNonNull(aMolecule, "Given molecule is 'null'."); - int tmpAtomCount = aMolecule.getAtomCount(); - if (tmpAtomCount == 0) { - return false; - } - Iterable tmpAtoms = aMolecule.atoms(); - boolean tmpIsAtomCharged; - for (IAtom tmpAtom : tmpAtoms) { - //Throws NullPointerException if tmpAtom is 'null' - tmpIsAtomCharged = ErtlFunctionalGroupsFinderUtility.isAtomCharged(tmpAtom); - if (tmpIsAtomCharged) { - return true; - } - } - return false; - } - - /** - * Checks whether a given atom is charged. - * - * @param anAtom the atom to check - * @return true, if the atom is charged - * @throws NullPointerException if the given atom or its formal charge is 'null' - */ - public static boolean isAtomCharged(IAtom anAtom) throws NullPointerException { - Objects.requireNonNull(anAtom, "Given atom is 'null'."); - Integer tmpFormalCharge = anAtom.getFormalCharge(); - Objects.requireNonNull(tmpFormalCharge, "Formal charge is 'null'."); - return (tmpFormalCharge.intValue() != 0); - } - - /** - * Checks whether a given atom is a metal, metalloid or pseudo atom judging by its atomic number. Atoms with invalid - * atomic numbers (metal, metalloid or pseudo ('R') atoms) cannot be passed on to ErtlFunctionalGroupsFinder.find() - * but should be filtered. 
- * - * @param anAtom the atom to check - * @return true, if the atomic number is invalid or 'null' - * @throws NullPointerException if the given atom or its atomic number is 'null' - */ - public static boolean isAtomicNumberInvalid(IAtom anAtom) throws NullPointerException { - Objects.requireNonNull(anAtom, "Given atom is 'null'."); - Integer tmpAtomicNumber = anAtom.getAtomicNumber(); - Objects.requireNonNull(tmpAtomicNumber, "Atomic number is 'null'."); - int tmpAtomicNumberInt = tmpAtomicNumber.intValue(); - boolean tmpIsAtomicNumberValid = ErtlFunctionalGroupsFinderUtility.VALID_ATOMIC_NUMBERS_SET.contains(tmpAtomicNumberInt); - return !tmpIsAtomicNumberValid; - } - - /** - * Iterates through all atoms in the given molecule and checks whether their atomic numbers are invalid. If this - * method returns 'true', the molecule cannot be passed on to ErtlFunctionalGroupsFinder.find() but should be - * filtered. - *
If no invalid atoms are found, this method scales linearly with O(n) with n: number of atoms in the given - * molecule. - * - * @param aMolecule the molecule to check - * @return true, if the molecule contains one or more atoms with invalid atomic numbers - * @throws NullPointerException if the given molecule (or one of its atoms) is 'null' - */ - public static boolean containsInvalidAtomicNumbers(IAtomContainer aMolecule) throws NullPointerException { - Objects.requireNonNull(aMolecule, "Given molecule is 'null'."); - int tmpAtomCount = aMolecule.getAtomCount(); - if (tmpAtomCount == 0) { - return false; - } - Iterable tmpAtoms = aMolecule.atoms(); - boolean tmpIsAtomicNumberInvalid; - for (IAtom tmpAtom : tmpAtoms) { - //Throws NullPointerException if tmpAtom is 'null' - tmpIsAtomicNumberInvalid = ErtlFunctionalGroupsFinderUtility.isAtomicNumberInvalid(tmpAtom); - if (tmpIsAtomicNumberInvalid) { - return true; - } - } - return false; - } - /** * Checks whether the given molecule represented by an atom container should NOT be passed on to the * ErtlFunctionalGroupsFinder.find() method but instead be filtered. @@ -273,9 +145,6 @@ public static boolean containsInvalidAtomicNumbers(IAtomContainer aMolecule) thr *
If this method returns false, this does NOT mean the molecule can be passed on to find() without a problem. It * still might need to be preprocessed first. * - * @see ErtlFunctionalGroupsFinderUtility#isValidArgumentForFindMethod(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#shouldBePreprocessed(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#applyFiltersAndPreprocessing(IAtomContainer, Aromaticity) * @param aMolecule the atom container to check * @return true if the given atom container should be discarded * @throws NullPointerException if parameter is 'null' @@ -293,9 +162,6 @@ public static boolean shouldBeFiltered(IAtomContainer aMolecule) throws NullPoin *
If this method returns false, this does NOT mean the molecule can be passed on to find() without a problem. It * still might need to be preprocessed first. * - * @see ErtlFunctionalGroupsFinderUtility#isValidArgumentForFindMethod(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#shouldBePreprocessed(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#applyFiltersAndPreprocessing(IAtomContainer, Aromaticity) * @param aMolecule the atom container to check * @param areSingleAtomsFiltered if false, molecules with bond count 0 but atom count 1 will return false (do not filter) * @return true if the given atom container should be discarded @@ -306,10 +172,10 @@ public static boolean shouldBeFiltered(IAtomContainer aMolecule, boolean areSing boolean tmpShouldBeFiltered; try { if (areSingleAtomsFiltered) { - tmpShouldBeFiltered = (ErtlFunctionalGroupsFinderUtility.containsInvalidAtomicNumbers(aMolecule) + tmpShouldBeFiltered = (ErtlFunctionalGroupsFinder.containsMetalMetalloidOrPseudoAtom(aMolecule) || ErtlFunctionalGroupsFinderUtility.isAtomOrBondCountZero(aMolecule)); } else { - tmpShouldBeFiltered = (ErtlFunctionalGroupsFinderUtility.containsInvalidAtomicNumbers(aMolecule) + tmpShouldBeFiltered = (ErtlFunctionalGroupsFinder.containsMetalMetalloidOrPseudoAtom(aMolecule) || aMolecule.getAtomCount() == 0); } @@ -328,11 +194,6 @@ public static boolean shouldBeFiltered(IAtomContainer aMolecule, boolean areSing *
It is advised to check via shouldBeFiltered() whether the given molecule should be discarded anyway before * calling this function. * - * @see ErtlFunctionalGroupsFinderUtility#shouldBeFiltered(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#isValidArgumentForFindMethod(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#applyFiltersAndPreprocessing(IAtomContainer, Aromaticity) - * @see ErtlFunctionalGroupsFinderUtility#neutralizeCharges(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#selectBiggestUnconnectedComponent(IAtomContainer) * @param aMolecule the atom container to check * @return true is the given molecule needs to be preprocessed * @throws NullPointerException if parameter is 'null' @@ -341,8 +202,8 @@ public static boolean shouldBePreprocessed(IAtomContainer aMolecule) throws Null Objects.requireNonNull(aMolecule, "Given molecule is null."); boolean tmpNeedsPreprocessing; try { - tmpNeedsPreprocessing = (ErtlFunctionalGroupsFinderUtility.isMoleculeCharged(aMolecule) - || ErtlFunctionalGroupsFinderUtility.isStructureUnconnected(aMolecule)); + tmpNeedsPreprocessing = (ErtlFunctionalGroupsFinder.containsChargedAtom(aMolecule) + || ErtlFunctionalGroupsFinder.isStructureUnconnected(aMolecule)); } catch (Exception anException) { ErtlFunctionalGroupsFinderUtility.LOGGER.log(Level.WARNING, anException.toString() + " Molecule ID: " + ErtlFunctionalGroupsFinderUtility.getIDForLogging(aMolecule), @@ -358,10 +219,6 @@ public static boolean shouldBePreprocessed(IAtomContainer aMolecule) throws Null *
This method will return false if the molecule contains any metal, metalloid, pseudo, or charged atoms, contains * multiple unconnected parts, or has an atom or bond count of zero. * - * @see ErtlFunctionalGroupsFinder#find(IAtomContainer, boolean) - * @see ErtlFunctionalGroupsFinderUtility#shouldBeFiltered(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#shouldBePreprocessed(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#applyFiltersAndPreprocessing(IAtomContainer, Aromaticity) * @param aMolecule the molecule to check * @return true if the given molecule is a valid parameter for ErtlFunctionalGroupsFinder.find() method * @throws NullPointerException if parameter is 'null' @@ -377,10 +234,6 @@ public static boolean isValidArgumentForFindMethod(IAtomContainer aMolecule) thr * multiple unconnected parts, or has an atom or bond count of zero. If the second parameter is set to "false", single atom molecules * (bond count is 0) are accepted and not recommended to be filtered if they fulfill the other requirements. 
* - * @see ErtlFunctionalGroupsFinder#find(IAtomContainer, boolean) - * @see ErtlFunctionalGroupsFinderUtility#shouldBeFiltered(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#shouldBePreprocessed(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#applyFiltersAndPreprocessing(IAtomContainer, Aromaticity) * @param aMolecule the molecule to check * @param areSingleAtomsFiltered if false, molecules with bond count 0 but atom count 1 will return true (do not filter) * @return true if the given molecule is a valid parameter for ErtlFunctionalGroupsFinder.find() method @@ -391,15 +244,15 @@ public static boolean isValidArgumentForFindMethod(IAtomContainer aMolecule, boo boolean tmpIsValid; try { if (areSingleAtomsFiltered) { - tmpIsValid = !(ErtlFunctionalGroupsFinderUtility.containsInvalidAtomicNumbers(aMolecule) + tmpIsValid = !(ErtlFunctionalGroupsFinder.containsMetalMetalloidOrPseudoAtom(aMolecule) || ErtlFunctionalGroupsFinderUtility.isAtomOrBondCountZero(aMolecule) - || ErtlFunctionalGroupsFinderUtility.isMoleculeCharged(aMolecule) - || ErtlFunctionalGroupsFinderUtility.isStructureUnconnected(aMolecule)); + || ErtlFunctionalGroupsFinder.containsChargedAtom(aMolecule) + || ErtlFunctionalGroupsFinder.isStructureUnconnected(aMolecule)); } else { - tmpIsValid = !(ErtlFunctionalGroupsFinderUtility.containsInvalidAtomicNumbers(aMolecule) + tmpIsValid = !(ErtlFunctionalGroupsFinder.containsMetalMetalloidOrPseudoAtom(aMolecule) || aMolecule.getAtomCount() == 0 - || ErtlFunctionalGroupsFinderUtility.isMoleculeCharged(aMolecule) - || ErtlFunctionalGroupsFinderUtility.isStructureUnconnected(aMolecule)); + || ErtlFunctionalGroupsFinder.containsChargedAtom(aMolecule) + || ErtlFunctionalGroupsFinder.isStructureUnconnected(aMolecule)); } } catch (Exception anException) { ErtlFunctionalGroupsFinderUtility.LOGGER.log(Level.SEVERE, @@ -477,7 +330,6 @@ public static void neutralizeCharges(IAtomContainer aMolecule) throws NullPointe * @throws NullPointerException if 
anAtom or aParentMolecule is 'null' * @throws CDKException if the atom is not part of the molecule or no matching atom type can be determined for the * atom or there is a problem with adding the implicit hydrogen atoms. - * @see ErtlFunctionalGroupsFinderUtility#neutralizeCharges(IAtomContainer) */ public static void neutralizeCharges(IAtom anAtom, IAtomContainer aParentMolecule) throws NullPointerException, CDKException { Objects.requireNonNull(anAtom, "Given atom is 'null'."); @@ -586,9 +438,6 @@ public static boolean applyAromaticityDetection(IAtomContainer aMolecule, Aromat * clone() method. *
NOTE2: The returned IAtomContainer object is the same as the one given as parameter! * - * @see ErtlFunctionalGroupsFinder#find(IAtomContainer, boolean) - * @see ErtlFunctionalGroupsFinderUtility#shouldBeFiltered(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#shouldBePreprocessed(IAtomContainer) * @param aMolecule the molecule to check and process * @param anAromaticityModel the aromaticity model to apply to the molecule in preprocessing; Note: The chosen * ElectronDonation model can massively influence the extracted function groups of a molecule when using @@ -614,12 +463,9 @@ public static IAtomContainer applyFiltersAndPreprocessing(IAtomContainer aMolecu * clone() method. *
NOTE2: The returned IAtomContainer object is the same as the one given as parameter! * - * @see ErtlFunctionalGroupsFinder#find(IAtomContainer, boolean) - * @see ErtlFunctionalGroupsFinderUtility#shouldBeFiltered(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#shouldBePreprocessed(IAtomContainer) * @param aMolecule the molecule to check and process * @param anAromaticityModel the aromaticity model to apply to the molecule in preprocessing; Note: The chosen - * ElectronDonation model can massively influence the extracted function groups of a molecule when using + * ElectronDonation model can massively influence the extracted functional groups of a molecule when using * ErtlFunctionGroupsFinder! * @param areSingleAtomsFiltered if false, molecules with bond count 0 but atom count 1 will be processed and * not return null @@ -645,17 +491,17 @@ public static IAtomContainer applyFiltersAndPreprocessing(IAtomContainer aMolecu } } //From structures containing two or more unconnected structures (e.g. 
ions) choose the largest structure - boolean tmpIsUnconnected = ErtlFunctionalGroupsFinderUtility.isStructureUnconnected(aMolecule); + boolean tmpIsUnconnected = ErtlFunctionalGroupsFinder.isStructureUnconnected(aMolecule); if (tmpIsUnconnected) { aMolecule = ErtlFunctionalGroupsFinderUtility.selectBiggestUnconnectedComponent(aMolecule); } //Filter - boolean tmpContainsInvalidAtoms = ErtlFunctionalGroupsFinderUtility.containsInvalidAtomicNumbers(aMolecule); + boolean tmpContainsInvalidAtoms = ErtlFunctionalGroupsFinder.containsMetalMetalloidOrPseudoAtom(aMolecule); if (tmpContainsInvalidAtoms) { return null; } //Neutralize charges if there are any - boolean tmpIsCharged = ErtlFunctionalGroupsFinderUtility.isMoleculeCharged(aMolecule); + boolean tmpIsCharged = ErtlFunctionalGroupsFinder.containsChargedAtom(aMolecule); if (tmpIsCharged) { ErtlFunctionalGroupsFinderUtility.neutralizeCharges(aMolecule); } diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java index 724a23f..2eb0a3b 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java @@ -221,10 +221,10 @@ public void testOnMolecule() throws Exception { SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); //CHEMBL1201736 IAtomContainer tmpMolecule = tmpSmiPar.parseSmiles("CO/N=C(\\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[N+]3(C)CCCC3)CS[C@H]12)c1csc(N)n1.Cl"); - Assertions.assertTrue(ErtlFunctionalGroupsFinderUtility.isStructureUnconnected(tmpMolecule)); - Assertions.assertTrue(ErtlFunctionalGroupsFinderUtility.isMoleculeCharged(tmpMolecule)); + Assertions.assertTrue(ErtlFunctionalGroupsFinder.isStructureUnconnected(tmpMolecule)); + Assertions.assertTrue(ErtlFunctionalGroupsFinder.containsChargedAtom(tmpMolecule)); 
Assertions.assertFalse(ErtlFunctionalGroupsFinderUtility.isAtomOrBondCountZero(tmpMolecule)); - Assertions.assertFalse(ErtlFunctionalGroupsFinderUtility.containsInvalidAtomicNumbers(tmpMolecule)); + Assertions.assertFalse(ErtlFunctionalGroupsFinder.containsMetalMetalloidOrPseudoAtom(tmpMolecule)); Assertions.assertFalse(ErtlFunctionalGroupsFinderUtility.shouldBeFiltered(tmpMolecule)); Assertions.assertTrue(ErtlFunctionalGroupsFinderUtility.shouldBePreprocessed(tmpMolecule)); Assertions.assertFalse(ErtlFunctionalGroupsFinderUtility.isValidArgumentForFindMethod(tmpMolecule)); From 8b36bc13f16ada39cbd229af0dbec240b422217f Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Tue, 16 Jan 2024 10:22:01 +0100 Subject: [PATCH 15/27] Additional documentation; --- .../cdk/tools/ErtlFunctionalGroupsFinder.java | 130 ++++++++++++------ 1 file changed, 85 insertions(+), 45 deletions(-) diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index edef7bf..1303dac 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -50,13 +50,43 @@ * Finds and extracts a molecule's functional groups in a purely rule-based manner. * This class implements Peter Ertl's algorithm for the automated detection and extraction * of functional groups in organic molecules - * ([Ertl P. An algorithm to identify functional groups in organic molecules. J Cheminform. 2017; 9:36.]). - *

- * Note: this implementation is not thread-safe. Each parallel thread should have its own instance of this class. - *

+ * ([Ertl P. An algorithm to identify functional groups in organic molecules. J Cheminform. 2017; 9:36.]) + * and has been described in a scientific publication + * ([Fritsch, S., Neumann, S., Schaub, J. et al. ErtlFunctionalGroupsFinder: automated rule-based functional group detection with the Chemistry Development Kit (CDK). J Cheminform. 2019; 11:37.]). + *

+ *

In brief, the algorithm iterates through all atoms in the input molecule and marks hetero atoms and specific carbon atoms + * (i.a. those in non-aromatic double or triple bonds etc.) as being part of a functional group. Connected groups of marked + * atoms are extracted as separate functional groups, together with their unmarked, "environmental" carbon atoms. These + * environments can be important, e.g. to differentiate an alcohol from a phenol, but are less important in other cases. + * To account for this, Ertl also devised a "generalization" scheme that generalizes the functional group environments + * in a way that accounts for their varying significance in different cases. Most environmental atoms are exchanged with + * pseudo ("R") atoms there. All these functionalities are available in ErtlFunctionalGroupsFinder. Additionally, only + * the marked atoms completely without their environments can be extracted. + *

+ *

To apply functional group detection to an input molecule, its atom types need to be set and aromaticity needs + * to be detected beforehand: + *
+ * //Prepare input
+ * SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance());
+ * IAtomContainer tmpInputMol = tmpSmiPar.parseSmiles("C[C@@H]1CN(C[C@H](C)N1)C2=C(C(=C3C(=C2F)N(C=C(C3=O)C(=O)O)C4CC4)N)F"); //PubChem CID 5257
+ * AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpInputMol);
+ * Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet());
+ * tmpAromaticity.apply(tmpInputMol);
+ * //Identify functional groups
+ * ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(); //default: generalization turned on
+ * List{@literal <}IAtomContainer{@literal >} tmpFunctionalGroupsList = tmpEFGF.find(tmpInputMol);
+ * 
+ * In order to only identify functional groups in standardised, organic structures, ErtlFunctionalGroupsFinder can + * be configured to only accept molecules that do *not* contain any metal, metalloid, or pseudo (R) atoms or formal charges. + * Also structures consisting of more than one unconnected component (e.g. ion and counter-ion) are not accepted if the + * strict input restrictions are turned on. This can be done via a boolean parameter in a variant of the central find() method. + * To identify molecules that need to be filtered from the input set or preprocessed in this use case, convenience methods are + * available in this class. + *

+ *

Note: this implementation is not thread-safe. Each parallel thread should have its own instance of this class. * * @author Sebastian Fritsch, Jonas Schaub - * @version 1.2.1 + * @version 1.3 */ public class ErtlFunctionalGroupsFinder { /** @@ -330,7 +360,7 @@ public List find(IAtomContainer aMolecule) throws CloneNotSuppor * @throws CloneNotSupportedException if cloning is not possible * @return a list with all functional groups found in the molecule */ - public List find (IAtomContainer aMolecule, boolean aShouldInputBeCloned) throws CloneNotSupportedException { + public List find(IAtomContainer aMolecule, boolean aShouldInputBeCloned) throws CloneNotSupportedException { return this.find(aMolecule, aShouldInputBeCloned, false); } // @@ -388,6 +418,32 @@ public List find(IAtomContainer aMolecule, boolean aShouldInputB return tmpFunctionalGroupsList; } // + /** + * Applies the always necessary preprocessing for functional group detection. Atom types are set and aromaticity detected + * in the input molecule. + *
NOTE: This changes properties and flags in the given atom container instance. If you + * want to retain your object unchanged for future calculations, use the IAtomContainer's + * clone() method. + * + * @param aMolecule the molecule to process + * @param anAromaticityModel the aromaticity model to apply to the molecule in preprocessing; Note: The chosen + * ElectronDonation model can massively influence the extracted functional groups of a molecule + * when using ErtlFunctionGroupsFinder! + * @throws NullPointerException if any parameter is null + * @throws IllegalArgumentException if the input molecule causes any other type of exception while processing + */ + public static void applyPreprocessing(IAtomContainer aMolecule, Aromaticity anAromaticityModel) throws NullPointerException, IllegalArgumentException { + Objects.requireNonNull(aMolecule, "Given atom container is 'null'."); + Objects.requireNonNull(anAromaticityModel, "Given aromaticity model is 'null'."); + try { + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(aMolecule); + anAromaticityModel.apply(aMolecule); + } catch (Exception anException) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.warn(anException); + throw new IllegalArgumentException(anException); + } + } + // /** * Returns the unmodifiable set containing the atomic numbers that can be passed on to ErtlFunctionalGroupsFinder.find() * if(!) input restrictions are enabled (turned off by default). These nonmetal elements include @@ -404,23 +460,26 @@ public static Set getNonmetalAtomicNumbers() { // /** * Checks whether a given atom is a metal, metalloid, or pseudo atom judging by its atomic number. These atoms - * cannot be passed on to ErtlFunctionalGroupsFinder.find() - * if(!) input restrictions are enabled (turned off by default). + * cannot be passed on to ErtlFunctionalGroupsFinder.find() if(!) input restrictions are enabled (turned off by default). 
* * @param anAtom the atom to check * @return true, if the atomic number is not in the nonmetal atomic numbers set or 'null' - * @throws NullPointerException if the given atom or its atomic number is 'null' + * @throws NullPointerException if the given atom is 'null' */ public static boolean isMetalMetalloidOrPseudoAtom(IAtom anAtom) throws NullPointerException { Objects.requireNonNull(anAtom, "Given atom is 'null'."); - Objects.requireNonNull(anAtom.getAtomicNumber(), "Atomic number is 'null'."); + if (Objects.isNull(anAtom.getAtomicNumber())) { + return true; + } return !ErtlFunctionalGroupsFinder.isNonmetal(anAtom); } // /** * Iterates through all atoms in the given molecule and checks them for metal, metalloid, and pseudo ("R") atoms. If this * method returns 'true', the molecule cannot be passed on to ErtlFunctionalGroupsFinder.find() - * if(!) input restrictions are enabled (turned off by default). + * if(!) input restrictions are enabled (turned off by default). If you are using the strict input restrictions to + * only identify functional groups in standardised, organic structures, you should filter the molecules where this + * method returns true from your input set. *
This method scales linearly with O(n) with n: number of atoms in the given * molecule. * @@ -432,7 +491,7 @@ public static boolean containsMetalMetalloidOrPseudoAtom(IAtomContainer aMolecul Objects.requireNonNull(aMolecule, "Given molecule is 'null'."); boolean tmpIsAtomicNumberInvalid; for (IAtom tmpAtom : aMolecule.atoms()) { - // throws NullPointerException if tmpAtom or its atomic number is 'null' + // throws NullPointerException if tmpAtom is 'null' tmpIsAtomicNumberInvalid = ErtlFunctionalGroupsFinder.isMetalMetalloidOrPseudoAtom(tmpAtom); if (tmpIsAtomicNumberInvalid) { return true; @@ -447,29 +506,32 @@ public static boolean containsMetalMetalloidOrPseudoAtom(IAtomContainer aMolecul * * @param anAtom the atom to check * @return true, if the atom is charged - * @throws NullPointerException if the given atom or its formal charge is 'null' + * @throws NullPointerException if the given atom is 'null' */ public static boolean isCharged(IAtom anAtom) throws NullPointerException { Objects.requireNonNull(anAtom, "Given atom is 'null'."); Integer tmpFormalCharge = anAtom.getFormalCharge(); - Objects.requireNonNull(tmpFormalCharge, "Formal charge is 'null'."); + if (Objects.isNull(tmpFormalCharge)) { + return false; + } return (tmpFormalCharge.intValue() != 0); } // /** * Iterates through all atoms in the given molecule and checks whether they are charged. If this * method returns 'true', the molecule cannot be passed on to ErtlFunctionalGroupsFinder.find() - * if(!) input restrictions are enabled (turned off by default). + * if(!) input restrictions are enabled (turned off by default). If you are using the strict input restrictions to + * only identify functional groups in standardised, organic structures, you can try to neutralise the charges in the + * molecules where this method returns true by standardisation routines. *
This method scales linearly with O(n) with n: number of atoms in the given * molecule. * * @param aMolecule the molecule to check * @return true, if the molecule contains one or more charged atoms - * @throws NullPointerException if the given molecule (or one of its atoms) is 'null' + * @throws NullPointerException if the given molecule is 'null' */ public static boolean containsChargedAtom(IAtomContainer aMolecule) throws NullPointerException { Objects.requireNonNull(aMolecule, "Given molecule is 'null'."); - int tmpAtomCount = aMolecule.getAtomCount(); boolean tmpIsAtomCharged; for (IAtom tmpAtom : aMolecule.atoms()) { //Throws NullPointerException if tmpAtom is 'null' @@ -484,6 +546,9 @@ public static boolean containsChargedAtom(IAtomContainer aMolecule) throws NullP /** * Checks whether the given molecule consists of two or more unconnected structures, e.g. ion and counter-ion. This * would make it unfit to be passed to ErtlFunctionalGroupsFinder.find() if(!) the input restrictions are turned on (turned off by default). + * If you are using the strict input restrictions to only identify functional groups in standardised, organic structures, + * you can try to select the biggest connected component in the input atom containers where this method returns true + * and only pass that to ErtlFunctionalGroupsFinder. * Note: this is a convenience method basically applying ConnectivityChecker.isConnected(aMolecule);. * * @param aMolecule the molecule to check @@ -500,9 +565,10 @@ public static boolean isStructureUnconnected(IAtomContainer aMolecule) throws Nu // /** * Checks whether the given molecule represented by an atom container can be passed on to the - * ErtlFunctionalGroupsFinder.find() method without problems. + * ErtlFunctionalGroupsFinder.find() method without problems even if(!) the input restrictions are turned on (turned off by default). *
This method will return false if the molecule contains any metal, metalloid, pseudo, or charged atoms or consists of - * multiple unconnected parts. + * multiple unconnected parts. Some of these issues (charges and multiple unconnected components) can be solved by + * respective standardisation routines. * * @param aMolecule the molecule to check * @return true if the given molecule is a valid parameter for ErtlFunctionalGroupsFinder.find() method if(!) the input restrictions are turned on (turned off by default) @@ -523,32 +589,6 @@ public static boolean isValidInputMoleculeWithRestrictionsTurnedOn(IAtomContaine return tmpIsValid; } // - /** - * Applies the necessary preprocessing for functional group detection. Atom types are set and aromaticity detected - * in the input molecule. - *
NOTE: This changes properties and flags in the given atom container instance. If you - * want to retain your object unchanged for future calculations, use the IAtomContainer's - * clone() method. - * - * @param aMolecule the molecule to process - * @param anAromaticityModel the aromaticity model to apply to the molecule in preprocessing; Note: The chosen - * ElectronDonation model can massively influence the extracted functional groups of a molecule - * when using ErtlFunctionGroupsFinder! - * @throws NullPointerException if any parameter is null - * @throws IllegalArgumentException if the input molecule causes any other type of exception while processing - */ - public static void applyPreprocessing(IAtomContainer aMolecule, Aromaticity anAromaticityModel) throws NullPointerException, IllegalArgumentException { - Objects.requireNonNull(aMolecule, "Given atom container is 'null'."); - Objects.requireNonNull(anAromaticityModel, "Given aromaticity model is 'null'."); - try { - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(aMolecule); - anAromaticityModel.apply(aMolecule); - } catch (Exception anException) { - ErtlFunctionalGroupsFinder.LOGGING_TOOL.warn(anException); - throw new IllegalArgumentException(anException); - } - } - // /** * Clear caches related to the input molecule. Note, these are not proper caches, there are no results cached. 
Here, * only data taken from the input molecule is saved for only one execution of the find() method, to facilitate From 8682e11d9e7f0e9336c52b6e19cae126e6ab80c8 Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Tue, 16 Jan 2024 17:05:21 +0100 Subject: [PATCH 16/27] Overhaul of EFGFTest; --- .gitignore | 3 +- .../cdk/tools/ErtlFunctionalGroupsFinder.java | 8 +- .../tools/ErtlFunctionalGroupsFinderTest.java | 786 ++++++++++++------ 3 files changed, 520 insertions(+), 277 deletions(-) diff --git a/.gitignore b/.gitignore index 06dfc56..b8d2a55 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,5 @@ build/ .gradle # EFGF evaluation test output -ErtlFunctionalGroupsFinderEvaluationTest_Output/ \ No newline at end of file +ErtlFunctionalGroupsFinderEvaluationTest_Output/ +ChEBI_complete.sdf \ No newline at end of file diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index 1303dac..35d7224 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -81,7 +81,10 @@ * Also structures consisting of more than one unconnected component (e.g. ion and counter-ion) are not accepted if the * strict input restrictions are turned on. This can be done via a boolean parameter in a variant of the central find() method. * To identify molecules that need to be filtered from the input set or preprocessed in this use case, convenience methods are - * available in this class. + * available in this class. Please note that structural properties like formal charges and the others mentioned above + * are not expected to cause issues (exceptions) when processed by this class, but they are not explicitly regarded by + * the Ertl algorithm and hence this implementation, too. 
They might therefore cause unexpected behaviour in functional + * group identification. For example, a charge is not listed as a reason to mark a carbon atom. *

*

Note: this implementation is not thread-safe. Each parallel thread should have its own instance of this class. * @@ -759,6 +762,7 @@ private void markAtoms(IAtomContainer aMolecule) { try { tmpConnectedAtom = aMolecule.getAtom(this.adjListCache[idx][0]); } catch(ArrayIndexOutOfBoundsException anException) { + //TODO: this happens too often to ignore, investigate! ErtlFunctionalGroupsFinder.LOGGING_TOOL.warn("Explicit H was included in atom count but not correctly in adjacency list"); break; } @@ -1302,7 +1306,7 @@ private void checkConstraints(IAtomContainer aMolecule) throws IllegalArgumentEx //Developer's note: this method does not use the public isStructureUnconnected() method because it is intertwined with the // find() method for speed-up; but it basically does the same. ConnectedComponents tmpConnectedComponents = new ConnectedComponents(this.adjListCache); - if (tmpConnectedComponents.nComponents() != 1) { + if (tmpConnectedComponents.nComponents() > 1) { throw new IllegalArgumentException("Input molecule must consist of only a single connected structure."); } } diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java index 387303c..f876930 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java @@ -21,6 +21,7 @@ package org.openscience.cdk.tools; import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Assumptions; import org.junit.jupiter.api.Test; import org.openscience.cdk.AtomContainer; import org.openscience.cdk.PseudoAtom; @@ -52,193 +53,347 @@ /** * Test for ErtlFunctionalGroupsFinder. * - * @author Sebastian Fritsch - * @version 1.2 + * @author Sebastian Fritsch, Jonas Schaub + * @version 1.3 */ public class ErtlFunctionalGroupsFinderTest { - + /** + * Constructor. 
+ */ public ErtlFunctionalGroupsFinderTest() { super(); } - + // + /** + * Example code to be used in the GitHub wiki of the project. + * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void gitHubWikiTest() throws Exception { + //Prepare input + SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); + IAtomContainer tmpInputMol = tmpSmiPar.parseSmiles("C[C@@H]1CN(C[C@H](C)N1)C2=C(C(=C3C(=C2F)N(C=C(C3=O)C(=O)O)C4CC4)N)F"); //PubChem CID 5257 + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpInputMol); + Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet()); + tmpAromaticity.apply(tmpInputMol); + //Identify functional groups + ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(); //default: generalization turned on + List tmpFunctionalGroupsList = tmpEFGF.find(tmpInputMol); + SmilesGenerator tmpSmiGen = new SmilesGenerator(SmiFlavor.Canonical | SmiFlavor.UseAromaticSymbols); + for (IAtomContainer tmpFunctionalGroup : tmpFunctionalGroupsList) { + String tmpSmilesString = tmpSmiGen.create(tmpFunctionalGroup); + System.out.println(tmpSmilesString); + } + //non-generalized functional groups + System.out.println("----------------"); + tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.NO_GENERALIZATION); + tmpFunctionalGroupsList = tmpEFGF.find(tmpInputMol); + for (IAtomContainer tmpFunctionalGroup : tmpFunctionalGroupsList) { + String tmpSmilesString = tmpSmiGen.create(tmpFunctionalGroup); + System.out.println(tmpSmilesString); + } + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. 
+ * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test public void testFind1() throws Exception { - String moleculeSmiles = "Cc1cc(C)nc(NS(=O)(=O)c2ccc(N)cc2)n1"; - String[] expectedFGs = new String[] {"[R]N([R])S(=O)(=O)[R]", "[c]N(H)H", "NarR3", "NarR3"}; - testFind(moleculeSmiles, expectedFGs); + String tmpMoleculeSmiles = "Cc1cc(C)nc(NS(=O)(=O)c2ccc(N)cc2)n1"; + String[] tmpExpectedFGs = new String[] {"[R]N([R])S(=O)(=O)[R]", "[c]N(H)H", "NarR3", "NarR3"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test - public void testFind2() throws Exception{ - String moleculeSmiles = "NC(=N)c1ccc(\\\\C=C\\\\c2ccc(cc2O)C(=N)N)cc1"; - String[] expectedFGs = new String[] {"[R]N=C-N([R])[R]", "[C]=[C]", "[c]OH", "[R]N=C-N([R])[R]"}; - testFind(moleculeSmiles, expectedFGs); + public void testFind2() throws Exception { + String tmpMoleculeSmiles = "NC(=N)c1ccc(\\\\C=C\\\\c2ccc(cc2O)C(=N)N)cc1"; + String[] tmpExpectedFGs = new String[] {"[R]N=C-N([R])[R]", "[C]=[C]", "[c]OH", "[R]N=C-N([R])[R]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. 
+ * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test public void testFind3() throws Exception { - String moleculeSmiles = "CC(=O)Nc1nnc(s1)S(=O)(=O)N"; - String[] expectedFGs = new String[] {"[R]N([R])C(=O)[R]", "[R]S(=O)(=O)N([R])[R]", "NarR3", "NarR3", "SarR2"}; - testFind(moleculeSmiles, expectedFGs); + String tmpMoleculeSmiles = "CC(=O)Nc1nnc(s1)S(=O)(=O)N"; + String[] tmpExpectedFGs = new String[] {"[R]N([R])C(=O)[R]", "[R]S(=O)(=O)N([R])[R]", "NarR3", "NarR3", "SarR2"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test public void testFind4() throws Exception { - String moleculeSmiles = "NS(=O)(=O)c1cc2c(NCNS2(=O)=O)cc1Cl"; - String[] expectedFGs = new String[] {"[R]S(=O)(=O)N([R])[R]", "[R]S(=O)(=O)N([R])[C]N([R])[R]", "[R]Cl"}; - testFind(moleculeSmiles, expectedFGs); + String tmpMoleculeSmiles = "NS(=O)(=O)c1cc2c(NCNS2(=O)=O)cc1Cl"; + String[] tmpExpectedFGs = new String[] {"[R]S(=O)(=O)N([R])[R]", "[R]S(=O)(=O)N([R])[C]N([R])[R]", "[R]Cl"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. 
+ * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test public void testFind5() throws Exception { - String moleculeSmiles = "CNC1=Nc2ccc(Cl)cc2C(=N(=O)C1)c3ccccc3"; - String[] expectedFGs = new String[] {"[R]N([R])[C]=N[R]", "[R]Cl", "[R]N(=O)=[C]"}; - testFind(moleculeSmiles, expectedFGs); + String tmpMoleculeSmiles = "CNC1=Nc2ccc(Cl)cc2C(=N(=O)C1)c3ccccc3"; + String[] tmpExpectedFGs = new String[] {"[R]N([R])[C]=N[R]", "[R]Cl", "[R]N(=O)=[C]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test public void testFind6() throws Exception { - String moleculeSmiles = "Cc1onc(c2ccccc2)c1C(=O)N[C@H]3[C@H]4SC(C)(C)[C@@H](N4C3=O)C(=O)O"; - String[] expectedFGs = new String[] {"O=C([R])N([R])[R]", "O=C([R])N([R])[C]S[R]", "O=C([R])OH", "OarR2", "NarR3"}; - testFind(moleculeSmiles, expectedFGs); + String tmpMoleculeSmiles = "Cc1onc(c2ccccc2)c1C(=O)N[C@H]3[C@H]4SC(C)(C)[C@@H](N4C3=O)C(=O)O"; + String[] tmpExpectedFGs = new String[] {"O=C([R])N([R])[R]", "O=C([R])N([R])[C]S[R]", "O=C([R])OH", "OarR2", "NarR3"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. 
+ * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test public void testFind7() throws Exception { - String moleculeSmiles = "Clc1ccccc1C2=NCC(=O)Nc3ccc(cc23)N(=O)=O"; - String[] expectedFGs = new String[] {"[R]Cl", "[R]N=[C]", "[R]C(=O)N([R])[R]", "O=N([R])=O"}; - testFind(moleculeSmiles, expectedFGs); + String tmpMoleculeSmiles = "Clc1ccccc1C2=NCC(=O)Nc3ccc(cc23)N(=O)=O"; + String[] tmpExpectedFGs = new String[] {"[R]Cl", "[R]N=[C]", "[R]C(=O)N([R])[R]", "O=N([R])=O"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test public void testFind8() throws Exception { - String moleculeSmiles = "COc1cc(cc(C(=O)NCC2CCCN2CC=C)c1OC)S(=O)(=O)N"; - String[] expectedFGs = new String[] {"[R]O[R]", "[R]N([R])C(=O)[R]", "N([R])([R])[R]", "[C]=[C]", "[R]O[R]", "[R]S(=O)(=O)N([R])[R]"}; - testFind(moleculeSmiles, expectedFGs); + String tmpMoleculeSmiles = "COc1cc(cc(C(=O)NCC2CCCN2CC=C)c1OC)S(=O)(=O)N"; + String[] tmpExpectedFGs = new String[] {"[R]O[R]", "[R]N([R])C(=O)[R]", "N([R])([R])[R]", "[C]=[C]", "[R]O[R]", "[R]S(=O)(=O)N([R])[R]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. 
+ * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test public void testFind9() throws Exception { - String moleculeSmiles = "Cc1ccc(Cl)c(Nc2ccccc2C(=O)O)c1Cl"; - String[] expectedFGs = new String[] {"[R]Cl", "[R]N(H)[R]", "O=C(OH)[R]", "[R]Cl"}; - testFind(moleculeSmiles, expectedFGs); + String tmpMoleculeSmiles = "Cc1ccc(Cl)c(Nc2ccccc2C(=O)O)c1Cl"; + String[] tmpExpectedFGs = new String[] {"[R]Cl", "[R]N(H)[R]", "O=C(OH)[R]", "[R]Cl"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test public void testFind10() throws Exception { - String moleculeSmiles = "Clc1ccc2Oc3ccccc3N=C(N4CCNCC4)c2c1"; - String[] expectedFGs = new String[] {"[R]Cl", "[R]O[R]", "[R]N([R])[C]=N[R]", "[R]N([H])[R]"}; - testFind(moleculeSmiles, expectedFGs); + String tmpMoleculeSmiles = "Clc1ccc2Oc3ccccc3N=C(N4CCNCC4)c2c1"; + String[] tmpExpectedFGs = new String[] {"[R]Cl", "[R]O[R]", "[R]N([R])[C]=N[R]", "[R]N([H])[R]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. 
+ * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test public void testFind11() throws Exception { - String moleculeSmiles = "FC(F)(F)CN1C(=O)CN=C(c2ccccc2)c3cc(Cl)ccc13"; - String[] expectedFGs = new String[] {"[R]F", "[R]F", "[R]F", "O=C([R])N([R])[R]", "[R]N=[C]", "[R]Cl"}; - testFind(moleculeSmiles, expectedFGs); + String tmpMoleculeSmiles = "FC(F)(F)CN1C(=O)CN=C(c2ccccc2)c3cc(Cl)ccc13"; + String[] tmpExpectedFGs = new String[] {"[R]F", "[R]F", "[R]F", "O=C([R])N([R])[R]", "[R]N=[C]", "[R]Cl"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test public void testFind12() throws Exception { - String moleculeSmiles = "OC[C@H]1O[C@H](C[C@@H]1O)n2cnc3[C@H](O)CNC=Nc23";; - String[] expectedFGs = new String[] {"[C]O[H]", "[R]O[R]", "[C]OH", "[C]OH", "[R]N=CN([R])[R]", "NarR3", "NarR3"}; - testFind(moleculeSmiles, expectedFGs); + String tmpMoleculeSmiles = "OC[C@H]1O[C@H](C[C@@H]1O)n2cnc3[C@H](O)CNC=Nc23"; + String[] tmpExpectedFGs = new String[] {"[C]O[H]", "[R]O[R]", "[C]OH", "[C]OH", "[R]N=CN([R])[R]", "NarR3", "NarR3"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. 
+ * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test public void testFind13() throws Exception { - String moleculeSmiles = "CCN[C@H]1C[C@H](C)S(=O)(=O)c2sc(cc12)S(=O)(=O)N"; - String[] expectedFGs = new String[] {"[R]N([R])H", "O=S(=O)([R])[R]", "[R]S(=O)(=O)N([R])[R]", "SarR2"}; - testFind(moleculeSmiles, expectedFGs); + String tmpMoleculeSmiles = "CCN[C@H]1C[C@H](C)S(=O)(=O)c2sc(cc12)S(=O)(=O)N"; + String[] tmpExpectedFGs = new String[] {"[R]N([R])H", "O=S(=O)([R])[R]", "[R]S(=O)(=O)N([R])[R]", "SarR2"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test public void testFind14() throws Exception { - String moleculeSmiles = "C[C@@H](O)[C@@H]1[C@H]2[C@@H](C)C(=C(N2C1=O)C(=O)O)S[C@@H]3CN[C@@H](C3)C(=O)N(C)C"; - String[] expectedFGs = new String[] {"[C]O[H]", "O=C([R])N([R])C(C(=O)(OH))=[C]S[R]", "[R]N(H)[R]", "[R]N([R])C([R])=O"}; - testFind(moleculeSmiles, expectedFGs); + String tmpMoleculeSmiles = "C[C@@H](O)[C@@H]1[C@H]2[C@@H](C)C(=C(N2C1=O)C(=O)O)S[C@@H]3CN[C@@H](C3)C(=O)N(C)C"; + String[] tmpExpectedFGs = new String[] {"[C]O[H]", "O=C([R])N([R])C(C(=O)(OH))=[C]S[R]", "[R]N(H)[R]", "[R]N([R])C([R])=O"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. 
+ * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test public void testFind15() throws Exception { - String moleculeSmiles = "C[C@@H]1CN(C[C@H](C)N1)c2c(F)c(N)c3C(=O)C(=CN(C4CC4)c3c2F)C(=O)O"; - String[] expectedFGs = new String[] {"[R]N([R])[R]", "[R]N([H])[R]", "[R]F", "[c]N(H)H", "[c]=O", "[R]F", "[R]C(=O)OH", "NarR3"}; - testFind(moleculeSmiles, expectedFGs); + String tmpMoleculeSmiles = "C[C@@H]1CN(C[C@H](C)N1)c2c(F)c(N)c3C(=O)C(=CN(C4CC4)c3c2F)C(=O)O"; + String[] tmpExpectedFGs = new String[] {"[R]N([R])[R]", "[R]N([H])[R]", "[R]F", "[c]N(H)H", "[c]=O", "[R]F", "[R]C(=O)OH", "NarR3"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test public void testFind16() throws Exception { - String moleculeSmiles = "CC(=CCC1C(=O)N(N(C1=O)c2ccccc2)c3ccccc3)C"; - String[] expectedFGs = new String[] {"[C]=[C]", "[R]C(=O)N([R])N([R])C(=O)[R]"}; - testFind(moleculeSmiles, expectedFGs); + String tmpMoleculeSmiles = "CC(=CCC1C(=O)N(N(C1=O)c2ccccc2)c3ccccc3)C"; + String[] tmpExpectedFGs = new String[] {"[C]=[C]", "[R]C(=O)N([R])N([R])C(=O)[R]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. 
+ * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test public void testFind17() throws Exception { - String moleculeSmiles = "Clc1ccc2N=C3NC(=O)CN3Cc2c1Cl"; - String[] expectedFGs = new String[] {"Cl[R]", "[R]N=C(N([R])[R])N([R])C(=O)[R]", "Cl[R]"}; - testFind(moleculeSmiles, expectedFGs); + String tmpMoleculeSmiles = "Clc1ccc2N=C3NC(=O)CN3Cc2c1Cl"; + String[] tmpExpectedFGs = new String[] {"Cl[R]", "[R]N=C(N([R])[R])N([R])C(=O)[R]", "Cl[R]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test public void testFind18() throws Exception { - String moleculeSmiles = "CC(=O)N[C@@H]1[C@@H](NC(=N)N)C=C(O[C@H]1[C@H](O)[C@H](O)CO)C(=O)O"; - String[] expectedFGs = new String[] {"[R]N([R])C(=O)[R]", "[R]N([R])C(=N[R])N([R])[R]", "O=C(OH)C(=[C])O[R]" , "[C]OH", "[C]OH", "[C]OH"}; - testFind(moleculeSmiles, expectedFGs); + String tmpMoleculeSmiles = "CC(=O)N[C@@H]1[C@@H](NC(=N)N)C=C(O[C@H]1[C@H](O)[C@H](O)CO)C(=O)O"; + String[] tmpExpectedFGs = new String[] {"[R]N([R])C(=O)[R]", "[R]N([R])C(=N[R])N([R])[R]", "O=C(OH)C(=[C])O[R]" , "[C]OH", "[C]OH", "[C]OH"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. 
+ * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test public void testFind19() throws Exception { - String moleculeSmiles = "C[C@H](O)[C@H](O)[C@H]1CNc2nc(N)nc(O)c2N1"; - String[] expectedFGs = new String[] {"[C]OH", "[C]OH", "[R]N(H)[R]" , "[c]N(H)H", "[c]OH", "[R]N(H)[R]", "NarR3", "NarR3"}; - testFind(moleculeSmiles, expectedFGs); + String tmpMoleculeSmiles = "C[C@H](O)[C@H](O)[C@H]1CNc2nc(N)nc(O)c2N1"; + String[] tmpExpectedFGs = new String[] {"[C]OH", "[C]OH", "[R]N(H)[R]" , "[c]N(H)H", "[c]OH", "[R]N(H)[R]", "NarR3", "NarR3"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ @Test public void testFind20() throws Exception { - String moleculeSmiles = "N[C@@H]1CCCCN(C1)c2c(Cl)cc3C(=O)C(=CN(C4CC4)c3c2Cl)C(=O)O"; - String[] expectedFGs = new String[] {"[C]N([H])[H]", "[R]N([R])[R]", "[R]Cl" , "[c]=O", "[R]Cl", "[R]C(=O)OH", "NarR3"}; - testFind(moleculeSmiles, expectedFGs); + String tmpMoleculeSmiles = "N[C@@H]1CCCCN(C1)c2c(Cl)cc3C(=O)C(=CN(C4CC4)c3c2Cl)C(=O)O"; + String[] tmpExpectedFGs = new String[] {"[C]N([H])[H]", "[R]N([R])[R]", "[R]Cl" , "[c]=O", "[R]Cl", "[R]C(=O)OH", "NarR3"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - + // /** - * Example code to be used in the GitHub wiki of the project. 
- * - * @throws Exception if anything goes wrong - * @author Jonas Schaub + * TODO: more testing necessary */ @Test - public void gitHubWikiTest() throws Exception { - //Prepare input + public void testOnlyMarkedAtoms() throws Exception { SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); - IAtomContainer tmpInputMol = tmpSmiPar.parseSmiles("C[C@@H]1CN(C[C@H](C)N1)C2=C(C(=C3C(=C2F)N(C=C(C3=O)C(=O)O)C4CC4)N)F"); //PubChem CID 5257 - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpInputMol); - Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet()); - tmpAromaticity.apply(tmpInputMol); - //Identify functional groups - ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(); //default: generalization turned on - List tmpFunctionalGroupsList = tmpEFGF.find(tmpInputMol); SmilesGenerator tmpSmiGen = new SmilesGenerator(SmiFlavor.Canonical | SmiFlavor.UseAromaticSymbols); - for (IAtomContainer tmpFunctionalGroup : tmpFunctionalGroupsList) { - String tmpSmilesString = tmpSmiGen.create(tmpFunctionalGroup); - System.out.println(tmpSmilesString); + Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet()); + + IAtomContainer tmpTetraethylOrthosilicate = tmpSmiPar.parseSmiles("CCO[Si](OCC)(OCC)OCC"); + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpTetraethylOrthosilicate); + tmpAromaticity.apply(tmpTetraethylOrthosilicate); + + ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.ONLY_MARKED_ATOMS); + List tmpFGList = tmpEFGF.find(tmpTetraethylOrthosilicate); + + System.out.println("Tetraethyl Orthosilicate:"); + for (IAtomContainer tmpFG : tmpFGList) { + System.out.println(tmpSmiGen.create(tmpFG)); } - //non-generalized functional groups - System.out.println("----------------"); - tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.NO_GENERALIZATION); - 
tmpFunctionalGroupsList = tmpEFGF.find(tmpInputMol); - for (IAtomContainer tmpFunctionalGroup : tmpFunctionalGroupsList) { - String tmpSmilesString = tmpSmiGen.create(tmpFunctionalGroup); - System.out.println(tmpSmilesString); + + IAtomContainer tmpCHEMBL1201736 = tmpSmiPar.parseSmiles("CO/N=C(\\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[N+]3(C)CCCC3)CS[C@H]12)c1csc(N)n1.Cl"); + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpCHEMBL1201736); + tmpAromaticity.apply(tmpCHEMBL1201736); + + tmpFGList = tmpEFGF.find(tmpCHEMBL1201736); + + System.out.println("CHEMBL1201736:"); + for (IAtomContainer tmpFG : tmpFGList) { + System.out.println(tmpSmiGen.create(tmpFG)); } } - + // /** - * TODO: Investigate code for possible problems with charged atoms? + * Tests functional group identification on example molecules that have formal charges. * - * TODO: Test carbon ions. - * - * @throws Exception + * @throws Exception if anything goes wrong + * @author Jonas Schaub */ @Test public void testChargedMolecules() throws Exception { @@ -268,12 +423,56 @@ public void testChargedMolecules() throws Exception { for (IAtomContainer tmpFG : tmpFGList) { System.out.println(tmpSmiGen.create(tmpFG)); } - } + IAtomContainer tmpTetraMethylAmmonium = tmpSmiPar.parseSmiles("C[N+](C)(C)C"); + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpTetraMethylAmmonium); + tmpAromaticity.apply(tmpTetraMethylAmmonium); + + tmpFGList = tmpEFGF.find(tmpTetraMethylAmmonium); + + System.out.println("Tetramethylammonium:"); + for (IAtomContainer tmpFG : tmpFGList) { + System.out.println(tmpSmiGen.create(tmpFG)); + } + + IAtomContainer tmpCarbeniumIonInBetaPositionToBr = tmpSmiPar.parseSmiles("c1ccccc1[CH+]C(Br)C"); + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpCarbeniumIonInBetaPositionToBr); + tmpAromaticity.apply(tmpCarbeniumIonInBetaPositionToBr); + + tmpEFGF.setEnvMode(ErtlFunctionalGroupsFinder.Mode.NO_GENERALIZATION); + tmpFGList = 
tmpEFGF.find(tmpCarbeniumIonInBetaPositionToBr); + + //Result: carbenium ion is ignored since a charge is not a reason to mark carbon atom + System.out.println("Carbenium ion in beta position to Br:"); + for (IAtomContainer tmpFG : tmpFGList) { + System.out.println(tmpSmiGen.create(tmpFG)); + } + + IAtomContainer tmpCarbeniumIonInAlphaPositionToBr = tmpSmiPar.parseSmiles("c1ccccc1[C+](Br)C"); + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpCarbeniumIonInAlphaPositionToBr); + tmpAromaticity.apply(tmpCarbeniumIonInAlphaPositionToBr); + + tmpFGList = tmpEFGF.find(tmpCarbeniumIonInAlphaPositionToBr, false); + + //Result: carbenium ion is extracted as environmental carbon and replaced by a new atom instance as all env carbon atoms in EFGF; so it lost its charge! + System.out.println("Carbenium ion in alpha position to Br:"); + for (IAtomContainer tmpFG : tmpFGList) { + System.out.println(tmpSmiGen.create(tmpFG)); + } + + //restore carbenium ion using the EFGFUtility: + ErtlFunctionalGroupsFinderUtility.restoreOriginalEnvironmentalCarbons(tmpFGList, tmpCarbeniumIonInAlphaPositionToBr, false, false, SilentChemObjectBuilder.getInstance()); + System.out.println("Environmental carbon atoms restored on carbenium in alpha position to Br:"); + for (IAtomContainer tmpFG : tmpFGList) { + System.out.println(tmpSmiGen.create(tmpFG)); + } + } + // /** - * TODO: Investigate code for possible problems with disconnected structures? + * Tests functional group identification on example molecules that consist of more than one disconnected structure. * - * @throws Exception + * @throws Exception if anything goes wrong + * @author Jonas Schaub */ @Test public void testDisconnectedMolecules() throws Exception { @@ -304,14 +503,15 @@ public void testDisconnectedMolecules() throws Exception { System.out.println(tmpSmiGen.create(tmpFG)); } } - + // /** - * + * Tests functional group identification on example molecules that contain metal or metalloid atoms. 
* * Note: all atoms are marked as hetero atoms by EFGF that are not H or C. So, metals and metalloids get treated like - * any other hetero atom and should not cause problems. + * any other hetero atom. * - * @throws Exception + * @throws Exception if anything goes wrong + * @author Jonas Schaub */ @Test public void testMetalsMetalloids() throws Exception { @@ -342,27 +542,47 @@ public void testMetalsMetalloids() throws Exception { System.out.println(tmpSmiGen.create(tmpFG)); } } - - //TODO: Clean-up check constraints and add test molecules for these special cases to the testFind#() methods. - + //TODO: add test molecules for these special cases to the testFind#() methods? After Achim agreed /** - * TODO: test complete ChEBI? + * TODO: what to do with this method, keep the analysis of the subset? * - * Note: ChEBI lite 3-star subset SDF contains 251 molecules with charges or metal/metalloid atoms or more than one - * disconnected structure (comment-in checkConstraints in EFGF.find() method to check). 
+ * ChEBI complete (184933 structures in file (some will automatically be skipped by SDF reader)): + * Number of parsed molecules: 184930 + * Exceptions while preprocessing: 0 + * Molecules that would be filtered due to input restrictions: 29637 + * Exceptions with restrictions (prefiltered): 0 + * Exceptions without restrictions: 0 * - * @throws Exception + * ChEBI lite 3-star subset SDF (1396 structures in file (some will automatically be skipped by SDF reader)): + * Number of parsed molecules: 1396 + * Exceptions while preprocessing: 0 + * Molecules that would be filtered due to input restrictions: 251 + * Exceptions with restrictions (prefiltered): 0 + * Exceptions without restrictions: 0 + * + * + * @throws Exception if anything goes wrong */ @Test public void readChebiLite3StarSubset() throws Exception { - IteratingSDFReader tmpChebiSDFReader = new IteratingSDFReader( - ErtlFunctionalGroupsFinderTest.class.getResourceAsStream("ChEBI_lite_3star_subset.sdf"), - SilentChemObjectBuilder.getInstance(), - false); + IteratingSDFReader tmpChebiSDFReader = null; + try { + tmpChebiSDFReader = new IteratingSDFReader( + ErtlFunctionalGroupsFinderTest.class.getResourceAsStream("ChEBI_lite_3star_subset.sdf"), + SilentChemObjectBuilder.getInstance(), + true); + } catch (Exception e) { + System.out.println("\nSD file could not be found. 
Test is ignored."); + Assumptions.assumeTrue(false); + return; + } Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet()); ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.DEFAULT); int tmpMoleculeCouter = 0; int tmpExceptionsCounter = 0; + int tmpExceptionsWithRestrictionsCounter = 0; + int tmpExceptionsWithoutRestrictionsCounter = 0; + int tmpMoleculesFilteredCounter = 0; while (tmpChebiSDFReader.hasNext()) { IAtomContainer tmpMolecule = null; tmpMoleculeCouter++; @@ -370,8 +590,7 @@ public void readChebiLite3StarSubset() throws Exception { tmpMolecule = tmpChebiSDFReader.next(); AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpMolecule); tmpAromaticity.apply(tmpMolecule); - - List tmpFGList = tmpEFGF.find(tmpMolecule); + tmpMolecule = tmpMolecule.clone(); } catch (Exception anException) { tmpExceptionsCounter++; if (!Objects.isNull(tmpMolecule)) { @@ -379,192 +598,211 @@ public void readChebiLite3StarSubset() throws Exception { } else { System.out.println("Could not parse molecule! Counter: " + tmpMoleculeCouter); } + continue; + } + try { + if (ErtlFunctionalGroupsFinder.isValidInputMoleculeWithRestrictionsTurnedOn(tmpMolecule)) { + List tmpFGList = tmpEFGF.find(tmpMolecule, false, true); + } else { + //TODO: save these structures somewhere for inspection? + tmpMoleculesFilteredCounter++; + } + } catch (Exception anException) { + tmpExceptionsWithRestrictionsCounter++; + if (!Objects.isNull(tmpMolecule)) { + System.out.println(tmpMolecule.getProperty("ChEBI ID") + "," + anException.toString() + "," + tmpMoleculeCouter); + } else { + System.out.println("Could not identify FG in molecule! 
Counter: " + tmpMoleculeCouter); + } + } + try { + List tmpFGList = tmpEFGF.find(tmpMolecule, false, false); + } catch (Exception anException) { + tmpExceptionsWithoutRestrictionsCounter++; + if (!Objects.isNull(tmpMolecule)) { + System.out.println(tmpMolecule.getProperty("ChEBI ID") + "," + anException.toString() + "," + tmpMoleculeCouter); + } else { + System.out.println("Could not identify FG in molecule! Counter: " + tmpMoleculeCouter); + } + continue; } - } - System.out.println(tmpMoleculeCouter); - System.out.println(tmpExceptionsCounter); + System.out.println("Number of parsed molecules: " + tmpMoleculeCouter); + System.out.println("Exceptions while preprocessing: " + tmpExceptionsCounter); + System.out.println("Molecules that would be filtered due to input restrictions: " + tmpMoleculesFilteredCounter); + System.out.println("Exceptions with restrictions (prefiltered): " + tmpExceptionsWithRestrictionsCounter); + System.out.println("Exceptions without restrictions: " + tmpExceptionsWithoutRestrictionsCounter); } - + // /** - * TODO: more testing necessary + * Applies EFGF to detect functional groups in the given molecule and compares the identified FG to the given + * expected FG, using i.a. an identity search. Note that the order of the given FG must match the order of the detected + * FG. The expected FG can contain pseudo-SMILES code for some specific cases, where aromatic atoms are marked using + * "-ar" and pseudo-atoms (R) can be included. Uses the electron donation model daylight and the cycle finder "all" + * for aromaticity detection in the input molecule. 
+ * + * @param aMoleculeSmiles input molecule to detect FG in + * @param anExpectedFGPseudoSmilesArray expected FG + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch */ - @Test - public void testOnlyMarkedAtoms() throws Exception { - SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); - SmilesGenerator tmpSmiGen = new SmilesGenerator(SmiFlavor.Canonical | SmiFlavor.UseAromaticSymbols); - Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet()); - - IAtomContainer tmpTetraethylOrthosilicate = tmpSmiPar.parseSmiles("CCO[Si](OCC)(OCC)OCC"); - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpTetraethylOrthosilicate); - tmpAromaticity.apply(tmpTetraethylOrthosilicate); - - ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.ONLY_MARKED_ATOMS); - List tmpFGList = tmpEFGF.find(tmpTetraethylOrthosilicate); - - System.out.println("Tetraethyl Orthosilicate:"); - for (IAtomContainer tmpFG : tmpFGList) { - System.out.println(tmpSmiGen.create(tmpFG)); - } - - IAtomContainer tmpCHEMBL1201736 = tmpSmiPar.parseSmiles("CO/N=C(\\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[N+]3(C)CCCC3)CS[C@H]12)c1csc(N)n1.Cl"); - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpCHEMBL1201736); - tmpAromaticity.apply(tmpCHEMBL1201736); - - tmpFGList = tmpEFGF.find(tmpCHEMBL1201736); - - System.out.println("CHEMBL1201736:"); - for (IAtomContainer tmpFG : tmpFGList) { - System.out.println(tmpSmiGen.create(tmpFG)); - } + private void testFind(String aMoleculeSmiles, String[] anExpectedFGPseudoSmilesArray) throws Exception { + this.testFind(aMoleculeSmiles, anExpectedFGPseudoSmilesArray, new Aromaticity(ElectronDonation.daylight(), Cycles.all())); } - - private void testFind(String moleculeSmiles, String[] fGStrings) throws Exception { - testFind(moleculeSmiles, fGStrings, new Aromaticity(ElectronDonation.daylight(), Cycles.all())); - } - - 
private void testFind(String moleculeSmiles, String[] fGStrings, Aromaticity aromaticity) throws Exception { + // + /** + * Applies EFGF to detect functional groups in the given molecule and compares the identified FG to the given + * expected FG, using i.a. an identity search. Note that the order of the given FG must match the order of the detected + * FG. The expected FG can contain pseudo-SMILES code for some specific cases, where aromatic atoms are marked using + * "-ar" and pseudo-atoms (R) can be included. The given aromaticity model is used for preprocessing the input molecule. + * + * @param aMoleculeSmiles input molecule to detect FG in + * @param anExpectedFGPseudoSmilesArray expected FG + * @param anAromaticityModel for aromaticity detection in preprocessing of the input molecule + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + private void testFind(String aMoleculeSmiles, String[] anExpectedFGPseudoSmilesArray, Aromaticity anAromaticityModel) throws Exception { // prepare input - SmilesParser smilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); - IAtomContainer mol = smilesParser.parseSmiles(moleculeSmiles); - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol); - aromaticity.apply(mol); - + SmilesParser tmpSmilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); + IAtomContainer tmpMolecule = tmpSmilesParser.parseSmiles(aMoleculeSmiles); + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpMolecule); + anAromaticityModel.apply(tmpMolecule); // find functional groups - ErtlFunctionalGroupsFinder fgFinder = new ErtlFunctionalGroupsFinder(); - List fGs = fgFinder.find(mol); - + ErtlFunctionalGroupsFinder tmpFGFinder = new ErtlFunctionalGroupsFinder(); + List tmpFunctionalgroupsList = tmpFGFinder.find(tmpMolecule); // get expected groups - List expectedFGs = new LinkedList<>(); - for (String fGString : fGStrings) { - expectedFGs.add(buildFunctionalGroup(fGString)); 
+ List tmpExpectedFGs = new LinkedList<>(); + for (String tmpFGString : anExpectedFGPseudoSmilesArray) { + tmpExpectedFGs.add(this.buildFunctionalGroup(tmpFGString)); } - // compare - this.assertIsomorphism(expectedFGs, fGs); + this.assertIsomorphism(tmpExpectedFGs, tmpFunctionalgroupsList); } - + // /** + * Asserts the isomorphism between two lists of functional group atom containers. Compares their sizes, atom counts, + * bond counts, performs an identity match using the Vento-Foggia algorithm, and checks that aromaticity annotations + * match for the atoms and bonds. * NOTE: actual and expected functional groups must be in the same order! * - * @param expectedFGs list of expected functional groups - * @param actualFGs list of actual functional groups - * @throws Exception if anything does not work as planned + * @param anExpectedFGsList list of expected functional groups + * @param anActualFGsList list of actual functional groups + * @author Sebastian Fritsch */ - private void assertIsomorphism(List expectedFGs, List actualFGs) { - Assertions.assertEquals(expectedFGs.size(), actualFGs.size(), + private void assertIsomorphism(List anExpectedFGsList, List anActualFGsList) { + Assertions.assertEquals(anExpectedFGsList.size(), anActualFGsList.size(), "Number of functional groups does not match the expected number of groups"); - - for(int i = 0; i < expectedFGs.size(); i++) { - IAtomContainer cExp = expectedFGs.get(i); - IAtomContainer cAct = actualFGs.get(i); - - Assertions.assertEquals(cExp.getAtomCount(), cAct.getAtomCount(), + for (int i = 0; i < anExpectedFGsList.size(); i++) { + IAtomContainer tmpExpectedFG = anExpectedFGsList.get(i); + IAtomContainer tmpActualFG = anActualFGsList.get(i); + Assertions.assertEquals(tmpExpectedFG.getAtomCount(), tmpActualFG.getAtomCount(), "Groups #" + i + ": different atom count"); - Assertions.assertEquals(cExp.getBondCount(), cAct.getBondCount(), + Assertions.assertEquals(tmpExpectedFG.getBondCount(), 
tmpActualFG.getBondCount(), "Groups #" + i + ": different bond count"); - - Pattern pattern = VentoFoggia.findIdentical(cExp); - - Assertions.assertTrue(pattern.matches(cAct), "Groups #" + i + ": not isomorph"); - - Mappings mappings = pattern.matchAll(cAct); - - Map atomMap = mappings.toAtomMap().iterator().next(); - for (Map.Entry e : atomMap.entrySet()) { - IAtom atomExp = e.getKey(); - IAtom atomAct = e.getValue(); - Assertions.assertEquals(atomExp.isAromatic(), atomAct.isAromatic(), - "Groups #" + i + ": Atom aromaticity does not match" - + atomAct.getSymbol() + atomAct.isAromatic() + atomExp.getSymbol() - + atomExp.isAromatic()); + Pattern tmpExpectedFGPattern = VentoFoggia.findIdentical(tmpExpectedFG); + Assertions.assertTrue(tmpExpectedFGPattern.matches(tmpActualFG), "Groups #" + i + ": not isomorphic"); + Mappings tmpExpFGinActFGmappings = tmpExpectedFGPattern.matchAll(tmpActualFG); + Map tmpAtomMap = tmpExpFGinActFGmappings.toAtomMap().iterator().next(); + for (Map.Entry tmpMapEntry : tmpAtomMap.entrySet()) { + IAtom tmpExpectedAtom = tmpMapEntry.getKey(); + IAtom tmpActualAtom = tmpMapEntry.getValue(); + Assertions.assertEquals(tmpExpectedAtom.isAromatic(), tmpActualAtom.isAromatic(), + "Groups #" + i + ": Atom aromaticity does not match (" + + tmpActualAtom.getSymbol() + tmpActualAtom.isAromatic() + + ":" + + tmpExpectedAtom.getSymbol() + tmpExpectedAtom.isAromatic() + + ")"); } - - Map bondMap = mappings.toBondMap().iterator().next(); - for (Map.Entry e : bondMap.entrySet()) { - IBond bondExp = e.getKey(); - IBond bondAct = e.getValue(); - Assertions.assertEquals(bondExp.isAromatic(), bondAct.isAromatic(), + Map tmpBondMap = tmpExpFGinActFGmappings.toBondMap().iterator().next(); + for (Map.Entry tmpMapEntry : tmpBondMap.entrySet()) { + IBond tmpExpectedBond = tmpMapEntry.getKey(); + IBond tmpActualBond = tmpMapEntry.getValue(); + Assertions.assertEquals(tmpExpectedBond.isAromatic(), tmpActualBond.isAromatic(), "Groups #" + i + ": Bond aromaticity does 
not match"); } } } - - private IAtomContainer buildFunctionalGroup(String string) { + // + /** + * Constructs a functional group atom container object from a given SMILES or pseudo-SMILES code. + * Pseudo-SMILES codes have aromatic atoms marked by "-ar", e.g. "Nar", and contain pseudo-atoms given as "R". + * But the only available cases here are "NarR3", "SarR2", and "OarR2". There is no general treatment of any pseudo-SMILES + * code! If the given string does not match any of the given three templates, it has to be a valid SMILES string! + * + * @param aFunctionalGroupPseudoSmiles SMILES code or specific pseudo-SMILES code + * @return functional group atom container built from the given code + * @author Sebastian Fritsch + */ + private IAtomContainer buildFunctionalGroup(String aFunctionalGroupPseudoSmiles) { IAtom a1, a2, a3, a4, a5, a6, a7, a8, a9; IBond b1, b2, b3, b4, b5, b6, b7, b8, b9; - IChemObjectBuilder builder = SilentChemObjectBuilder.getInstance(); - IAtomContainer container; - - // custom templates - switch(string) { + IChemObjectBuilder tmpBuilder = SilentChemObjectBuilder.getInstance(); + IAtomContainer tmpFunctionalGroup; + // custom templates: + switch (aFunctionalGroupPseudoSmiles) { case "NarR3": - a1 = builder.newInstance(IPseudoAtom.class, "R"); - a2 = builder.newInstance(IPseudoAtom.class, "R"); - a3 = builder.newInstance(IPseudoAtom.class, "R"); - a4 = builder.newInstance(IAtom.class, "N"); + a1 = tmpBuilder.newInstance(IPseudoAtom.class, "R"); + a2 = tmpBuilder.newInstance(IPseudoAtom.class, "R"); + a3 = tmpBuilder.newInstance(IPseudoAtom.class, "R"); + a4 = tmpBuilder.newInstance(IAtom.class, "N"); a4.setIsAromatic(true); - b1 = builder.newInstance(IBond.class, a1, a4, Order.SINGLE); - b2 = builder.newInstance(IBond.class, a2, a4, Order.SINGLE); - b3 = builder.newInstance(IBond.class, a3, a4, Order.SINGLE); - - container = new AtomContainer(); - container.setAtoms(new IAtom[] {a1, a2, a3, a4}); - container.setBonds(new IBond[] {b1, b2, 
b3}); - return container; + b1 = tmpBuilder.newInstance(IBond.class, a1, a4, Order.SINGLE); + b2 = tmpBuilder.newInstance(IBond.class, a2, a4, Order.SINGLE); + b3 = tmpBuilder.newInstance(IBond.class, a3, a4, Order.SINGLE); + tmpFunctionalGroup = new AtomContainer(); + tmpFunctionalGroup.setAtoms(new IAtom[] {a1, a2, a3, a4}); + tmpFunctionalGroup.setBonds(new IBond[] {b1, b2, b3}); + return tmpFunctionalGroup; case "SarR2": - a1 = builder.newInstance(IPseudoAtom.class, "R"); - a2 = builder.newInstance(IPseudoAtom.class, "R"); - a3 = builder.newInstance(IAtom.class, "S"); + a1 = tmpBuilder.newInstance(IPseudoAtom.class, "R"); + a2 = tmpBuilder.newInstance(IPseudoAtom.class, "R"); + a3 = tmpBuilder.newInstance(IAtom.class, "S"); a3.setIsAromatic(true); - b1 = builder.newInstance(IBond.class, a1, a3, Order.SINGLE); - b2 = builder.newInstance(IBond.class, a2, a3, Order.SINGLE); - - container = new AtomContainer(); - container.setAtoms(new IAtom[] {a1, a2, a3}); - container.setBonds(new IBond[] {b1, b2}); - return container; + b1 = tmpBuilder.newInstance(IBond.class, a1, a3, Order.SINGLE); + b2 = tmpBuilder.newInstance(IBond.class, a2, a3, Order.SINGLE); + tmpFunctionalGroup = new AtomContainer(); + tmpFunctionalGroup.setAtoms(new IAtom[] {a1, a2, a3}); + tmpFunctionalGroup.setBonds(new IBond[] {b1, b2}); + return tmpFunctionalGroup; case "OarR2": - a1 = builder.newInstance(IPseudoAtom.class, "R"); - a2 = builder.newInstance(IPseudoAtom.class, "R"); - a3 = builder.newInstance(IAtom.class, "O"); + a1 = tmpBuilder.newInstance(IPseudoAtom.class, "R"); + a2 = tmpBuilder.newInstance(IPseudoAtom.class, "R"); + a3 = tmpBuilder.newInstance(IAtom.class, "O"); a3.setIsAromatic(true); - b1 = builder.newInstance(IBond.class, a1, a3, Order.SINGLE); - b2 = builder.newInstance(IBond.class, a2, a3, Order.SINGLE); - - container = new AtomContainer(); - container.setAtoms(new IAtom[] {a1, a2, a3}); - container.setBonds(new IBond[] {b1, b2}); - return container; + b1 = 
tmpBuilder.newInstance(IBond.class, a1, a3, Order.SINGLE); + b2 = tmpBuilder.newInstance(IBond.class, a2, a3, Order.SINGLE); - // smiles + tmpFunctionalGroup = new AtomContainer(); + tmpFunctionalGroup.setAtoms(new IAtom[] {a1, a2, a3}); + tmpFunctionalGroup.setBonds(new IBond[] {b1, b2}); + return tmpFunctionalGroup; default: + // treat as normal SMILES code try { - SmilesParser smilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); + SmilesParser tmpSmilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); try { - if(string.equals("[c]=O")) - smilesParser.kekulise(false); - container = smilesParser.parseSmiles(string); - } - catch(InvalidSmilesException e) { - smilesParser.kekulise(false); - container = smilesParser.parseSmiles(string); + if (aFunctionalGroupPseudoSmiles.equals("[c]=O")) { + tmpSmilesParser.kekulise(false); + } + tmpFunctionalGroup = tmpSmilesParser.parseSmiles(aFunctionalGroupPseudoSmiles); + } catch(InvalidSmilesException e) { + tmpSmilesParser.kekulise(false); + tmpFunctionalGroup = tmpSmilesParser.parseSmiles(aFunctionalGroupPseudoSmiles); } - - for(IAtom a : container.atoms()) { - if(a instanceof PseudoAtom) { + for(IAtom a : tmpFunctionalGroup.atoms()) { + if (a instanceof PseudoAtom) { a.setSymbol("R"); } } - return container; - } - catch(InvalidSmilesException e) { - throw new IllegalArgumentException("Input string '" + string + " could not be found as a template " + + return tmpFunctionalGroup; + } catch(InvalidSmilesException e) { + throw new IllegalArgumentException("Input string '" + aFunctionalGroupPseudoSmiles + " could not be found as a template " + "and is not a valid SMILES string."); } } From 53704f8ac004c3ef7d9e1ebca101d90f13b853b0 Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Wed, 17 Jan 2024 10:20:49 +0100 Subject: [PATCH 17/27] Fixed failing javadoc build --- .../cdk/tools/ErtlFunctionalGroupsFinder.java | 12 ++++++------ 1 file changed, 6 
insertions(+), 6 deletions(-) diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index 35d7224..d361cbb 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -53,8 +53,8 @@ * ([Ertl P. An algorithm to identify functional groups in organic molecules. J Cheminform. 2017; 9:36.]) * and has been described in a scientific publication * ([Fritsch, S., Neumann, S., Schaub, J. et al. ErtlFunctionalGroupsFinder: automated rule-based functional group detection with the Chemistry Development Kit (CDK). J Cheminform. 2019; 11:37.]). - *

- *

In brief, the algorithm iterates through all atoms in the input molecule and marks hetero atoms and specific carbon atoms + *
+ *
In brief, the algorithm iterates through all atoms in the input molecule and marks hetero atoms and specific carbon atoms * (i.a. those in non-aromatic double or triple bonds etc.) as being part of a functional group. Connected groups of marked * atoms are extracted as separate functional groups, together with their unmarked, "environmental" carbon atoms. These * environments can be important, e.g. to differentiate an alcohol from a phenol, but are less important in other cases. @@ -62,8 +62,8 @@ * in a way that accounts for their varying significance in different cases. Most environmental atoms are exchanged with * pseudo ("R") atoms there. All these functionalities are available in ErtlFunctionalgroupsFinder. Additionally, only * the marked atoms completely without their environments can be extracted. - *

- *

To apply functional group detection to an input molecule, its atom types need to be set and aromaticity needs + *
+ *
To apply functional group detection to an input molecule, its atom types need to be set and aromaticity needs * to be detected beforehand: *
  * //Prepare input
@@ -85,8 +85,8 @@
  * are not expected to cause issues (exceptions) when processed by this class, but they are not explicitly regarded by
  * the Ertl algorithm and hence this implementation, too. They might therefore cause unexpected behaviour in functional
  * group identification. For example, a charge is not listed as a reason to mark a carbon atom.
- * 

- *

Note: this implementation is not thread-safe. Each parallel thread should have its own instance of this class. + *
+ *
Note: this implementation is not thread-safe. Each parallel thread should have its own instance of this class. * * @author Sebastian Fritsch, Jonas Schaub * @version 1.3 From ab4d3c4d43b57906c0f641c620544f4d3326c710 Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Wed, 17 Jan 2024 11:11:36 +0100 Subject: [PATCH 18/27] Pseudo atoms are not identified as hetero atoms anymore; --- .../cdk/tools/ErtlFunctionalGroupsFinder.java | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index d361cbb..1fa78d5 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -20,6 +20,7 @@ package org.openscience.cdk.tools; +import org.openscience.cdk.PseudoAtom; import org.openscience.cdk.aromaticity.Aromaticity; import org.openscience.cdk.graph.ConnectedComponents; import org.openscience.cdk.graph.ConnectivityChecker; @@ -755,7 +756,7 @@ private void markAtoms(IAtomContainer aMolecule) { continue; } // if none of the conditions 2.X apply, we have an unmarked C (not relevant here) - } else if (tmpAtomicNr == 1){ + } else if (tmpAtomicNr == 1) { // if H... // convert to implicit H IAtom tmpConnectedAtom; @@ -772,7 +773,7 @@ private void markAtoms(IAtomContainer aMolecule) { tmpConnectedAtom.setImplicitHydrogenCount(tmpConnectedAtom.getImplicitHydrogenCount() + 1); } continue; - } else { + } else if (this.isHeteroatom(tmpAtom)) { // if heteroatom... 
(CONDITION 1) this.markedAtomsCache.add(idx); if (ErtlFunctionalGroupsFinder.isDbg()) { @@ -782,6 +783,9 @@ private void markAtoms(IAtomContainer aMolecule) { tmpAtom.getSymbol())); } continue; + } else { + //pseudo (R) atom, ignored + continue; } } //end of for loop that iterates over all atoms in the mol if (ErtlFunctionalGroupsFinder.isDbg()) { @@ -1173,26 +1177,29 @@ private void expandEnvironmentGeneralized(IAtom aFunctionalGroupAtom, IAtomConta } // /** - * Checks whether the given atom is a hetero-atom (i.e. non-carbon and non-hydrogen, judged by atomic number). + * Checks whether the given atom is a hetero-atom (i.e. non-carbon and non-hydrogen). Pseudo (R) atoms will also return false! * * @param anAtom the atom to test - * @return true if the given atom is neither a carbon nor a hydrogen atom + * @return true if the given atom is neither a carbon nor a hydrogen or pseudo atom */ private boolean isHeteroatom(IAtom anAtom) { - int tmpAtomicNr = anAtom.getAtomicNumber(); - return tmpAtomicNr != 1 && tmpAtomicNr != 6; + Integer tmpAtomicNr = anAtom.getAtomicNumber(); + return tmpAtomicNr != 1 && tmpAtomicNr != 6 && tmpAtomicNr != 0 && tmpAtomicNr != null && !(anAtom instanceof PseudoAtom) && !(anAtom.getSymbol().contains("R")); } // /** * Checks whether the given atom is from an element in the organic subset, i.e. not a metal or metalloid atom. - * See the public constant set of non-metal atomic numbers declared in this class. Given as static here because it is - * used by static public utility methods + * See the public constant set of non-metal atomic numbers declared in this class. Pseudo (R) atoms will also return false. + * Given as static method here because it is used by static public utility methods (developer's note). 
* * @param anAtom atom to check * @return true if the given atom is organic and not a metal or metalloid atom */ private static boolean isNonmetal(IAtom anAtom) { Integer tmpAtomicNumber = anAtom.getAtomicNumber(); + if (Objects.isNull(tmpAtomicNumber)) { + return false; + } int tmpAtomicNumberInt = tmpAtomicNumber.intValue(); return ErtlFunctionalGroupsFinder.NONMETAL_ATOMIC_NUMBERS.contains(tmpAtomicNumberInt); } From 5aa839a57d5ac1de7e034761dd0f4f4e9d7582d8 Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Wed, 17 Jan 2024 16:45:23 +0100 Subject: [PATCH 19/27] Moved ChEBI analysis to evaluation test; --- .gitignore | 3 +- .../cdk/tools/ErtlFunctionalGroupsFinder.java | 2 +- ...lFunctionalGroupsFinderEvaluationTest.java | 107 ++++++++++++++++++ .../tools/ErtlFunctionalGroupsFinderTest.java | 97 +--------------- 4 files changed, 112 insertions(+), 97 deletions(-) diff --git a/.gitignore b/.gitignore index b8d2a55..24fcec7 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,5 @@ build/ # EFGF evaluation test output ErtlFunctionalGroupsFinderEvaluationTest_Output/ -ChEBI_complete.sdf \ No newline at end of file +ChEBI_complete.sdf +/Output/ \ No newline at end of file diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index 1fa78d5..781d27a 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -1184,7 +1184,7 @@ private void expandEnvironmentGeneralized(IAtom aFunctionalGroupAtom, IAtomConta */ private boolean isHeteroatom(IAtom anAtom) { Integer tmpAtomicNr = anAtom.getAtomicNumber(); - return tmpAtomicNr != 1 && tmpAtomicNr != 6 && tmpAtomicNr != 0 && tmpAtomicNr != null && !(anAtom instanceof PseudoAtom) && !(anAtom.getSymbol().contains("R")); + return tmpAtomicNr != 1 && tmpAtomicNr != 6 && 
tmpAtomicNr != 0 && tmpAtomicNr != null && !(anAtom instanceof PseudoAtom); } // /** diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java index dce85ec..0b63081 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java @@ -28,6 +28,7 @@ import org.openscience.cdk.aromaticity.Aromaticity; import org.openscience.cdk.aromaticity.ElectronDonation; import org.openscience.cdk.atomtype.CDKAtomTypeMatcher; +import org.openscience.cdk.depict.DepictionGenerator; import org.openscience.cdk.exception.CDKException; import org.openscience.cdk.graph.ConnectivityChecker; import org.openscience.cdk.graph.CycleFinder; @@ -65,6 +66,7 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.List; +import java.util.Objects; import java.util.Set; @@ -618,6 +620,111 @@ public void testPerformance() throws Exception { long tmpEndTime = System.currentTimeMillis(); System.out.println("\nExtraction of functional groups from these molecules took " + (tmpEndTime - tmpStartTime) + " ms.\n"); } + // + /** + * TODO: what to do with this method, keep the analysis of the subset? 
+ * + * ChEBI complete (184933 structures in file (some will automatically be skipped by SDF reader)): + * Number of parsed molecules: 184930 + * Exceptions while preprocessing: 0 + * Molecules that would be filtered due to input restrictions: 29637 + * Exceptions with restrictions (prefiltered): 0 + * Exceptions without restrictions: 0 + * + * ChEBI lite 3-star subset SDF (1396 structures in file (some will automatically be skipped by SDF reader)): + * Number of parsed molecules: 1396 + * Exceptions while preprocessing: 0 + * Molecules that would be filtered due to input restrictions: 251 + * Exceptions with restrictions (prefiltered): 0 + * Exceptions without restrictions: 0 + * + * + * @throws Exception if anything goes wrong + */ + @Test + public void readChebiLite3StarSubset() throws Exception { + IteratingSDFReader tmpChebiSDFReader = null; + try { + tmpChebiSDFReader = new IteratingSDFReader( + ErtlFunctionalGroupsFinderTest.class.getResourceAsStream("ChEBI_lite_3star_subset.sdf"), + SilentChemObjectBuilder.getInstance(), + true); + } catch (Exception e) { + System.out.println("\nSD file could not be found. 
Test is ignored."); + Assumptions.assumeTrue(false); + return; + } + Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet()); + ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.DEFAULT); + int tmpMoleculeCouter = 0; + int tmpExceptionsCounter = 0; + int tmpExceptionsWithRestrictionsCounter = 0; + int tmpExceptionsWithoutRestrictionsCounter = 0; + int tmpMoleculesFilteredCounter = 0; + while (tmpChebiSDFReader.hasNext()) { + IAtomContainer tmpMolecule = null; + tmpMoleculeCouter++; + try { + tmpMolecule = tmpChebiSDFReader.next(); + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpMolecule); + tmpAromaticity.apply(tmpMolecule); + tmpMolecule = tmpMolecule.clone(); + } catch (Exception anException) { + tmpExceptionsCounter++; + if (!Objects.isNull(tmpMolecule)) { + System.out.println(tmpMolecule.getProperty("ChEBI ID") + "," + anException.toString() + "," + tmpMoleculeCouter); + } else { + System.out.println("Could not parse molecule! Counter: " + tmpMoleculeCouter); + } + continue; + } + try { + if (ErtlFunctionalGroupsFinder.isValidInputMoleculeWithRestrictionsTurnedOn(tmpMolecule)) { + List tmpFGList = tmpEFGF.find(tmpMolecule, false, true); + } else { + //TODO: save these structures somewhere for inspection? 
+ tmpMoleculesFilteredCounter++; + try { + DepictionGenerator tmpDepictGen = new DepictionGenerator().withSize(712, 712).withFillToFit().withMargin(10); + String tmpSourceFolder = new File("").getAbsolutePath(); + tmpDepictGen.depict(tmpMolecule).writeTo(tmpSourceFolder + File.separator + "Output" + File.separator + tmpMolecule.getProperty("ChEBI ID").toString().replace(':', '_') + ".png"); + List tmpFGList = tmpEFGF.find(tmpMolecule, false, false); + int i = 0; + for (IAtomContainer tmpFG : tmpFGList) { + tmpDepictGen.depict(tmpFG).writeTo(tmpSourceFolder + File.separator + "Output" + File.separator + tmpMolecule.getProperty("ChEBI ID").toString().replace(':', '_') + "_" + i + ".png"); + i++; + } + } catch (Exception anException) { + anException.printStackTrace(); + break; + } + } + } catch (Exception anException) { + tmpExceptionsWithRestrictionsCounter++; + if (!Objects.isNull(tmpMolecule)) { + System.out.println(tmpMolecule.getProperty("ChEBI ID") + "," + anException.toString() + "," + tmpMoleculeCouter); + } else { + System.out.println("Could not identify FG in molecule! Counter: " + tmpMoleculeCouter); + } + } + try { + List tmpFGList = tmpEFGF.find(tmpMolecule, false, false); + } catch (Exception anException) { + tmpExceptionsWithoutRestrictionsCounter++; + if (!Objects.isNull(tmpMolecule)) { + System.out.println(tmpMolecule.getProperty("ChEBI ID") + "," + anException.toString() + "," + tmpMoleculeCouter); + } else { + System.out.println("Could not identify FG in molecule! 
Counter: " + tmpMoleculeCouter); + } + continue; + } + } + System.out.println("Number of parsed molecules: " + tmpMoleculeCouter); + System.out.println("Exceptions while preprocessing: " + tmpExceptionsCounter); + System.out.println("Molecules that would be filtered due to input restrictions: " + tmpMoleculesFilteredCounter); + System.out.println("Exceptions with restrictions (prefiltered): " + tmpExceptionsWithRestrictionsCounter); + System.out.println("Exceptions without restrictions: " + tmpExceptionsWithoutRestrictionsCounter); + } // // diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java index f876930..df01af2 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java @@ -21,7 +21,6 @@ package org.openscience.cdk.tools; import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Assumptions; import org.junit.jupiter.api.Test; import org.openscience.cdk.AtomContainer; import org.openscience.cdk.PseudoAtom; @@ -35,7 +34,6 @@ import org.openscience.cdk.interfaces.IBond.Order; import org.openscience.cdk.interfaces.IChemObjectBuilder; import org.openscience.cdk.interfaces.IPseudoAtom; -import org.openscience.cdk.io.iterator.IteratingSDFReader; import org.openscience.cdk.isomorphism.Mappings; import org.openscience.cdk.isomorphism.Pattern; import org.openscience.cdk.isomorphism.VentoFoggia; @@ -48,7 +46,6 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; -import java.util.Objects; /** * Test for ErtlFunctionalGroupsFinder. @@ -503,7 +500,7 @@ public void testDisconnectedMolecules() throws Exception { System.out.println(tmpSmiGen.create(tmpFG)); } } - // + //TODO: test R atoms! /** * Tests functional group identification on example molecules that contain metal or metalloid atoms. 
* @@ -523,7 +520,7 @@ public void testMetalsMetalloids() throws Exception { AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpTetraethylOrthosilicate); tmpAromaticity.apply(tmpTetraethylOrthosilicate); - ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.NO_GENERALIZATION); + ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.DEFAULT); List tmpFGList = tmpEFGF.find(tmpTetraethylOrthosilicate); System.out.println("Tetraethyl Orthosilicate:"); @@ -543,96 +540,6 @@ public void testMetalsMetalloids() throws Exception { } } //TODO: add test molecules for these special cases to the testFind#() methods? After Achim agreed - /** - * TODO: what to do with this method, keep the analysis of the subset? - * - * ChEBI complete (184933 structures in file (some will automatically be skipped by SDF reader)): - * Number of parsed molecules: 184930 - * Exceptions while preprocessing: 0 - * Molecules that would be filtered due to input restrictions: 29637 - * Exceptions with restrictions (prefiltered): 0 - * Exceptions without restrictions: 0 - * - * ChEBI lite 3-star subset SDF (1396 structures in file (some will automatically be skipped by SDF reader)): - * Number of parsed molecules: 1396 - * Exceptions while preprocessing: 0 - * Molecules that would be filtered due to input restrictions: 251 - * Exceptions with restrictions (prefiltered): 0 - * Exceptions without restrictions: 0 - * - * - * @throws Exception if anything goes wrong - */ - @Test - public void readChebiLite3StarSubset() throws Exception { - IteratingSDFReader tmpChebiSDFReader = null; - try { - tmpChebiSDFReader = new IteratingSDFReader( - ErtlFunctionalGroupsFinderTest.class.getResourceAsStream("ChEBI_lite_3star_subset.sdf"), - SilentChemObjectBuilder.getInstance(), - true); - } catch (Exception e) { - System.out.println("\nSD file could not be found. 
Test is ignored."); - Assumptions.assumeTrue(false); - return; - } - Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet()); - ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.DEFAULT); - int tmpMoleculeCouter = 0; - int tmpExceptionsCounter = 0; - int tmpExceptionsWithRestrictionsCounter = 0; - int tmpExceptionsWithoutRestrictionsCounter = 0; - int tmpMoleculesFilteredCounter = 0; - while (tmpChebiSDFReader.hasNext()) { - IAtomContainer tmpMolecule = null; - tmpMoleculeCouter++; - try { - tmpMolecule = tmpChebiSDFReader.next(); - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpMolecule); - tmpAromaticity.apply(tmpMolecule); - tmpMolecule = tmpMolecule.clone(); - } catch (Exception anException) { - tmpExceptionsCounter++; - if (!Objects.isNull(tmpMolecule)) { - System.out.println(tmpMolecule.getProperty("ChEBI ID") + "," + anException.toString() + "," + tmpMoleculeCouter); - } else { - System.out.println("Could not parse molecule! Counter: " + tmpMoleculeCouter); - } - continue; - } - try { - if (ErtlFunctionalGroupsFinder.isValidInputMoleculeWithRestrictionsTurnedOn(tmpMolecule)) { - List tmpFGList = tmpEFGF.find(tmpMolecule, false, true); - } else { - //TODO: save these structures somewhere for inspection? - tmpMoleculesFilteredCounter++; - } - } catch (Exception anException) { - tmpExceptionsWithRestrictionsCounter++; - if (!Objects.isNull(tmpMolecule)) { - System.out.println(tmpMolecule.getProperty("ChEBI ID") + "," + anException.toString() + "," + tmpMoleculeCouter); - } else { - System.out.println("Could not identify FG in molecule! 
Counter: " + tmpMoleculeCouter); - } - } - try { - List tmpFGList = tmpEFGF.find(tmpMolecule, false, false); - } catch (Exception anException) { - tmpExceptionsWithoutRestrictionsCounter++; - if (!Objects.isNull(tmpMolecule)) { - System.out.println(tmpMolecule.getProperty("ChEBI ID") + "," + anException.toString() + "," + tmpMoleculeCouter); - } else { - System.out.println("Could not identify FG in molecule! Counter: " + tmpMoleculeCouter); - } - continue; - } - } - System.out.println("Number of parsed molecules: " + tmpMoleculeCouter); - System.out.println("Exceptions while preprocessing: " + tmpExceptionsCounter); - System.out.println("Molecules that would be filtered due to input restrictions: " + tmpMoleculesFilteredCounter); - System.out.println("Exceptions with restrictions (prefiltered): " + tmpExceptionsWithRestrictionsCounter); - System.out.println("Exceptions without restrictions: " + tmpExceptionsWithoutRestrictionsCounter); - } // /** * Applies EFGF to detect functional groups in the given molecule and compares the identified FG to the given From a9b58d5a3d2b30c59e32bc50c31202157e616bbb Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Mon, 22 Jan 2024 16:53:42 +0100 Subject: [PATCH 20/27] Added molecule from test find 1 to test for extracting only marked atoms; --- .../cdk/tools/ErtlFunctionalGroupsFinderTest.java | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java index df01af2..8f85294 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java @@ -384,6 +384,17 @@ public void testOnlyMarkedAtoms() throws Exception { for (IAtomContainer tmpFG : tmpFGList) { System.out.println(tmpSmiGen.create(tmpFG)); } + + IAtomContainer 
tmpTestFind1 = tmpSmiPar.parseSmiles("Cc1cc(C)nc(NS(=O)(=O)c2ccc(N)cc2)n1"); + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpTestFind1); + tmpAromaticity.apply(tmpTestFind1); + + tmpFGList = tmpEFGF.find(tmpTestFind1); + + System.out.println("Test Find 1:"); + for (IAtomContainer tmpFG : tmpFGList) { + System.out.println(tmpSmiGen.create(tmpFG)); + } } // /** From 82531db05c76bbb234f96868c474d486067e330f Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Wed, 24 Jan 2024 15:49:14 +0100 Subject: [PATCH 21/27] Test for charge neutralization; --- .../ErtlFunctionalGroupsFinderUtilityTest.java | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java index 2eb0a3b..de18491 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java @@ -240,4 +240,19 @@ public void testOnMolecule() throws Exception { System.out.println(ErtlFunctionalGroupsFinderUtility.createPseudoSmilesCode(tmpFG)); } } + + /** + * Test charge neutralization. 
+ */ + @Test + public void testNeutralization() throws Exception { + SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); + IAtomContainer tmpAmmonia = tmpSmiPar.parseSmiles("[NH4+]"); + ErtlFunctionalGroupsFinderUtility.neutralizeCharges(tmpAmmonia); + SmilesGenerator tmpSmiGen = new SmilesGenerator(SmiFlavor.Canonical); + System.out.println(tmpSmiGen.create(tmpAmmonia)); + IAtomContainer tmpNitro = tmpSmiPar.parseSmiles("C[N+](=O)[O-]"); + ErtlFunctionalGroupsFinderUtility.neutralizeCharges(tmpNitro); + System.out.println(tmpSmiGen.create(tmpNitro)); + } } From 77c822e4e461a29c4aa68c94a63628972569aed6 Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Wed, 31 Jan 2024 17:17:23 +0100 Subject: [PATCH 22/27] Started cleaning up tests; --- .../cdk/tools/ErtlFunctionalGroupsFinder.java | 5 +- ...lFunctionalGroupsFinderEvaluationTest.java | 3 + .../tools/ErtlFunctionalGroupsFinderTest.java | 336 +++++++++--------- 3 files changed, 172 insertions(+), 172 deletions(-) diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index 781d27a..3db61a0 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -79,8 +79,9 @@ *
* In order to only identify functional groups in standardised, organic structures, ErtlFunctionalGroupsFinder can * be configured to only accept molecules that do *not* contain any metal, metalloid, or pseudo (R) atoms or formal charges. - * Also structures consisting of more than one unconnected component (e.g. ion and counter-ion) are not accepted if the - * strict input restrictions are turned on. This can be done via a boolean parameter in a variant of the central find() method. + * Also structures consisting of more than one unconnected component (e.g. ion and counter-ion) are not accepted if(!) the + * strict input restrictions are turned on (they are turned off by default). + * This can be done via a boolean parameter in a variant of the central find() method. * To identify molecules that need to be filtered from the input set or preprocessed in this use case, convenience methods are * available in this class. Please note that structural properties like formal charges and the others mentioned above * are not expected to cause issues (exceptions) when processed by this class, but they are not explicitly regarded by diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java index 0b63081..fa42a25 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java @@ -81,6 +81,9 @@ *

* Note for addition of new tests: Only one SD file should be analyzed per test method (since some mechanisms work under * that assumption). + *

+ * Note that this code was written before the class ErtlFunctionalGroupsFinderUtility was implemented to make this type + * of analyses more straightforward using its utility method. This test class here therefore does not use the EFGFUtility class. * * @author Jonas Schaub * @version 1.2 diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java index 8f85294..db358c7 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java @@ -354,164 +354,142 @@ public void testFind20() throws Exception { } // /** - * TODO: more testing necessary + * Tests correct functional group identification on an example molecule. Specifically, the extraction of only the marked atoms + * in a functional group is tested. This feature was added in a later version. + * + * @throws Exception if anything goes wrong + * @author Jonas Schaub */ @Test - public void testOnlyMarkedAtoms() throws Exception { - SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); - SmilesGenerator tmpSmiGen = new SmilesGenerator(SmiFlavor.Canonical | SmiFlavor.UseAromaticSymbols); - Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet()); - - IAtomContainer tmpTetraethylOrthosilicate = tmpSmiPar.parseSmiles("CCO[Si](OCC)(OCC)OCC"); - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpTetraethylOrthosilicate); - tmpAromaticity.apply(tmpTetraethylOrthosilicate); - - ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.ONLY_MARKED_ATOMS); - List tmpFGList = tmpEFGF.find(tmpTetraethylOrthosilicate); - - System.out.println("Tetraethyl Orthosilicate:"); - for (IAtomContainer tmpFG : tmpFGList) { - System.out.println(tmpSmiGen.create(tmpFG)); - } - - IAtomContainer tmpCHEMBL1201736 = 
tmpSmiPar.parseSmiles("CO/N=C(\\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[N+]3(C)CCCC3)CS[C@H]12)c1csc(N)n1.Cl"); - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpCHEMBL1201736); - tmpAromaticity.apply(tmpCHEMBL1201736); - - tmpFGList = tmpEFGF.find(tmpCHEMBL1201736); - - System.out.println("CHEMBL1201736:"); - for (IAtomContainer tmpFG : tmpFGList) { - System.out.println(tmpSmiGen.create(tmpFG)); - } - - IAtomContainer tmpTestFind1 = tmpSmiPar.parseSmiles("Cc1cc(C)nc(NS(=O)(=O)c2ccc(N)cc2)n1"); - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpTestFind1); - tmpAromaticity.apply(tmpTestFind1); - - tmpFGList = tmpEFGF.find(tmpTestFind1); - - System.out.println("Test Find 1:"); - for (IAtomContainer tmpFG : tmpFGList) { - System.out.println(tmpSmiGen.create(tmpFG)); - } + public void testOnlyMarkedAtoms1() throws Exception { + String tmpMoleculeSmiles = "CCO[Si](OCC)(OCC)OCC"; //Tetraethyl Orthosilicate + String[] tmpExpectedFGs = new String[]{"[O][Si]([O])([O])[O]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs, new Aromaticity(ElectronDonation.daylight(), Cycles.all()), ErtlFunctionalGroupsFinder.Mode.ONLY_MARKED_ATOMS); } // /** - * Tests functional group identification on example molecules that have formal charges. + * Tests correct functional group identification on an example molecule. Specifically, the extraction of only the marked atoms + * in a functional group is tested. This feature was added in a later version. 
* * @throws Exception if anything goes wrong * @author Jonas Schaub */ @Test - public void testChargedMolecules() throws Exception { - SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); - SmilesGenerator tmpSmiGen = new SmilesGenerator(SmiFlavor.Canonical | SmiFlavor.UseAromaticSymbols); - Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet()); - - IAtomContainer tmpChargedASA = tmpSmiPar.parseSmiles("CC(=O)OC1=CC=CC=C1C(=O)[O+]"); - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpChargedASA); - tmpAromaticity.apply(tmpChargedASA); - - ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.DEFAULT); - List tmpFGList = tmpEFGF.find(tmpChargedASA); - - System.out.println("Charged ASA:"); - for (IAtomContainer tmpFG : tmpFGList) { - System.out.println(tmpSmiGen.create(tmpFG)); - } - - IAtomContainer tmpNitroPhenol = tmpSmiPar.parseSmiles("C1=CC(=CC=C1[N+](=O)[O-])O"); - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpNitroPhenol); - tmpAromaticity.apply(tmpNitroPhenol); - - tmpFGList = tmpEFGF.find(tmpNitroPhenol); - - System.out.println("Nitrophenol:"); - for (IAtomContainer tmpFG : tmpFGList) { - System.out.println(tmpSmiGen.create(tmpFG)); - } - - IAtomContainer tmpTetraMethylAmmonium = tmpSmiPar.parseSmiles("C[N+](C)(C)C"); - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpTetraMethylAmmonium); - tmpAromaticity.apply(tmpTetraMethylAmmonium); - - tmpFGList = tmpEFGF.find(tmpTetraMethylAmmonium); - - System.out.println("Tetramethylammonium:"); - for (IAtomContainer tmpFG : tmpFGList) { - System.out.println(tmpSmiGen.create(tmpFG)); - } - - IAtomContainer tmpCarbeniumIonInBetaPositionToBr = tmpSmiPar.parseSmiles("c1ccccc1[CH+]C(Br)C"); - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpCarbeniumIonInBetaPositionToBr); - tmpAromaticity.apply(tmpCarbeniumIonInBetaPositionToBr); - - 
tmpEFGF.setEnvMode(ErtlFunctionalGroupsFinder.Mode.NO_GENERALIZATION); - tmpFGList = tmpEFGF.find(tmpCarbeniumIonInBetaPositionToBr); - - //Result: carbenium ion is ignored since a charge is not a reason to mark carbon atom - System.out.println("Carbenium ion in beta position to Br:"); - for (IAtomContainer tmpFG : tmpFGList) { - System.out.println(tmpSmiGen.create(tmpFG)); - } - - IAtomContainer tmpCarbeniumIonInAlphaPositionToBr = tmpSmiPar.parseSmiles("c1ccccc1[C+](Br)C"); - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpCarbeniumIonInAlphaPositionToBr); - tmpAromaticity.apply(tmpCarbeniumIonInAlphaPositionToBr); - - tmpFGList = tmpEFGF.find(tmpCarbeniumIonInAlphaPositionToBr, false); - - //Result: carbenium ion is extracted as environmental carbon and replaced by a new atom instance as all env carbon atoms in EFGF; so it lost its charge! - System.out.println("Carbenium ion in alpha position to Br:"); - for (IAtomContainer tmpFG : tmpFGList) { - System.out.println(tmpSmiGen.create(tmpFG)); - } - - //restore carbenium ion using the EFGFUtility: - ErtlFunctionalGroupsFinderUtility.restoreOriginalEnvironmentalCarbons(tmpFGList, tmpCarbeniumIonInAlphaPositionToBr, false, false, SilentChemObjectBuilder.getInstance()); - System.out.println("Environmental carbon atoms restored on carbenium in alpha position to Br:"); - for (IAtomContainer tmpFG : tmpFGList) { - System.out.println(tmpSmiGen.create(tmpFG)); - } + public void testOnlyMarkedAtoms2() throws Exception { + String tmpMoleculeSmiles = "Cc1cc(C)nc(NS(=O)(=O)c2ccc(N)cc2)n1"; //same mol as testFind1() from the Ertl figure + String[] tmpExpectedFGs = new String[] {"O=[S](=O)[NH]", "[NH2]", "Nar" , "Nar"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs, new Aromaticity(ElectronDonation.daylight(), Cycles.all()), ErtlFunctionalGroupsFinder.Mode.ONLY_MARKED_ATOMS); } // /** - * Tests functional group identification on example molecules that consist of more than one disconnected structure. 
+ * Tests correct functional group identification on an example molecule. Specifically, the extraction of only the marked atoms + * in a functional group is tested. This feature was added in a later version. * * @throws Exception if anything goes wrong * @author Jonas Schaub */ @Test - public void testDisconnectedMolecules() throws Exception { - SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); - SmilesGenerator tmpSmiGen = new SmilesGenerator(SmiFlavor.Canonical | SmiFlavor.UseAromaticSymbols); - Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet()); - - IAtomContainer tmpChlorhexidineDiacetate = tmpSmiPar.parseSmiles("CC(=O)O.CC(=O)O.C1=CC(=CC=C1NC(=NC(=NCCCCCCN=C(N)N=C(N)NC2=CC=C(C=C2)Cl)N)N)Cl"); - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpChlorhexidineDiacetate); - tmpAromaticity.apply(tmpChlorhexidineDiacetate); - - ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.DEFAULT); - List tmpFGList = tmpEFGF.find(tmpChlorhexidineDiacetate); - - System.out.println("Chlorhexidine Diacetate:"); - for (IAtomContainer tmpFG : tmpFGList) { - System.out.println(tmpSmiGen.create(tmpFG)); - } - - IAtomContainer tmpSodiumEdetate = tmpSmiPar.parseSmiles("C(CN(CC(=O)[O-])CC(=O)[O-])N(CC(=O)[O-])CC(=O)[O-].[Na+].[Na+].[Na+].[Na+]"); - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpSodiumEdetate); - tmpAromaticity.apply(tmpSodiumEdetate); - - tmpFGList = tmpEFGF.find(tmpSodiumEdetate); - - System.out.println("Sodium edetate:"); - for (IAtomContainer tmpFG : tmpFGList) { - System.out.println(tmpSmiGen.create(tmpFG)); - } + public void testOnlyMarkedAtoms3() throws Exception { + String tmpMoleculeSmiles = "CO/N=C(\\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[N+]3(C)CCCC3)CS[C@H]12)c1csc(N)n1.Cl"; //CHEMBL1201736 + String[] tmpExpectedFGs = new String[] {"[O]N=[C]C(=O)[NH]", "[C]=C(C(=O)[O-])N([C]=O)[CH][S]", "[N+]", "[NH2]", "Cl", 
"Sar", "Nar"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs, new Aromaticity(ElectronDonation.daylight(), Cycles.all()), ErtlFunctionalGroupsFinder.Mode.ONLY_MARKED_ATOMS); + } + // + /** + * Tests correct functional group identification on an example molecule with formal charges. + * This was not allowed in a previous version. + * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testChargedMolecules1() throws Exception { + String tmpMoleculeSmiles = "CC(=O)OC1=CC=CC=C1C(=O)[O+]"; //charged ASA + String[] tmpExpectedFGs = new String[] {"*OC(*)=O", "*C(=O)[O+]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule with formal charges. + * This was not allowed in a previous version. + * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testChargedMolecules2() throws Exception { + String tmpMoleculeSmiles = "C1=CC(=CC=C1[N+](=O)[O-])O"; //Nitrophenol + String[] tmpExpectedFGs = new String[] {"*[N+](=O)[O-]", "[H]O[c]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - //TODO: test R atoms! + // + /** + * Tests correct functional group identification on an example molecule with formal charges. + * This was not allowed in a previous version. + * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testChargedMolecules3() throws Exception { + String tmpMoleculeSmiles = "C[N+](C)(C)C"; //Tetramethylammonium + String[] tmpExpectedFGs = new String[] {"*[N+](*)(*)*"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule with formal charges. + * This was not allowed in a previous version. 
+ * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testChargedMolecules4() throws Exception { + String tmpMoleculeSmiles = "c1ccccc1[CH+]C(Br)C"; //Carbenium ion in beta position to Br + // carbenium ion is ignored since a charge is not a reason to mark carbon atom + String[] tmpExpectedFGs = new String[] {"[C]Br"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs, new Aromaticity(ElectronDonation.daylight(), Cycles.all()), ErtlFunctionalGroupsFinder.Mode.NO_GENERALIZATION); + + tmpMoleculeSmiles = "c1ccccc1[CH+]C(Br)C"; //Carbenium ion in beta position to Br + // carbenium ion is ignored since a charge is not a reason to mark carbon atom + tmpExpectedFGs = new String[] {"[C]Br"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs, new Aromaticity(ElectronDonation.daylight(), Cycles.all()), ErtlFunctionalGroupsFinder.Mode.NO_GENERALIZATION); + + tmpMoleculeSmiles = "c1ccccc1[C+](Br)C"; //Carbenium ion in alpha position to Br + // carbenium ion is extracted as environmental carbon and replaced by a new atom instance as all env carbon atoms in EFGF; so it lost its charge! + tmpExpectedFGs = new String[] {"[C]Br"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs, new Aromaticity(ElectronDonation.daylight(), Cycles.all()), ErtlFunctionalGroupsFinder.Mode.NO_GENERALIZATION); + } + // + /** + * Tests correct functional group identification on an example molecule with a disconnected structure. + * This was not allowed in a previous version. 
+ * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testDisconnectedMolecules1() throws Exception { + String tmpMoleculeSmiles = "CC(=O)O.CC(=O)O.C1=CC(=CC=C1NC(=NC(=NCCCCCCN=C(N)N=C(N)NC2=CC=C(C=C2)Cl)N)N)Cl"; //Chlorhexidine Diacetate + String[] tmpExpectedFGs = new String[] {"*C(=O)O[H]", "*C(=O)O[H]", "*N=C(N=C(N(*)*)N(*)*)N(*)*", "*N=C(N=C(N(*)*)N(*)*)N(*)*", "*Cl", "*Cl"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule with a disconnected structure. + * This was not allowed in a previous version. + * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testDisconnectedMolecules2() throws Exception { + String tmpMoleculeSmiles = "C(CN(CC(=O)[O-])CC(=O)[O-])N(CC(=O)[O-])CC(=O)[O-].[Na+].[Na+].[Na+].[Na+]"; //Sodium edetate + String[] tmpExpectedFGs = new String[] {"*N(*)*", "*C(=O)[O-]", "*C(=O)[O-]", "*N(*)*", "*C(=O)[O-]", "*C(=O)[O-]", "[Na+]", "[Na+]", "[Na+]", "[Na+]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // /** * Tests functional group identification on example molecules that contain metal or metalloid atoms. 
* @@ -522,35 +500,37 @@ public void testDisconnectedMolecules() throws Exception { * @author Jonas Schaub */ @Test - public void testMetalsMetalloids() throws Exception { - SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); - SmilesGenerator tmpSmiGen = new SmilesGenerator(SmiFlavor.Canonical | SmiFlavor.UseAromaticSymbols); - Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet()); - - IAtomContainer tmpTetraethylOrthosilicate = tmpSmiPar.parseSmiles("CCO[Si](OCC)(OCC)OCC"); - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpTetraethylOrthosilicate); - tmpAromaticity.apply(tmpTetraethylOrthosilicate); - - ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.DEFAULT); - List tmpFGList = tmpEFGF.find(tmpTetraethylOrthosilicate); - - System.out.println("Tetraethyl Orthosilicate:"); - for (IAtomContainer tmpFG : tmpFGList) { - System.out.println(tmpSmiGen.create(tmpFG)); - } - - IAtomContainer tmpKaolin = tmpSmiPar.parseSmiles("O.O.O=[Al]O[Si](=O)O[Si](=O)O[Al]=O"); - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpKaolin); - tmpAromaticity.apply(tmpKaolin); - - tmpFGList = tmpEFGF.find(tmpKaolin); - - System.out.println("Kaolin:"); - for (IAtomContainer tmpFG : tmpFGList) { - System.out.println(tmpSmiGen.create(tmpFG)); - } + public void testMetalsMetalloids1() throws Exception { + String tmpMoleculeSmiles = "CCO[Si](OCC)(OCC)OCC"; //Tetraethyl Orthosilicate + String[] tmpExpectedFGs = new String[]{"*O[Si](O*)(O*)O*"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests functional group identification on example molecules that contain metal or metalloid atoms. + * + * Note: all atoms are marked as hetero atoms by EFGF that are not H or C. So, metals and metalloids get treated like + * any other hetero atom. 
+ * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testMetalsMetalloids2() throws Exception { + String tmpMoleculeSmiles = "O.O.O=[Al]O[Si](=O)O[Si](=O)O[Al]=O"; //Kaolin + String[] tmpExpectedFGs = new String[]{"*O*", "*O*", "O=[Al]O[Si](=O)O[Si](=O)O[Al]=O"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * + */ + @Test + public void testRAtoms1() throws Exception { + String tmpMoleculeSmiles = "OCC(CO[*])OC([*])=O"; //CHEBI:598 + String[] tmpExpectedFGs = new String[]{"[H]O[C]", "[C][O]", "*O[C]=O"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } - //TODO: add test molecules for these special cases to the testFind#() methods? After Achim agreed // /** * Applies EFGF to detect functional groups in the given molecule and compares the identified FG to the given @@ -565,7 +545,8 @@ public void testMetalsMetalloids() throws Exception { * @author Sebastian Fritsch */ private void testFind(String aMoleculeSmiles, String[] anExpectedFGPseudoSmilesArray) throws Exception { - this.testFind(aMoleculeSmiles, anExpectedFGPseudoSmilesArray, new Aromaticity(ElectronDonation.daylight(), Cycles.all())); + this.testFind(aMoleculeSmiles, anExpectedFGPseudoSmilesArray, new Aromaticity(ElectronDonation.daylight(), Cycles.all()), + ErtlFunctionalGroupsFinder.Mode.DEFAULT); } // /** @@ -577,17 +558,20 @@ private void testFind(String aMoleculeSmiles, String[] anExpectedFGPseudoSmilesA * @param aMoleculeSmiles input molecule to detect FG in * @param anExpectedFGPseudoSmilesArray expected FG * @param anAromaticityModel for aromaticity detection in preprocessing of the input molecule + * @param aFunctionalGroupEnvironmentMode to configure the EFGF used here * @throws Exception if anything goes wrong * @author Sebastian Fritsch */ - private void testFind(String aMoleculeSmiles, String[] anExpectedFGPseudoSmilesArray, Aromaticity anAromaticityModel) throws Exception { + private void testFind(String 
aMoleculeSmiles, String[] anExpectedFGPseudoSmilesArray, Aromaticity anAromaticityModel, + ErtlFunctionalGroupsFinder.Mode aFunctionalGroupEnvironmentMode) + throws Exception { // prepare input SmilesParser tmpSmilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); IAtomContainer tmpMolecule = tmpSmilesParser.parseSmiles(aMoleculeSmiles); AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpMolecule); anAromaticityModel.apply(tmpMolecule); // find functional groups - ErtlFunctionalGroupsFinder tmpFGFinder = new ErtlFunctionalGroupsFinder(); + ErtlFunctionalGroupsFinder tmpFGFinder = new ErtlFunctionalGroupsFinder(aFunctionalGroupEnvironmentMode); List tmpFunctionalgroupsList = tmpFGFinder.find(tmpMolecule); // get expected groups List tmpExpectedFGs = new LinkedList<>(); @@ -700,6 +684,18 @@ private IAtomContainer buildFunctionalGroup(String aFunctionalGroupPseudoSmiles) tmpFunctionalGroup.setAtoms(new IAtom[] {a1, a2, a3}); tmpFunctionalGroup.setBonds(new IBond[] {b1, b2}); return tmpFunctionalGroup; + case "Nar": + a1 = tmpBuilder.newInstance(IAtom.class, "N"); + a1.setIsAromatic(true); + tmpFunctionalGroup = new AtomContainer(); + tmpFunctionalGroup.setAtoms(new IAtom[] {a1}); + return tmpFunctionalGroup; + case "Sar": + a1 = tmpBuilder.newInstance(IAtom.class, "S"); + a1.setIsAromatic(true); + tmpFunctionalGroup = new AtomContainer(); + tmpFunctionalGroup.setAtoms(new IAtom[] {a1}); + return tmpFunctionalGroup; default: // treat as normal SMILES code try { From f97e24aaa14e91c949442581deb702595b763e8a Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Thu, 1 Feb 2024 14:02:31 +0100 Subject: [PATCH 23/27] Started looking at evaluation test; --- ...lFunctionalGroupsFinderEvaluationTest.java | 170 +++++++++--------- .../tools/ErtlFunctionalGroupsFinderTest.java | 9 +- 2 files changed, 93 insertions(+), 86 deletions(-) diff --git 
a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java index fa42a25..410ddbb 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java @@ -73,17 +73,20 @@ /** * This test class can be used to read an SD file containing chemical structures, to extract their functional groups using * the ErtlFunctionalGroupsFinder with different settings (i.e. electron donation model and cycle finder algorithm), and write - * the functional groups with their associated frequency under the given settings in this SD file to a CSV file. + * the identified functional groups with their associated frequency under the given settings in this SD file to a CSV file. *

- * To run correctly the constant SD_FILE_PATH must be set to where to find the specific file on the local system. + * To run correctly, the constant SD_FILE_TEST_RESOURCE_NAME must be set to the name of the SD file to analyse which must be + * situated in the test resources folder. *

- * All written files will be placed in a new folder in the same directory as the read SD file. + * All written files will be placed in the output folder. *

* Note for addition of new tests: Only one SD file should be analyzed per test method (since some mechanisms work under * that assumption). *

- * Note that this code was written before the class ErtlFunctionalGroupsFinderUtility was implemented to make this type - * of analyses more straightforward using its utility method. This test class here therefore does not use the EFGFUtility class. + * NOTE that this code was written before the class ErtlFunctionalGroupsFinderUtility was implemented to make this type + * of analyses more straightforward using its utility methods. This test class here therefore does not use the EFGFUtility class. + * This test class was also developed and used before EFGF was reworked before version 1.3. It can now only be seen as outdated example code + * on how to analyse larger datasets using EFGF!!! * * @author Jonas Schaub * @version 1.2 @@ -92,11 +95,11 @@ public class ErtlFunctionalGroupsFinderEvaluationTest { // - // + // /** - * Path to SD file that should be analyzed + * Name of SD file in test resources folder that should be analyzed */ - private static final String SD_FILE_PATH = "...\\ChEBI_lite_3star_subset.sdf"; + private static final String SD_FILE_TEST_RESOURCE_NAME = "ChEBI_lite_3star_subset.sdf"; /** * Directory for output files; Will be created as sub-folder in the working directory (the directory of the read SD file) @@ -111,7 +114,7 @@ public class ErtlFunctionalGroupsFinderEvaluationTest { /** * Separator for file name segments (test identifier, file name, time stamp) */ - private static final String FILE_NAME_ADDITION_SEPERATOR = "_"; + private static final String FILE_NAME_ADDITION_SEPARATOR = "_"; // /** @@ -239,7 +242,7 @@ public class ErtlFunctionalGroupsFinderEvaluationTest { /** * Separator for the output file's values */ - private static final String OUTPUT_FILE_SEPERATOR = ","; + private static final String OUTPUT_FILE_SEPARATOR = ","; /** * Placeholder String for every functional group's SMILES code whose real SMILES representation could not be @@ -368,7 +371,7 @@ public class ErtlFunctionalGroupsFinderEvaluationTest { // /** - * Directory for all
produced files; It will be the directory where th SD file that is analyzed was loaded from + * Directory for all produced files; It will be the directory where the SD file that is analyzed was loaded from */ private String outputDirectory; @@ -465,7 +468,7 @@ public class ErtlFunctionalGroupsFinderEvaluationTest { /** * Constructor *

- * Note: it does not initialize any class variables (except 5) because that would be unnecessary when it is called by a + * Note: it does not initialize any class variables because that would be unnecessary when it is called by a * test method inherited from CDKTestCase; these initializations are done by initialize(). */ public ErtlFunctionalGroupsFinderEvaluationTest() { @@ -491,7 +494,7 @@ public ErtlFunctionalGroupsFinderEvaluationTest() { */ @Test public void testElectronDonationDependency() throws Exception { - this.analyzeElectronDonationDependency(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_PATH, + this.analyzeElectronDonationDependency(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME, ErtlFunctionalGroupsFinderEvaluationTest.ELECTRON_DONATION_TEST_IDENTIFIER, true); } @@ -507,7 +510,7 @@ public void testElectronDonationDependency() throws Exception { */ @Test public void testElectronDonationDependencyNoMultiples() throws Exception { - this.analyzeElectronDonationDependency(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_PATH, + this.analyzeElectronDonationDependency(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME, ErtlFunctionalGroupsFinderEvaluationTest.ELECTRON_DONATION_NO_MULTIPLES_TEST_IDENTIFIER, false); } @@ -522,12 +525,12 @@ public void testElectronDonationDependencyNoMultiples() throws Exception { */ @Test public void testCycleFinderDependency() throws Exception { - this.initializeWithFileOperations(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_PATH, + this.initializeWithFileOperations(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME, ErtlFunctionalGroupsFinderEvaluationTest.CYCLE_FINDER_TEST_IDENTIFIER); Assumptions.assumeTrue(this.isTestAbleToRun); - System.out.println("\nLoading file with path: " + ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_PATH); - File tmpSDFile = new File(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_PATH); + System.out.println("\nLoading file with 
path: " + ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME); + File tmpSDFile = new File(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME); int tmpRequiredNumberOfReaders = 6; IteratingSDFReader[] tmpReaders = new IteratingSDFReader[tmpRequiredNumberOfReaders]; try { @@ -564,7 +567,7 @@ public void testCycleFinderDependency() throws Exception { } this.saveData(); System.out.println("\nFinished!"); - System.out.println("\nNumber of occured exceptions: " + this.exceptionsCounter); + System.out.println("\nNumber of occurred exceptions: " + this.exceptionsCounter); } /** @@ -577,7 +580,7 @@ public void testCycleFinderDependency() throws Exception { public void testPerformance() throws Exception { this.initialize(true, "PerformanceTest"); //First, check if the SD file is present and ignore test if it is not - String tmpPathToSDFile = ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_PATH; + String tmpPathToSDFile = ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME; System.out.println("\nLoading file with path: " + tmpPathToSDFile); File tmpSDFile = new File(tmpPathToSDFile); if (!tmpSDFile.canRead()) { @@ -625,27 +628,17 @@ public void testPerformance() throws Exception { } // /** - * TODO: what to do with this method, keep the analysis of the subset? 
- * - * ChEBI complete (184933 structures in file (some will automatically be skipped by SDF reader)): - * Number of parsed molecules: 184930 - * Exceptions while preprocessing: 0 - * Molecules that would be filtered due to input restrictions: 29637 - * Exceptions with restrictions (prefiltered): 0 - * Exceptions without restrictions: 0 - * - * ChEBI lite 3-star subset SDF (1396 structures in file (some will automatically be skipped by SDF reader)): - * Number of parsed molecules: 1396 - * Exceptions while preprocessing: 0 - * Molecules that would be filtered due to input restrictions: 251 - * Exceptions with restrictions (prefiltered): 0 - * Exceptions without restrictions: 0 - * + * Reads the ChEBI lite 3-star subset and determines the functional groups in it to compare how many input molecules + * cause exceptions with vs. without the earlier EFGF input restrictions. + * ChEBI lite 3-star subset SDF: 1396 structures in file (some will automatically be skipped by SDF reader). + * If needed, it can also generate images of those molecules that would be filtered and their functional groups. * * @throws Exception if anything goes wrong */ @Test public void readChebiLite3StarSubset() throws Exception { + // change to true to activate image generation! + boolean tmpDepictFilteredMols = false; IteratingSDFReader tmpChebiSDFReader = null; try { tmpChebiSDFReader = new IteratingSDFReader( @@ -685,17 +678,18 @@ public void readChebiLite3StarSubset() throws Exception { if (ErtlFunctionalGroupsFinder.isValidInputMoleculeWithRestrictionsTurnedOn(tmpMolecule)) { List tmpFGList = tmpEFGF.find(tmpMolecule, false, true); } else { - //TODO: save these structures somewhere for inspection? 
tmpMoleculesFilteredCounter++; try { - DepictionGenerator tmpDepictGen = new DepictionGenerator().withSize(712, 712).withFillToFit().withMargin(10); - String tmpSourceFolder = new File("").getAbsolutePath(); - tmpDepictGen.depict(tmpMolecule).writeTo(tmpSourceFolder + File.separator + "Output" + File.separator + tmpMolecule.getProperty("ChEBI ID").toString().replace(':', '_') + ".png"); - List tmpFGList = tmpEFGF.find(tmpMolecule, false, false); - int i = 0; - for (IAtomContainer tmpFG : tmpFGList) { - tmpDepictGen.depict(tmpFG).writeTo(tmpSourceFolder + File.separator + "Output" + File.separator + tmpMolecule.getProperty("ChEBI ID").toString().replace(':', '_') + "_" + i + ".png"); - i++; + if (tmpDepictFilteredMols) { + DepictionGenerator tmpDepictGen = new DepictionGenerator().withSize(712, 712).withFillToFit().withMargin(10); + String tmpSourceFolder = new File("").getAbsolutePath(); + tmpDepictGen.depict(tmpMolecule).writeTo(tmpSourceFolder + File.separator + "Output" + File.separator + tmpMolecule.getProperty("ChEBI ID").toString().replace(':', '_') + ".png"); + List tmpFGList = tmpEFGF.find(tmpMolecule, false, false); + int i = 0; + for (IAtomContainer tmpFG : tmpFGList) { + tmpDepictGen.depict(tmpFG).writeTo(tmpSourceFolder + File.separator + "Output" + File.separator + tmpMolecule.getProperty("ChEBI ID").toString().replace(':', '_') + "_" + i + ".png"); + i++; + } } } catch (Exception anException) { anException.printStackTrace(); @@ -818,8 +812,8 @@ public void testPreprocessing() throws Exception { SmilesParser tmpSmilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); IAtomContainer tmpMol = tmpSmilesParser.parseSmiles(tmpSmiles); tmpMol = this.applyFiltersAndPreprocessing(tmpMol); - SmilesGenerator tmpGenerator = SmilesGenerator.unique(); - Assertions.assertEquals("OCC", tmpGenerator.create(tmpMol)); + SmilesGenerator tmpGenerator = SmilesGenerator.unique(); + Assertions.assertEquals("OCC", tmpGenerator.create(tmpMol)); } // @@ -830,21 
+824,23 @@ public void testPreprocessing() throws Exception { * Analyzes molecules in an SD file for all four different electron donation models supplied by the cdk: * daylight, cdk, piBonds, cdkAllowingExocyclic and the aromaticity model cdkLegacy. * - * @param anSDFilePath absolute path of the SD file to analyze - * @param aTestIdentifier a folder with this name will be created in the output directory and it will be added to + * @param anSDFileResourceName name of the SD file to analyse which must be situated in the test resources folder + * @param aTestIdentifier a folder with this name will be created in the output directory, and it will be added to * the output and log files' names for association of test and files; may be null or empty * @param anAreMultiplesCounted if false, functional groups that occur multiple times in the same molecule will * only be counted once * @throws java.lang.Exception if initializeWithFileOperations() throws an exception or an unexpected exception occurs */ - private void analyzeElectronDonationDependency(String anSDFilePath, + private void analyzeElectronDonationDependency( + String anSDFileResourceName, String aTestIdentifier, - boolean anAreMultiplesCounted) throws Exception { - this.initializeWithFileOperations(anSDFilePath, aTestIdentifier); + boolean anAreMultiplesCounted) + throws Exception { + this.initializeWithFileOperations(anSDFileResourceName, aTestIdentifier); Assumptions.assumeTrue(this.isTestAbleToRun); - System.out.println("\nLoading file with path: " + anSDFilePath); - File tmpSDFile = new File(anSDFilePath); + System.out.println("\nLoading file with path: " + anSDFileResourceName); + File tmpSDFile = new File(ErtlFunctionalGroupsFinderEvaluationTest.class.getResource(anSDFileResourceName).getPath()); int tmpRequiredNumberOfReaders = 5; IteratingSDFReader[] tmpReaders = new IteratingSDFReader[tmpRequiredNumberOfReaders]; try { @@ -884,7 +880,7 @@ private void analyzeElectronDonationDependency(String anSDFilePath, 
} this.saveData(); System.out.println("\nFinished!"); - System.out.println("\nNumber of occured exceptions: " + this.exceptionsCounter); + System.out.println("\nNumber of occurred exceptions: " + this.exceptionsCounter); } /** @@ -956,25 +952,31 @@ private void initialize(boolean aShouldPrintHeader, String aTestIdentifier) { /** * Initializes all class variables and determines the output directory. * - * @param anSDFilePath absolute path of the SD file to analyze for a quick pre-check if it is present and the test - * is therefore meant to run; may be empty but not null + * @param anSDFileResourceName name of the SD file to analyse which must be situated in the test resources folder * @param aTestIdentifier a folder with this name will be created in the output directory and it will be added to * the output and log files' names for association of test and files; may be null or empty * @throws java.lang.Exception if one the FileWriter instances can not be instantiated, more than * Integer.MAX-VALUE tests are to be run this minute (error in the naming of output files), aPathOfSDFile is null or * an unexpected exception occurs. 
*/ - private void initializeWithFileOperations(String anSDFilePath, String aTestIdentifier) throws Exception { + private void initializeWithFileOperations(String anSDFileResourceName, String aTestIdentifier) throws Exception { System.out.println("\n#########################################################################\n"); System.out.println("Starting new test, identifier: " + aTestIdentifier); System.out.println("\nInitializing class variables..."); this.isTestAbleToRun = true; //First, check if the SD file is present and ignore test if it is not - File tmpSDFile = new File(anSDFilePath); - if (!tmpSDFile.canRead() || tmpSDFile.getAbsoluteFile().getParent() == null) { - System.out.println("\n\tUnable to find or read a file with path \"" + anSDFilePath + "\" or to get its parent directory."); - System.out.println("\nTest is ignored."); + File tmpSDFile = null; + try { + tmpSDFile = new File(ErtlFunctionalGroupsFinderEvaluationTest.class.getResource(anSDFileResourceName).getPath()); + } catch (Exception e) { + this.isTestAbleToRun = false; + } + if (!tmpSDFile.canRead()) { this.isTestAbleToRun = false; + } + if (!this.isTestAbleToRun) { + System.out.println("\n\tUnable to find or read a file with path \"" + anSDFileResourceName + "\"."); + System.out.println("\nTest is ignored."); Assumptions.assumeTrue(false); return; } @@ -995,9 +997,9 @@ private void initializeWithFileOperations(String anSDFilePath, String aTestIdent ErtlFunctionalGroupsFinderEvaluationTest.DATE_TIME_FORMAT_PATTERN)); //Set up exceptions log file File tmpExceptionsLogFile = new File(this.outputDirectory + File.separator + aTestIdentifier - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.EXCEPTIONS_LOG_FILE_NAME - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + 
ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + tmpDateTimeAddition + ErtlFunctionalGroupsFinderEvaluationTest.EXCEPTIONS_LOG_FILE_TYPE); int tmpFilesInThisMinuteCounter = 1; @@ -1007,9 +1009,9 @@ private void initializeWithFileOperations(String anSDFilePath, String aTestIdent tmpNumberAddedToFileName = true; while (tmpFilesInThisMinuteCounter <= Integer.MAX_VALUE) { tmpExceptionsLogFile = new File(this.outputDirectory + File.separator + aTestIdentifier - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.EXCEPTIONS_LOG_FILE_NAME - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + tmpDateTimeAddition + "(" + tmpFilesInThisMinuteCounter + ")" + ErtlFunctionalGroupsFinderEvaluationTest.EXCEPTIONS_LOG_FILE_TYPE); @@ -1032,16 +1034,16 @@ private void initializeWithFileOperations(String anSDFilePath, String aTestIdent File tmpFilteredMoleculesFile; if (tmpNumberAddedToFileName) { tmpFilteredMoleculesFile = new File(this.outputDirectory+ File.separator + aTestIdentifier - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.FILTERED_MOLECULES_FILE_NAME - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + tmpDateTimeAddition + "(" + tmpFilesInThisMinuteCounter + ")" + ErtlFunctionalGroupsFinderEvaluationTest.FILTERED_MOLECULES_FILE_TYPE); } else { tmpFilteredMoleculesFile = new File(this.outputDirectory+ File.separator + aTestIdentifier - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + 
ErtlFunctionalGroupsFinderEvaluationTest.FILTERED_MOLECULES_FILE_NAME - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + tmpDateTimeAddition + ErtlFunctionalGroupsFinderEvaluationTest.FILTERED_MOLECULES_FILE_TYPE); } @@ -1054,17 +1056,17 @@ private void initializeWithFileOperations(String anSDFilePath, String aTestIdent File tmpOutputFile; if (tmpNumberAddedToFileName) { tmpOutputFile = new File(this.outputDirectory+ File.separator + aTestIdentifier - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_NAME - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + tmpDateTimeAddition + "(" + tmpFilesInThisMinuteCounter + ")" + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_TYPE); } else { tmpOutputFile = new File(this.outputDirectory+ File.separator + aTestIdentifier - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_NAME - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + tmpDateTimeAddition + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_TYPE); } @@ -1437,14 +1439,14 @@ private void saveData() { System.out.println("\nWriting to file..."); //Writing the output file's header String tmpFileHeader = ErtlFunctionalGroupsFinderEvaluationTest.HASH_CODE_KEY - + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.PSEUDO_SMILES_CODE_KEY - + 
ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.SMILES_CODE_KEY; for (String tmpSettingsKey : this.settingsKeysList) { - tmpFileHeader += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR + tmpSettingsKey; + tmpFileHeader += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + tmpSettingsKey; } - tmpFileHeader += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR + tmpFileHeader += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.MOLECULE_OF_ORIGIN_KEY; this.dataOutputPrintWriter.println(tmpFileHeader); this.dataOutputPrintWriter.flush(); @@ -1457,15 +1459,15 @@ private void saveData() { String tmpPseudoSmilesCode = (String) tmpInnerMap.get(ErtlFunctionalGroupsFinderEvaluationTest.PSEUDO_SMILES_CODE_KEY); //Writing the record for this functional group String tmpRecord = tmpHashCode - + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + tmpPseudoSmilesCode - + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + tmpSmilesCode; for (String tmpSettingsKey : this.settingsKeysList) { if (tmpInnerMap.get(tmpSettingsKey) == null) { tmpInnerMap.put(tmpSettingsKey, 0); } - tmpRecord += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR + tmpRecord += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + tmpInnerMap.get(tmpSettingsKey); } IAtomContainer tmpMoleculeOfOrigin = (IAtomContainer)tmpInnerMap.get( @@ -1474,13 +1476,13 @@ private void saveData() { String tmpChemblId = tmpMoleculeOfOrigin.getProperty("chembl_id"); String tmpCdkTitle = tmpMoleculeOfOrigin.getProperty(CDKConstants.TITLE); if (tmpChebiId != null) { - tmpRecord += 
ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR + tmpChebiId; + tmpRecord += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + tmpChebiId; } else if (tmpChemblId != null) { - tmpRecord += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR + tmpChemblId; + tmpRecord += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + tmpChemblId; } else if (tmpCdkTitle != null) { - tmpRecord += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR + tmpCdkTitle; + tmpRecord += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + tmpCdkTitle; } else { - tmpRecord += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR + tmpRecord += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.MOLECULE_OF_ORIGIN_ID_PLACEHOLDER; } this.dataOutputPrintWriter.println(tmpRecord); diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java index db358c7..0cf2e41 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java @@ -491,7 +491,7 @@ public void testDisconnectedMolecules2() throws Exception { } // /** - * Tests functional group identification on example molecules that contain metal or metalloid atoms. + * Tests correct functional group identification on an example molecule with metal/metalloid atoms. * * Note: all atoms are marked as hetero atoms by EFGF that are not H or C. So, metals and metalloids get treated like * any other hetero atom. @@ -507,7 +507,7 @@ public void testMetalsMetalloids1() throws Exception { } // /** - * Tests functional group identification on example molecules that contain metal or metalloid atoms. + * Tests correct functional group identification on an example molecule with metal/metalloid atoms. 
* * Note: all atoms are marked as hetero atoms by EFGF that are not H or C. So, metals and metalloids get treated like * any other hetero atom. @@ -523,7 +523,12 @@ public void testMetalsMetalloids2() throws Exception { } // /** + * Tests correct functional group identification on an example molecule with pseudo (R) atoms. * + * Note: these pseudo (R) atoms are simply ignored by EFGF. + * + * @throws Exception if anything goes wrong + * @author Jonas Schaub */ @Test public void testRAtoms1() throws Exception { From 6214bac4d66a158be3360f77a0e73e72405a001e Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Thu, 1 Feb 2024 17:16:07 +0100 Subject: [PATCH 24/27] Finished cleaning up the evaluation test; some more refactoring and documentation; --- .../ErtlFunctionalGroupsFinderUtility.java | 42 +++++------ ...nalGroupsFinderPerformanceSnapshotApp.java | 24 +++--- .../efgf/app/ExtractFunctionalGroupsTask.java | 9 +-- .../openscience/cdk/tools/efgf/app/Main.java | 7 +- ...lFunctionalGroupsFinderEvaluationTest.java | 73 ++++++++++--------- ...ErtlFunctionalGroupsFinderUtilityTest.java | 10 +-- 6 files changed, 83 insertions(+), 82 deletions(-) diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java index 60b5b27..f47803c 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java @@ -93,7 +93,7 @@ public int encode(IAtom anAtom, IAtomContainer aContainer) { // // private ErtlFunctionalGroupsFinderUtility() { - + // only created because JavaDoc task complained. 
} // // @@ -136,10 +136,10 @@ public static boolean isAtomOrBondCountZero(IAtomContainer aMolecule) throws Nul int tmpBondCount = aMolecule.getBondCount(); return (tmpAtomCount == 0 || tmpBondCount == 0); } - + // /** * Checks whether the given molecule represented by an atom container should NOT be passed on to the - * ErtlFunctionalGroupsFinder.find() method but instead be filtered. + * ErtlFunctionalGroupsFinder.find() method but instead be filtered if(!) strict input restrictions are turned on (turned off by default). *
In detail, this function returns true if the given atom container contains metal, metalloid, or pseudo atoms * or has an atom or bond count equal to zero. *
If this method returns false, this does NOT mean the molecule can be passed on to find() without a problem. It @@ -152,10 +152,10 @@ public static boolean isAtomOrBondCountZero(IAtomContainer aMolecule) throws Nul public static boolean shouldBeFiltered(IAtomContainer aMolecule) throws NullPointerException { return ErtlFunctionalGroupsFinderUtility.shouldBeFiltered(aMolecule, true); } - + // /** * Checks whether the given molecule represented by an atom container should NOT be passed on to the - * ErtlFunctionalGroupsFinder.find() method but instead be filtered. + * ErtlFunctionalGroupsFinder.find() method but instead be filtered if(!) strict input restrictions are turned on (turned off by default). *
In detail, this function returns true if the given atom container contains metal, metalloid, or pseudo atoms * or has an atom or bond count equal to zero. If the second parameter is set to "false", single atom molecules * (bond count is 0) are accepted and not recommended to be filtered if they fulfill the other requirements. @@ -187,10 +187,11 @@ public static boolean shouldBeFiltered(IAtomContainer aMolecule, boolean areSing } return tmpShouldBeFiltered; } - + // /** * Checks whether the given molecule represented by an atom container needs to be preprocessed before it is passed - * on to the ErtlFunctionalGroupsFinder.find() method because it is unconnected or contains charged atoms. + * on to the ErtlFunctionalGroupsFinder.find() method because it is unconnected or contains charged atoms if(!) + * strict input restrictions are turned on (turned off by default). *
It is advised to check via shouldBeFiltered() whether the given molecule should be discarded anyway before * calling this function. * @@ -212,10 +213,10 @@ public static boolean shouldBePreprocessed(IAtomContainer aMolecule) throws Null } return tmpNeedsPreprocessing; } - + // /** * Checks whether the given molecule represented by an atom container can be passed on to the - * ErtlFunctionalGroupsFinder.find() method without problems. + * ErtlFunctionalGroupsFinder.find() method without problems if(!) strict input restrictions are turned on (turned off by default). *
This method will return false if the molecule contains any metal, metalloid, pseudo, or charged atoms, contains * multiple unconnected parts, or has an atom or bond count of zero. * @@ -226,10 +227,10 @@ public static boolean shouldBePreprocessed(IAtomContainer aMolecule) throws Null public static boolean isValidArgumentForFindMethod(IAtomContainer aMolecule) throws NullPointerException { return ErtlFunctionalGroupsFinderUtility.isValidArgumentForFindMethod(aMolecule, true); } - + // /** * Checks whether the given molecule represented by an atom container can be passed on to the - * ErtlFunctionalGroupsFinder.find() method without problems. + * ErtlFunctionalGroupsFinder.find() method without problems if(!) strict input restrictions are turned on (turned off by default). *
This method will return false if the molecule contains any metal, metalloid, pseudo, or charged atoms, contains * multiple unconnected parts, or has an atom or bond count of zero. If the second parameter is set to "false", single atom molecules * (bond count is 0) are accepted and not recommended to be filtered if they fulfill the other requirements. @@ -291,7 +292,7 @@ public static IAtomContainer selectBiggestUnconnectedComponent(IAtomContainer aM tmpBiggestComponent.setProperties(aMolecule.getProperties()); return tmpBiggestComponent; } - + // /** * Neutralizes charged atoms in the given atom container by zeroing the formal atomic charges and filling up free * valences with implicit hydrogen atoms (according to the CDK atom types). This procedure allows a more general @@ -317,7 +318,7 @@ public static void neutralizeCharges(IAtomContainer aMolecule) throws NullPointe ErtlFunctionalGroupsFinderUtility.neutralizeCharges(tmpAtom, aMolecule); } } - + // /** * Neutralizes a charged atom in the given parent atom container by zeroing the formal atomic charge and filling up free * valences with implicit hydrogen atoms (according to the CDK atom types). @@ -361,7 +362,7 @@ public static void neutralizeCharges(IAtom anAtom, IAtomContainer aParentMolecul tmpHAdder.addImplicitHydrogens(aParentMolecule, anAtom); } } - + // /** * Convenience method to perceive atom types for all IAtoms in the IAtomContainer, using the * CDK AtomContainerManipulator or rather the CDKAtomTypeMatcher. If the matcher finds a matching atom type, the @@ -385,7 +386,7 @@ public static void perceiveAtomTypesAndConfigureAtoms(IAtomContainer aMolecule) //Might throw CDKException but it is unclear in what case AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(aMolecule); } - + // /** * Convenience method for applying the given aromaticity model to the given molecule. Any existing aromaticity flags * are removed - even if no aromatic bonds were found. 
This follows the idea of applying an aromaticity model to a @@ -424,7 +425,7 @@ public static boolean applyAromaticityDetection(IAtomContainer aMolecule, Aromat } return tmpIsAromatic; } - + // /** * Checks whether the given molecule represented by an atom container should be filtered instead of being passed * on to the ErtlFunctionalGroupsFinder.find() method and if not, applies necessary preprocessing steps. @@ -432,7 +433,7 @@ public static boolean applyAromaticityDetection(IAtomContainer aMolecule, Aromat * to the given atom container that is always needed (setting atom types and applying an aromaticity model) and * preprocessing steps that are only needed in specific cases (selecting the biggest unconnected component, neutralizing * charges). Molecules processed by this method can be passed on to find() without problems (Caution: The return value - * of this method is 'null' if the molecule should be filtered!). + * of this method is 'null' if the molecule should be filtered!) if(!) strict input restrictions are turned on (turned off by default). *
NOTE: This method changes major properties and the composition of the given IAtomContainer object! If you * want to retain your object unchanged for future calculations, use the IAtomContainer's * clone() method. @@ -449,7 +450,7 @@ public static boolean applyAromaticityDetection(IAtomContainer aMolecule, Aromat public static IAtomContainer applyFiltersAndPreprocessing(IAtomContainer aMolecule, Aromaticity anAromaticityModel) throws NullPointerException { return ErtlFunctionalGroupsFinderUtility.applyFiltersAndPreprocessing(aMolecule, anAromaticityModel, true); } - + // /** * Checks whether the given molecule represented by an atom container should be filtered instead of being passed * on to the ErtlFunctionalGroupsFinder.find() method and if not, applies necessary preprocessing steps. @@ -457,7 +458,7 @@ public static IAtomContainer applyFiltersAndPreprocessing(IAtomContainer aMolecu * to the given atom container that is always needed (setting atom types and applying an aromaticity model) and * preprocessing steps that are only needed in specific cases (selecting the biggest unconnected component, neutralizing * charges). Molecules processed by this method can be passed on to find() without problems (Caution: The return value - * of this method is 'null' if the molecule should be filtered!). + * of this method is 'null' if the molecule should be filtered!) if(!) strict input restrictions are turned on (turned off by default). *
NOTE: This method changes major properties and the composition of the given IAtomContainer object! If you * want to retain your object unchanged for future calculations, use the IAtomContainer's * clone() method. @@ -518,7 +519,6 @@ public static IAtomContainer applyFiltersAndPreprocessing(IAtomContainer aMolecu //
// // - /** * Replaces the environmental carbon or pseudo-atoms (new IAtom objects) inserted by the EFGF in an identified * functional group with the carbon IAtom objects from the original molecule object. @@ -617,7 +617,7 @@ public static void restoreOriginalEnvironmentalCarbons( } } } - + // /** * Gives the pseudo SMILES code for a given molecule / functional group. In this notation, aromatic atoms are marked * by asterisks (*) and pseudo atoms are indicated by 'R'. diff --git a/src/main/java/org/openscience/cdk/tools/efgf/app/ErtlFunctionalGroupsFinderPerformanceSnapshotApp.java b/src/main/java/org/openscience/cdk/tools/efgf/app/ErtlFunctionalGroupsFinderPerformanceSnapshotApp.java index d9f137f..2524b03 100644 --- a/src/main/java/org/openscience/cdk/tools/efgf/app/ErtlFunctionalGroupsFinderPerformanceSnapshotApp.java +++ b/src/main/java/org/openscience/cdk/tools/efgf/app/ErtlFunctionalGroupsFinderPerformanceSnapshotApp.java @@ -62,58 +62,60 @@ /** * An application for testing the performance of the ErtlFunctionalGroupsFinder.find() method under parallelization on * multiple threads. + *

+ * Legacy code that still assumes that the old input restrictions are turned on. * * @author Jonas Schaub * @version 1.2.0.0 */ public class ErtlFunctionalGroupsFinderPerformanceSnapshotApp { - + // // /** * Name of file for logging occurred exceptions */ private static final String EXCEPTIONS_LOG_FILE_NAME = "Exceptions_Log.txt"; - + // /** * Name of file for writing results */ private static final String RESULTS_FILE_NAME = "Results.txt"; - + // /** * All allowed atomic numbers to pass to the ErtlFunctionalGroupsFinder; * String will be split and resulting integers passed to a set */ private static final String NON_METALLIC_ATOMIC_NUMBERS = "1,2,6,7,8,9,10,15,16,17,18,34,35,36,53,54,86"; // - + // // /** * All allowed atomic numbers to pass to the ErtlFunctionalGroupsFinder as a set of integers (will be parsed from * NON_METALLIC_ATOMIC_NUMBERS) */ private Set nonMetallicAtomicNumbersSet; - + // /** * The working directory (the jar-file's directory) */ private String workingPath; - + // /** * The given number of different threads to use */ private int numberOfThreadsToUse; - + // /** * All molecules loaded from the SD file */ private IAtomContainer[] moleculesArray; - + // /** * The aromaticity model in use */ private Aromaticity aromaticityModel; // - + // // /** * Instantiates and starts the application. It first loads all molecules from a given SD file into memory and then @@ -265,7 +267,7 @@ public ErtlFunctionalGroupsFinderPerformanceSnapshotApp(String[] anArgs) throws } } // - + // // /** * Performs all preprocessing needed for the ErtlFunctionalGroupsFinder and throws an IllegalArgumentException @@ -311,7 +313,7 @@ private IAtomContainer applyFiltersAndPreprocessing(IAtomContainer aMolecule) th this.aromaticityModel.apply(aMolecule); return aMolecule; } - + // /** * Appends the given exception's stack trace to a log file. 
* diff --git a/src/main/java/org/openscience/cdk/tools/efgf/app/ExtractFunctionalGroupsTask.java b/src/main/java/org/openscience/cdk/tools/efgf/app/ExtractFunctionalGroupsTask.java index 7fc144f..90ee117 100644 --- a/src/main/java/org/openscience/cdk/tools/efgf/app/ExtractFunctionalGroupsTask.java +++ b/src/main/java/org/openscience/cdk/tools/efgf/app/ExtractFunctionalGroupsTask.java @@ -33,11 +33,11 @@ * @version 1.2 */ public class ExtractFunctionalGroupsTask implements Callable { - + // private final IAtomContainer[] moleculesArray; - + // private final ErtlFunctionalGroupsFinder ertlFinder; - + // /** * Instantiates the thread. * @@ -48,7 +48,7 @@ public ExtractFunctionalGroupsTask(IAtomContainer[] aListOfMolecules) { this.moleculesArray = aListOfMolecules; this.ertlFinder = new ErtlFunctionalGroupsFinder(); } - + // /** * Applies the ErtlFunctionalGroupsFinder.find(IAtomContainer container, boolean clone) method on all given * molecules (parameter clone = false) and counts the occurring exceptions. @@ -68,5 +68,4 @@ public Integer call() throws Exception { } return tmpExceptionsCounter; } - } diff --git a/src/main/java/org/openscience/cdk/tools/efgf/app/Main.java b/src/main/java/org/openscience/cdk/tools/efgf/app/Main.java index 0ae5470..954af5d 100644 --- a/src/main/java/org/openscience/cdk/tools/efgf/app/Main.java +++ b/src/main/java/org/openscience/cdk/tools/efgf/app/Main.java @@ -27,11 +27,11 @@ * @version 1.2 */ public class Main { - + // private Main() { - + // only created because JavaDoc task complained. } - + // /** * Starts the application. Command line arguments must be the name of an SD-file to read (must be located in the * same directory as the application's .jar file) and the number of different threads to use for calculation. 
@@ -46,5 +46,4 @@ public static void main(String[] args) { System.exit(1); } } - } diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java index 410ddbb..f018670 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java @@ -80,16 +80,18 @@ *

* All written files will be placed in the output folder. *

- * Note for addition of new tests: Only one SD file should be analyzed per test method (since some mechanisms work under + * Note for addition of new tests: Only one SD file should be analysed per test method (since some mechanisms work under * that assumption). *

* NOTE that this code was written before the class ErtlFunctionalGroupsFinderUtility was implemented to make this type * of analyses more straightforward using its utility methods. This test class here therefore does not use the EFGFUtility class. - * This test class was also developed and used before EFGF was reworked before version 1.3. It can now only bee seen as outdated example code - * on how to analyse larger datasets using EFGF!!! + * In fact, the routines used here were extracted to develop the EFGFUtility class. + * This test class was also developed and used before EFGF was reworked before version 1.3. It can now only bee seen as outdated example/legacy code + * on how to analyse larger datasets using EFGF!!! It is also documentation on how the analyses presented in the scientific + * article about EFGF were conducted. * * @author Jonas Schaub - * @version 1.2 + * @version 1.3 */ public class ErtlFunctionalGroupsFinderEvaluationTest { @@ -102,7 +104,7 @@ public class ErtlFunctionalGroupsFinderEvaluationTest { private static final String SD_FILE_TEST_RESOURCE_NAME = "ChEBI_lite_3star_subset.sdf"; /** - * Directory for output files; Will be created as sub-folder in the working directory (the directory of the read SD file) + * Folder name for output files; will be created in repo root directory */ private static final String OUTPUT_FOLDER_FROM_WORKING_DIRECTORY = "ErtlFunctionalGroupsFinderEvaluationTest_Output"; @@ -148,10 +150,10 @@ public class ErtlFunctionalGroupsFinderEvaluationTest { * First lines in the exceptions log file */ private static final String EXCEPTIONS_LOG_FILE_HEADER = "Following molecules led to the specified exceptions:" - + System.getProperty("line.separator") + + System.lineSeparator() + "(Note: If too many exceptions are thrown too fast the JVM stops filling in the complete stack trace. 
" + "You need to be looking at an earlier stack trace to see the details.)" - + System.getProperty("line.separator"); + + System.lineSeparator(); // // @@ -438,7 +440,7 @@ public class ErtlFunctionalGroupsFinderEvaluationTest { * functional groups and its values are inner HashMaps that hold the (pseudo) SMILES representation of a functional * group and its frequencies for different settings as String-Object pairs, plus an exemplary molecule of origin */ - private HashMap masterHashMap; + private HashMap> masterHashMap; /** * A map that gives a certain element symbol for a placeholder atom marking a specific aromatic atom in pseudo SMILES @@ -530,7 +532,7 @@ public void testCycleFinderDependency() throws Exception { Assumptions.assumeTrue(this.isTestAbleToRun); System.out.println("\nLoading file with path: " + ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME); - File tmpSDFile = new File(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME); + File tmpSDFile = new File(ErtlFunctionalGroupsFinderEvaluationTest.class.getResource(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME).getPath()); int tmpRequiredNumberOfReaders = 6; IteratingSDFReader[] tmpReaders = new IteratingSDFReader[tmpRequiredNumberOfReaders]; try { @@ -580,11 +582,10 @@ public void testCycleFinderDependency() throws Exception { public void testPerformance() throws Exception { this.initialize(true, "PerformanceTest"); //First, check if the SD file is present and ignore test if it is not - String tmpPathToSDFile = ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME; - System.out.println("\nLoading file with path: " + tmpPathToSDFile); - File tmpSDFile = new File(tmpPathToSDFile); + System.out.println("\nLoading file with path: " + ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME); + File tmpSDFile = new 
File(ErtlFunctionalGroupsFinderEvaluationTest.class.getResource(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME).getPath()); if (!tmpSDFile.canRead()) { - System.out.println("\n\tUnable to find or read a file with path \"" + tmpPathToSDFile + "\"."); + System.out.println("\n\tUnable to find or read a file with path \"" + ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME + "\"."); System.out.println("\nTest is ignored."); Assumptions.assumeTrue(false); return; @@ -910,7 +911,7 @@ private void initialize(boolean aShouldPrintHeader, String aTestIdentifier) { .molecular(); this.ertlFGFinderGenOff = new ErtlFunctionalGroupsFinder(Mode.NO_GENERALIZATION); this.ertlFGFinderGenOn = new ErtlFunctionalGroupsFinder(Mode.DEFAULT); - this.masterHashMap = new HashMap(ErtlFunctionalGroupsFinderEvaluationTest.MASTER_HASHMAP_INITIAL_CAPACITY, + this.masterHashMap = new HashMap<>(ErtlFunctionalGroupsFinderEvaluationTest.MASTER_HASHMAP_INITIAL_CAPACITY, ErtlFunctionalGroupsFinderEvaluationTest.MASTER_HASHMAP_LOAD_FACTOR); this.settingsKeysList = new LinkedList<>(); this.exceptionsCounter = 0; @@ -920,7 +921,7 @@ private void initialize(boolean aShouldPrintHeader, String aTestIdentifier) { for (int i = 0; i < tmpMetalNumbersStrings.length; i++) { tmpMetalNumbersInt[i] = Integer.parseInt(tmpMetalNumbersStrings[i]); } - this.nonMetallicAtomicNumbersSet = new HashSet(Arrays.asList(tmpMetalNumbersInt)); + this.nonMetallicAtomicNumbersSet = new HashSet<>(Arrays.asList(tmpMetalNumbersInt)); this.pseudoSmilesAromaticElementToPlaceholderElementMap = new HashMap<>(10, 1); this.pseudoSmilesAromaticElementToPlaceholderElementMap.put("C", "Ce"); this.pseudoSmilesAromaticElementToPlaceholderElementMap.put("N", "Nd"); @@ -981,7 +982,7 @@ private void initializeWithFileOperations(String anSDFileResourceName, String aT return; } //Determine the output directory - String tmpOutputRootDirectory = tmpSDFile.getAbsoluteFile().getParent() + File.separator; + String 
tmpOutputRootDirectory = new File("").getAbsolutePath() + File.separator; this.outputDirectory = tmpOutputRootDirectory + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FOLDER_FROM_WORKING_DIRECTORY + File.separator @@ -1187,9 +1188,10 @@ private void calculateAbsoluteFGFrequencies( } try { aReader.close(); - } catch (IOException anIOException) { } - //Since the filters remain the same in every iteration filtered molecules must be logged only once - //(assuming that only one SD file is analyzed in a test) + } catch (IOException anIOException) { + //Since the filters remain the same in every iteration filtered molecules must be logged only once + //(assuming that only one SD file is analyzed in a test) + } if (!this.areFilteredMoleculesLogged) { this.areFilteredMoleculesLogged = true; } @@ -1385,7 +1387,7 @@ private void enterFunctionalGroupsIntoMasterMap( } //Case: functional group is already in the master HashMap if (this.masterHashMap.containsKey(tmpHashCode)) { - HashMap tmpInnerMap = (HashMap)this.masterHashMap.get(tmpHashCode); + HashMap tmpInnerMap = this.masterHashMap.get(tmpHashCode); //And a key-value pair for this settings key is already present too -> raise frequency by one if (tmpInnerMap.containsKey(aSettingsKey)) { int tmpFrequency = (int)tmpInnerMap.get(aSettingsKey); @@ -1397,7 +1399,7 @@ private void enterFunctionalGroupsIntoMasterMap( } //The functional group did not occur before -> create a new inner HashMap for this molecule } else { - HashMap tmpNewInnerMap = new HashMap( + HashMap tmpNewInnerMap = new HashMap<>( ErtlFunctionalGroupsFinderEvaluationTest.INNER_HASHMAPS_INITIAL_CAPACITY); tmpNewInnerMap.put(ErtlFunctionalGroupsFinderEvaluationTest.MOLECULE_OF_ORIGIN_KEY, anFGContainingMolecule); tmpNewInnerMap.put(aSettingsKey, 1); @@ -1438,38 +1440,37 @@ private void saveData() { } System.out.println("\nWriting to file..."); //Writing the output file's header - String tmpFileHeader = ErtlFunctionalGroupsFinderEvaluationTest.HASH_CODE_KEY + 
StringBuilder tmpFileHeaderBuilder = new StringBuilder(ErtlFunctionalGroupsFinderEvaluationTest.HASH_CODE_KEY + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.PSEUDO_SMILES_CODE_KEY + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR - + ErtlFunctionalGroupsFinderEvaluationTest.SMILES_CODE_KEY; + + ErtlFunctionalGroupsFinderEvaluationTest.SMILES_CODE_KEY); for (String tmpSettingsKey : this.settingsKeysList) { - tmpFileHeader += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + tmpSettingsKey; + tmpFileHeaderBuilder.append(ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR).append(tmpSettingsKey); } + String tmpFileHeader = tmpFileHeaderBuilder.toString(); tmpFileHeader += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.MOLECULE_OF_ORIGIN_KEY; this.dataOutputPrintWriter.println(tmpFileHeader); this.dataOutputPrintWriter.flush(); - Iterator tmpFunctionalGroupsIterator = this.masterHashMap.keySet().iterator(); + Iterator tmpFunctionalGroupsIterator = this.masterHashMap.keySet().iterator(); //Iteration for all molecules in the master HashMap while (tmpFunctionalGroupsIterator.hasNext()) { - long tmpHashCode = (long)tmpFunctionalGroupsIterator.next(); - HashMap tmpInnerMap = (HashMap)this.masterHashMap.get(tmpHashCode); + long tmpHashCode = tmpFunctionalGroupsIterator.next(); + HashMap tmpInnerMap = this.masterHashMap.get(tmpHashCode); String tmpSmilesCode = (String) tmpInnerMap.get(ErtlFunctionalGroupsFinderEvaluationTest.SMILES_CODE_KEY); String tmpPseudoSmilesCode = (String) tmpInnerMap.get(ErtlFunctionalGroupsFinderEvaluationTest.PSEUDO_SMILES_CODE_KEY); //Writing the record for this functional group - String tmpRecord = tmpHashCode + StringBuilder tmpRecordBuilder = new StringBuilder(tmpHashCode + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + tmpPseudoSmilesCode + 
ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR - + tmpSmilesCode; + + tmpSmilesCode); for (String tmpSettingsKey : this.settingsKeysList) { - if (tmpInnerMap.get(tmpSettingsKey) == null) { - tmpInnerMap.put(tmpSettingsKey, 0); - } - tmpRecord += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR - + tmpInnerMap.get(tmpSettingsKey); + tmpInnerMap.putIfAbsent(tmpSettingsKey, 0); + tmpRecordBuilder.append(ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR).append(tmpInnerMap.get(tmpSettingsKey)); } + String tmpRecord = tmpRecordBuilder.toString(); IAtomContainer tmpMoleculeOfOrigin = (IAtomContainer)tmpInnerMap.get( ErtlFunctionalGroupsFinderEvaluationTest.MOLECULE_OF_ORIGIN_KEY); String tmpChebiId = tmpMoleculeOfOrigin.getProperty("ChEBI ID"); @@ -1553,7 +1554,7 @@ private String getPseudoSmilesCode(IAtomContainer aMolecule) throws CDKException */ private void logFilteredMolecule(IAtomContainer aMolecule, int aCounter, String aCause) { if(!this.areFileOperationsActivated) { - System.out.println("\nFile operations are not activated, invokation of logFilteredMolecule() is therefore not possible."); + System.out.println("\nFile operations are not activated, invocation of logFilteredMolecule() is therefore not possible."); return; } this.filteredMoleculesPrintWriter.println(); @@ -1591,7 +1592,7 @@ private void logFilteredMolecule(IAtomContainer aMolecule, int aCounter, String */ private void logException(Exception anException, String aSettingsKey, IAtomContainer aMolecule) { if(!this.areFileOperationsActivated) { - System.out.println("\nFile operations are not activated, invokation of logException() is therefore not possible."); + System.out.println("\nFile operations are not activated, invocation of logException() is therefore not possible."); return; } this.exceptionsCounter++; diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java 
b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java index de18491..4f46b75 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java @@ -71,7 +71,7 @@ public void testPseudoSmilesGeneration() throws Exception { Assertions.assertEquals(tmpTestPairsMap.get(tmpSmilesCode), tmpPseudoSmilesCode); } } - + // /** * Test for correct MoleculeHashGenerator settings/performance on some examples. * @@ -143,7 +143,7 @@ public void testMoleculeHashGeneratorSettings() throws Exception { Assertions.assertEquals(tmpHashGenerator.generate(tmpKeyMol), tmpHashGenerator.generate(tmpValueMol)); } } - + // /** * Test for correct preprocessing (neutralization of charges and selection of biggest fragment). * @@ -160,7 +160,7 @@ public void testPreprocessing() throws Exception { SmilesGenerator tmpGenerator = new SmilesGenerator(SmiFlavor.Unique); Assertions.assertEquals("OCC", tmpGenerator.create(tmpMol)); } - + // /** * Tests the restoration of environmental carbon atom objects on one example molecule. Nothing is asserted here, it * is meant for visual inspection. @@ -209,7 +209,7 @@ public void testRestorationOfEnvironmentalCarbons() throws Exception { System.out.println(tmpSmiGen.create(tmpFG)); } } - + // /** * Imports a charged molecule with a counter-ion from ChEMBL to test the filtering and preprocessing routines * of ErtlFunctionalGroupsFinderUtility. @@ -240,7 +240,7 @@ public void testOnMolecule() throws Exception { System.out.println(ErtlFunctionalGroupsFinderUtility.createPseudoSmilesCode(tmpFG)); } } - + // /** * Test charge neutralization. 
*/ From 9f4f57c3a834364db89d4e272b3947fc07931325 Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Thu, 1 Feb 2024 18:05:18 +0100 Subject: [PATCH 25/27] Fixed hydrogen bug and disabled evaluation test; --- .../cdk/tools/ErtlFunctionalGroupsFinder.java | 10 +++++----- ...rtlFunctionalGroupsFinderEvaluationTest.java | 2 ++ .../tools/ErtlFunctionalGroupsFinderTest.java | 17 +++++++++++++++++ 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index 3db61a0..499487d 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -761,12 +761,12 @@ private void markAtoms(IAtomContainer aMolecule) { // if H... // convert to implicit H IAtom tmpConnectedAtom; - try { + if (this.adjListCache[idx].length > 0) { tmpConnectedAtom = aMolecule.getAtom(this.adjListCache[idx][0]); - } catch(ArrayIndexOutOfBoundsException anException) { - //TODO: this happens too often to ignore, investigate! - ErtlFunctionalGroupsFinder.LOGGING_TOOL.warn("Explicit H was included in atom count but not correctly in adjacency list"); - break; + } else { + //unconnected, explicit hydrogen atoms (like e.g. 
in CHEBI:365445) have an array of bond partners of size 0 + // nothing to do about them, but they also do not concern us + continue; } if (Objects.isNull(tmpConnectedAtom.getImplicitHydrogenCount())) { tmpConnectedAtom.setImplicitHydrogenCount(1); diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java index f018670..d914a88 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java @@ -22,6 +22,7 @@ import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.openscience.cdk.Atom; import org.openscience.cdk.CDKConstants; @@ -93,6 +94,7 @@ * @author Jonas Schaub * @version 1.3 */ +@Disabled("Legacy code and lengthy analyses") public class ErtlFunctionalGroupsFinderEvaluationTest { // diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java index 0cf2e41..5319b61 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java @@ -537,6 +537,23 @@ public void testRAtoms1() throws Exception { this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); } // + /** + * Tests that a former bug concerning unconnected, explicit hydrogen atoms does not occur anymore. 
+ * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testHydrogenBug() throws Exception { + String tmpMoleculeSmiles = "[H+].[H+].[O-]C(=O)\\C=C/C([O-])=O.[H][C@@]12Cc3c[nH]c4cccc(C1=C[C@@H](COC(=O)C1CCCCC1)CN2C)c34"; //CHEBI:365445 + String[] tmpExpectedFGs = new String[]{"O=C([O-])[C]=[C]C(=O)[O-]", "[C]=[C]", "*OC(*)=O", "[R]N([R])[R]", "NarR3"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + + tmpMoleculeSmiles = "[HH].O=C1N([C@H](C)C(C1=C(O)[C@]2([C@]3([C@H](C=C([C@H]2[C@@H](C(=O)O)CC)C)C[C@H](C)CC3)C)C)=O)C"; //CHEBI:223373 + tmpExpectedFGs = new String[]{"*C(=O)C(=[C]O[H])C(=O)N(*)*", "[C]=[C]", "*C(=O)O[H]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // /** * Applies EFGF to detect functional groups in the given molecule and compares the identified FG to the given * expected FG, using i.a. an identity search. Note that the order of the given FG must match the order of the detected From 4a2f907079d72751ea71b5df8b9084d7765e6ee4 Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Fri, 2 Feb 2024 10:14:01 +0100 Subject: [PATCH 26/27] Removed duplicated test methods from evaluation test; --- ...lFunctionalGroupsFinderEvaluationTest.java | 97 ------------------- 1 file changed, 97 deletions(-) diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java index d914a88..ef5781d 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java @@ -486,8 +486,6 @@ public ErtlFunctionalGroupsFinderEvaluationTest() { // // - - // /** * Test for analyzing molecules in an SD file for all four different electron donation models supplied by the cdk: * daylight, cdk, piBonds, cdkAllowingExocyclic and the 
aromaticity model cdkLegacy. @@ -727,101 +725,6 @@ public void readChebiLite3StarSubset() throws Exception { } // - // - /** - * Test for correct MoleculeHashGenerator settings/performance on some examples. - * - * @throws java.lang.Exception if initialize() throws an exception or a SMILES code can not be parsed into a molecule - */ - @Test - public void testMoleculeHashGeneratorSettings() throws Exception { - this.initialize(false, ""); - SmilesParser tmpSmilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); - - /*Chebi70986, Chebi16238 and Chebi57692 all contain the same functional group with pseudo SMILES code - "O=C1N=C(C(=NR)C(=O)N1R)N(R)R", but different hybridizations in the resulting atom containers. But their hash - codes should be the same under the given settings. This is tested exemplary for many similar cases*/ - String[] tmpSmilesArray = {"OC[C@@H](O)[C@@H](O)[C@@H](O)CN1CC(CO)N=C2C(=O)NC(=O)N=C12", - "Cc1cc2nc3c(nc(=O)[nH]c3=O)n(C[C@H](O)[C@H](O)[C@H](O)COP(O)(=O)OP(O)(=O)OC[C@H]3O[C@H]([C@H](O)[C@@H]3O)n3cnc4c(N)ncnc34)c2cc1C", - "Cc1cc2nc3c(nc(=O)[n-]c3=O)n(C[C@H](O)[C@H](O)[C@H](O)COP([O-])(=O)OP([O-])(=O)OC[C@H]3O[C@H]([C@H](O)[C@@H]3O)n3cnc4c(N)ncnc34)c2cc1C"}; - List tmpHashCodesList = new LinkedList<>(); - for (String tmpSmilesCode : tmpSmilesArray) { - IAtomContainer tmpParsedMolecule = tmpSmilesParser.parseSmiles(tmpSmilesCode); - tmpParsedMolecule = this.applyFiltersAndPreprocessing(tmpParsedMolecule); - Aromaticity.cdkLegacy().apply(tmpParsedMolecule); - List tmpFunctionalGroups = this.ertlFGFinderGenOn.find(tmpParsedMolecule); - for (IAtomContainer tmpFunctionalGroup : tmpFunctionalGroups) { - if (this.getPseudoSmilesCode(tmpFunctionalGroup).equals("O=C1N=C(C(=NR)C(=O)N1R)N(R)R")) { - tmpHashCodesList.add(this.molHashGenerator.generate(tmpFunctionalGroup)); - } - } - } - for (Long tmpHashCode1 : tmpHashCodesList) { - for (Long tmpHashCode2 : tmpHashCodesList) { - Assertions.assertEquals(tmpHashCode1.longValue(), 
tmpHashCode2.longValue()); - } - } - - /*Functional groups like the tertiary amine or the hydroxyl group appear with aromatic and non-aromatic central - atoms. These two cases should be discrimated by the MoleculeHashGenerator under the given settings*/ - String tmpTertiaryAmineSmiles = "*N(*)*"; - IAtomContainer tmpAromMol = tmpSmilesParser.parseSmiles(tmpTertiaryAmineSmiles); - IAtomContainer tmpNonAromMol = tmpSmilesParser.parseSmiles(tmpTertiaryAmineSmiles); - for (IAtom tmpAtom : tmpAromMol.atoms()) { - if (tmpAtom.getSymbol().equals("N")) - tmpAtom.setIsAromatic(true); - } - Assertions.assertNotEquals(this.molHashGenerator.generate(tmpAromMol), this.molHashGenerator.generate(tmpNonAromMol)); - String tmpHydroxylGroupSmiles = "[H]O[C]"; - tmpAromMol = tmpSmilesParser.parseSmiles(tmpHydroxylGroupSmiles); - tmpNonAromMol = tmpSmilesParser.parseSmiles(tmpHydroxylGroupSmiles); - for (IAtom tmpAtom : tmpAromMol.atoms()) { - if (tmpAtom.getSymbol().equals("C")) - tmpAtom.setIsAromatic(true); - } - Assertions.assertNotEquals(this.molHashGenerator.generate(tmpAromMol), this.molHashGenerator.generate(tmpNonAromMol)); - - /*The following are examples of different (unique!) SMILES codes representing the same functional groups. 
- They should be assigned the same hash code*/ - HashMap tmpEquivalentSmilesMap = new HashMap<>(20); - tmpEquivalentSmilesMap.put("*[N](*)=C(N(*)*)N(*)*", "*N(*)C(=[N](*)*)N(*)*"); - tmpEquivalentSmilesMap.put("*SC1=[N](*)[C]=[C]N1*", "*SC=1N(*)[C]=[C][N]1*"); - tmpEquivalentSmilesMap.put("*[N]1=[C][C]=[C]N1*", "*N1[C]=[C][C]=[N]1*"); - tmpEquivalentSmilesMap.put("*[N](*)=[C]N(*)*", "*N(*)[C]=[N](*)*"); - tmpEquivalentSmilesMap.put("*N(*)[C]=[C][C]=[C][C]=[C][C]=[C][C]=[N](*)*", "*[N](*)=[C][C]=[C][C]=[C][C]=[C][C]=[C]N(*)*"); - tmpEquivalentSmilesMap.put("*[N](*)=C(N(*)*)N(*)P(=O)(O[H])O[H]", "*N(*)C(=[N](*)*)N(*)P(=O)(O[H])O[H]"); - tmpEquivalentSmilesMap.put("[O]I(=O)=O", "O=I(=O)[O]"); - tmpEquivalentSmilesMap.put("[O]Br(=O)=O", "O=Br(=O)[O]"); - tmpEquivalentSmilesMap.put("[O]Cl(=O)(=O)=O", "O=Cl(=O)(=O)[O]"); - tmpEquivalentSmilesMap.put("[C]=[C][C]=[C]C#C[C]=[C]C#[C]", "[C]#C[C]=[C]C#C[C]=[C][C]=[C]"); - tmpEquivalentSmilesMap.put("*N1[C]=[C][C]=[N]1*", "*[N]1=[C][C]=[C]N1*"); - tmpEquivalentSmilesMap.put("O=C(*)O*", "*OC(*)=O"); - for (String tmpKeySmiles : tmpEquivalentSmilesMap.keySet()) { - IAtomContainer tmpKeyMol = tmpSmilesParser.parseSmiles(tmpKeySmiles); - IAtomContainer tmpValueMol = tmpSmilesParser.parseSmiles(tmpEquivalentSmilesMap.get(tmpKeySmiles)); - Assertions.assertEquals(this.molHashGenerator.generate(tmpKeyMol), this.molHashGenerator.generate(tmpValueMol)); - } - } - - /** - * Test for correct preprocessing (neutralization of charges and selection of biggest fragment). 
- * - * @throws Exception if initialize() throws an exception or a SMILES code can not be parsed into a molecule - */ - @Test - public void testPreprocessing() throws Exception { - this.initialize(false, ""); - String tmpSmiles = "CC[O-].C"; - SmilesParser tmpSmilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); - IAtomContainer tmpMol = tmpSmilesParser.parseSmiles(tmpSmiles); - tmpMol = this.applyFiltersAndPreprocessing(tmpMol); - SmilesGenerator tmpGenerator = SmilesGenerator.unique(); - Assertions.assertEquals("OCC", tmpGenerator.create(tmpMol)); - } - // - - // - // /** * Analyzes molecules in an SD file for all four different electron donation models supplied by the cdk: From 07e47c8eb977f0765d9d753e95b8f4aeb9aeb15e Mon Sep 17 00:00:00 2001 From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com> Date: Fri, 2 Feb 2024 10:18:06 +0100 Subject: [PATCH 27/27] Removed unused imports; --- .../cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java index ef5781d..4aedfe7 100644 --- a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java @@ -20,7 +20,6 @@ package org.openscience.cdk.tools; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assumptions; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -48,7 +47,6 @@ import org.openscience.cdk.silent.SilentChemObjectBuilder; import org.openscience.cdk.smiles.SmiFlavor; import org.openscience.cdk.smiles.SmilesGenerator; -import org.openscience.cdk.smiles.SmilesParser; import org.openscience.cdk.tools.ErtlFunctionalGroupsFinder.Mode; import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; 
import org.openscience.cdk.tools.manipulator.AtomTypeManipulator;