
Commit

Merge pull request #465 from carlocamilloni/mglob_prior
Mglob prior
carlocamilloni authored Sep 5, 2024
2 parents 32a7809 + 5e31096 commit ae2b31a
Showing 5 changed files with 399 additions and 83 deletions.
34 changes: 32 additions & 2 deletions multiego.py
@@ -2,6 +2,7 @@
import sys
import os
import numpy as np
import pandas as pd

from src.multiego import ensemble
from src.multiego import io
@@ -406,9 +407,38 @@ def get_meGO_LJ(meGO_ensemble, args):
"""
pairs14, exclusion_bonds14 = ensemble.generate_14_data(meGO_ensemble)
if args.egos == "rc":
meGO_LJ = ensemble.generate_basic_LJ(meGO_ensemble, args)
meGO_LJ_14 = pairs14
meGO_LJ, meGO_LJ_14 = ensemble.generate_basic_LJ(meGO_ensemble, args)
# meGO_LJ_14 = pairs14
meGO_LJ_14 = pd.concat([meGO_LJ_14, pairs14])
needed_fields = [
"ai",
"aj",
"type",
"c6",
"c12",
"sigma",
"epsilon",
"probability",
"rc_probability",
"md_threshold",
"rc_threshold",
"rep",
"cutoff",
"molecule_name_ai",
"molecule_name_aj",
"same_chain",
"source",
"number_ai",
"number_aj",
]
meGO_LJ_14 = meGO_LJ_14[needed_fields]
meGO_LJ_14["epsilon"] = -meGO_LJ_14["c12"]
meGO_LJ_14.reset_index(inplace=True)
# Sorting the pairs prioritising intermolecular interactions
meGO_LJ_14.sort_values(by=["ai", "aj", "c12"], ascending=[True, True, True], inplace=True)
# Cleaning the duplicates
meGO_LJ_14 = meGO_LJ_14.drop_duplicates(subset=["ai", "aj"], keep="first")
print(meGO_LJ_14)
else:
train_dataset, check_dataset = ensemble.init_LJ_datasets(meGO_ensemble, pairs14, exclusion_bonds14, args)
meGO_LJ, meGO_LJ_14 = ensemble.generate_LJ(meGO_ensemble, train_dataset, check_dataset, args)
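
In rc mode, the 1-4 pairs from the basic prior returned by generate_basic_LJ are now concatenated with the topology-derived pairs14, restricted to the needed fields, and deduplicated so that each (ai, aj) pair keeps the row with the smallest c12. A minimal sketch of that pandas pattern, with made-up identifiers and values in place of the real atom names and force-field parameters:

import pandas as pd

# Two sources provide the same (ai, aj) pair with different c12 values (illustrative numbers)
basic_pairs = pd.DataFrame({"ai": ["A_1", "A_1"], "aj": ["A_2", "A_3"], "c12": [2.0e-6, 5.0e-7]})
topology_pairs = pd.DataFrame({"ai": ["A_1"], "aj": ["A_2"], "c12": [1.0e-6]})

merged = pd.concat([basic_pairs, topology_pairs])
# Sort by c12 ascending so that drop_duplicates keeps the smallest c12 for each (ai, aj)
merged.sort_values(by=["ai", "aj", "c12"], ascending=[True, True, True], inplace=True)
merged = merged.drop_duplicates(subset=["ai", "aj"], keep="first")
print(merged)  # (A_1, A_2) keeps c12 = 1.0e-6; (A_1, A_3) keeps c12 = 5.0e-7
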
132 changes: 107 additions & 25 deletions src/multiego/ensemble.py
@@ -493,7 +493,7 @@ def init_meGO_ensemble(args):
return ensemble

reference_set = set(ensemble["topology_dataframe"]["name"].to_list())
unique_ref_molecule_names = topology_dataframe["molecule_name"].unique()
# unique_ref_molecule_names = topology_dataframe["molecule_name"].unique()

# now we process the train contact matrices
train_contact_matrices = {}
@@ -521,7 +521,7 @@ def init_meGO_ensemble(args):
_,
) = initialize_topology(topology, custom_dict, args)
# check that the molecules defined have a reference
unique_temp_molecule_names = temp_topology_dataframe["molecule_name"].unique()
# unique_temp_molecule_names = temp_topology_dataframe["molecule_name"].unique()
# check_molecule_names(unique_ref_molecule_names, unique_temp_molecule_names)

train_topology_dataframe = pd.concat(
@@ -596,7 +596,7 @@ def init_meGO_ensemble(args):
_,
) = initialize_topology(topology, custom_dict, args)
# check that the molecules defined have a reference
unique_temp_molecule_names = temp_topology_dataframe["molecule_name"].unique()
# unique_temp_molecule_names = temp_topology_dataframe["molecule_name"].unique()
# check_molecule_names(unique_ref_molecule_names, unique_temp_molecule_names)
check_topology_dataframe = pd.concat(
[check_topology_dataframe, temp_topology_dataframe],
@@ -762,6 +762,7 @@ def generate_14_data(meGO_ensemble):
pairs["aj"] = pairs["aj"].map(type_atnum_dict)
pairs["rep"] = pairs["c12"]
pairs["same_chain"] = True
pairs["1-4"] = "1_4"
else:
pairs["ai"] = meGO_ensemble["user_pairs"][molecule].ai.astype(str)
pairs["aj"] = meGO_ensemble["user_pairs"][molecule].aj.astype(str)
@@ -979,6 +980,10 @@ def init_LJ_datasets(meGO_ensemble, pairs14, exclusion_bonds14, args):
return train_dataset, check_dataset


def get_residue_number(s):
return int(s.split("_")[-1])


def generate_basic_LJ(meGO_ensemble, args):
"""
Generates basic LJ (Lennard-Jones) interactions DataFrame within a molecular ensemble.
@@ -1021,6 +1026,7 @@
"cutoff",
"rep",
"att",
"bare",
]

basic_LJ = pd.DataFrame()
@@ -1059,15 +1065,37 @@
c6_list = ai_name.map(name_to_c6).to_numpy()
ai_name = ai_name.to_numpy(dtype=str)
oxygen_mask = masking.create_array_mask(ai_name, ai_name, [("O", "OM"), ("O", "O"), ("OM", "OM")], symmetrize=True)
bb_mask = masking.create_array_mask(
ca_mask = masking.create_array_mask(
ai_name,
ai_name,
[
("CH1", "CH1"),
("CH1", "C"),
("CH1", "N"),
("CH1", "S"),
("CH1", "CH"),
("CH1", "CH2"),
("CH1", "CH3"),
("CH1", "CH2r"),
("CH1", "CH"),
("CH1", "CH3"),
("CH1", "OA"),
("NL", "CH1"),
("NZ", "CH1"),
("NE", "CH1"),
("NT", "CH1"),
("NR", "CH1"),
("N", "N"),
("N", "NT"),
("NT", "NT"),
("C", "C"),
("S", "S"),
],
symmetrize=True,
)
bb_mask = masking.create_array_mask(
ai_name,
ai_name,
[
("CH1", "CH1"),
("CH1", "C"),
("CH1", "N"),
("C", "C"),
@@ -1138,36 +1166,74 @@
],
symmetrize=True,
)
hydrophobic_w_mask = masking.create_array_mask(
ai_name,
ai_name,
[
("CH1", "CH2"),
("CH1", "CH3"),
("CH1", "CH2r"),
("CH1", "CH"),
],
symmetrize=True,
)
basic_LJ["type"] = 1
basic_LJ["source"] = "basic"
basic_LJ["same_chain"] = True
basic_LJ.c6 = 0.0
basic_LJ.c12 = np.sqrt(bare_c12_list * bare_c12_list[:, np.newaxis]).flatten()
basic_LJ.bare = np.sqrt(bare_c12_list * bare_c12_list[:, np.newaxis]).flatten()
basic_LJ.rep = np.sqrt(c12_list * c12_list[:, np.newaxis]).flatten()
basic_LJ.att = np.sqrt(c6_list * c6_list[:, np.newaxis]).flatten()
oxygen_LJ = basic_LJ[oxygen_mask].copy()
oxygen_LJ["c12"] *= 11.4
oxygen_LJ["c6"] = 0.0
hydrophobic_LJ = basic_LJ[hydrophobic_mask].copy()
hydrophobic_LJ["c12"] = 0.20 * hydrophobic_LJ["rep"]
hydrophobic_LJ["c6"] = 0.20 * hydrophobic_LJ["att"]
bb_LJ = basic_LJ[bb_mask].copy()
bb_LJ["c12"] = 0.25 * bb_LJ["rep"]
bb_LJ["c6"] = 0.25 * bb_LJ["att"]
hbond_LJ = basic_LJ[hbond_mask].copy()
hbond_LJ["c12"] = 0.25 * hbond_LJ["rep"]
hbond_LJ["c6"] = 0.25 * hbond_LJ["att"]
catpi_LJ = basic_LJ[catpi_mask].copy()
catpi_LJ["c12"] = 0.20 * catpi_LJ["rep"]
catpi_LJ["c6"] = 0.20 * catpi_LJ["att"]
basic_LJ = pd.concat([oxygen_LJ, hbond_LJ, hydrophobic_LJ, catpi_LJ, bb_LJ])
# oxygen_LJ = basic_LJ[oxygen_mask].copy()
# oxygen_LJ["c12"] *= 11.4
# oxygen_LJ["c6"] = 0.0
# hydrophobic_LJ = basic_LJ[hydrophobic_mask].copy()
# hydrophobic_LJ["c12"] = 0.25 * hydrophobic_LJ["rep"]
# hydrophobic_LJ["c6"] = 0.25 * hydrophobic_LJ["att"]
# hydrophobic_w_LJ = basic_LJ[hydrophobic_w_mask].copy()
# hydrophobic_w_LJ["c12"] = 0.12 * hydrophobic_w_LJ["rep"]
# hydrophobic_w_LJ["c6"] = 0.12 * hydrophobic_w_LJ["att"]
# bb_LJ = basic_LJ[bb_mask].copy()
# bb_LJ["c12"] = 0.275 * bb_LJ["rep"]
# bb_LJ["c6"] = 0.275 * bb_LJ["att"]
# hbond_LJ = basic_LJ[hbond_mask].copy()
# hbond_LJ["c12"] = 0.275 * hbond_LJ["rep"]
# hbond_LJ["c6"] = 0.275 * hbond_LJ["att"]
# catpi_LJ = basic_LJ[catpi_mask].copy()
# catpi_LJ["c12"] = 0.20 * catpi_LJ["rep"]
# catpi_LJ["c6"] = 0.20 * catpi_LJ["att"]
# basic_LJ = pd.concat([oxygen_LJ, hbond_LJ, hydrophobic_LJ, hydrophobic_w_LJ, catpi_LJ, bb_LJ])
basic_LJ["intra_domain"] = True
basic_LJ["rep"] = basic_LJ["c12"]
basic_LJ = basic_LJ.loc[(basic_LJ["c6"] == 0.0) | ((basic_LJ["c6"] ** 2 / (4.0 * basic_LJ["c12"])) > args.epsilon_min)]
basic_LJ["residue_ai"] = basic_LJ["ai"].apply(get_residue_number)
basic_LJ["residue_aj"] = basic_LJ["aj"].apply(get_residue_number)
basic_LJ.loc[oxygen_mask, "c12"] *= 11.4
basic_LJ.loc[oxygen_mask, "c6"] = 0.0
basic_LJ.loc[~oxygen_mask, "c12"] = 0.195 * basic_LJ["rep"]
basic_LJ.loc[~oxygen_mask, "c6"] = 0.195 * basic_LJ["att"]
basic_LJ_14 = basic_LJ.copy()
basic_LJ_14 = basic_LJ_14[(~oxygen_mask & ~ca_mask)]
basic_LJ_14 = basic_LJ_14.loc[
(basic_LJ["same_chain"]) & ((basic_LJ["residue_ai"] - basic_LJ["residue_aj"]).abs() <= 2)
]
basic_LJ = basic_LJ[~ca_mask]
basic_LJ["rep"] = basic_LJ["bare"]
basic_LJ_14["c12"] = basic_LJ_14["bare"]
basic_LJ_14["c6"] = 0.0
basic_LJ_14["rep"] = basic_LJ_14["c12"]
# basic_LJ = basic_LJ.loc[(basic_LJ["c6"] == 0.0) | ((basic_LJ["c6"] ** 2 / (4.0 * basic_LJ["c12"])) > args.epsilon_min)]
basic_LJ["index_ai"], basic_LJ["index_aj"] = basic_LJ[["index_ai", "index_aj"]].min(axis=1), basic_LJ[
["index_ai", "index_aj"]
].max(axis=1)
basic_LJ = basic_LJ.drop_duplicates(subset=["index_ai", "index_aj", "same_chain"], keep="first")
basic_LJ.sort_values(by=["index_ai", "index_aj"], inplace=True)
basic_LJ = basic_LJ.drop(["index_ai", "index_aj"], axis=1)
basic_LJ_14["index_ai"], basic_LJ_14["index_aj"] = basic_LJ_14[["index_ai", "index_aj"]].min(axis=1), basic_LJ_14[
["index_ai", "index_aj"]
].max(axis=1)
basic_LJ_14 = basic_LJ_14.drop_duplicates(subset=["index_ai", "index_aj", "same_chain"], keep="first")
basic_LJ_14.sort_values(by=["index_ai", "index_aj"], inplace=True)
basic_LJ_14 = basic_LJ_14.drop(["index_ai", "index_aj"], axis=1)

for name in meGO_ensemble["reference_matrices"].keys():
temp_basic_LJ = pd.DataFrame(columns=columns)
@@ -1218,8 +1284,24 @@ def generate_basic_LJ(meGO_ensemble, args)
basic_LJ.sort_values(by=["ai", "aj", "same_chain"], ascending=[True, True, True], inplace=True)
# Cleaning the duplicates
basic_LJ = basic_LJ.drop_duplicates(subset=["ai", "aj"], keep="first")
basic_LJ_14["probability"] = 1.0
basic_LJ_14["rc_probability"] = 1.0
basic_LJ_14["rc_threshold"] = 1.0
basic_LJ_14["md_threshold"] = 1.0
basic_LJ_14["epsilon"] = -basic_LJ_14["c12"]
basic_LJ_14.loc[basic_LJ_14["c6"] > 0, "epsilon"] = basic_LJ_14["c6"] ** 2 / (4.0 * basic_LJ_14["c12"])
basic_LJ_14["cutoff"] = 1.45 * basic_LJ_14["c12"] ** (1.0 / 12.0)
basic_LJ_14["sigma"] = basic_LJ_14["cutoff"] / (2.0 ** (1.0 / 6.0))
basic_LJ_14.loc[basic_LJ_14["c6"] > 0, "sigma"] = (basic_LJ_14["c12"] / basic_LJ_14["c6"]) ** (1 / 6)
basic_LJ_14["distance"] = basic_LJ_14["cutoff"]
basic_LJ_14["learned"] = 0
basic_LJ_14["1-4"] = "1>4"
# Sorting the pairs prioritising intermolecular interactions
basic_LJ_14.sort_values(by=["ai", "aj", "same_chain"], ascending=[True, True, True], inplace=True)
# Cleaning the duplicates
basic_LJ_14 = basic_LJ_14.drop_duplicates(subset=["ai", "aj"], keep="first")

return basic_LJ
return basic_LJ, basic_LJ_14


def set_epsilon(meGO_LJ):
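
The new basic_LJ_14 table derives its pair parameters from the combined c6/c12 values: per-pair c6 and c12 are geometric means of the per-atom-type values, purely repulsive pairs carry epsilon = -c12, and attractive pairs use epsilon = c6**2 / (4 * c12) with sigma = (c12 / c6)**(1/6), while the cutoff is 1.45 * c12**(1/12). A short self-contained check of these relations, using illustrative numbers rather than actual force-field parameters:

import numpy as np

# Illustrative per-atom-type parameters (assuming GROMACS-style c6/c12 units)
c6_i, c12_i = 2.0e-3, 1.0e-6
c6_j, c12_j = 1.0e-3, 4.0e-6

# Geometric-mean combination, matching the np.sqrt(x * x[:, np.newaxis]) pattern above
c6 = np.sqrt(c6_i * c6_j)
c12 = np.sqrt(c12_i * c12_j)

# Lennard-Jones well depth, size and cutoff from c6/c12 (meaningful only when c6 > 0)
epsilon = c6**2 / (4.0 * c12)
sigma = (c12 / c6) ** (1.0 / 6.0)
cutoff = 1.45 * c12 ** (1.0 / 12.0)

print(f"c6={c6:.3e}  c12={c12:.3e}  epsilon={epsilon:.3f}  sigma={sigma:.3f} nm  cutoff={cutoff:.3f} nm")
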
61 changes: 56 additions & 5 deletions tools/box_concentration/get_box.py
@@ -1,13 +1,64 @@
import argparse
import numpy as np

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="TODO!")
parser.add_argument("--n_mol", type=int, required=True, help="Number of molecules to simulate.")
parser.add_argument("--conc", type=float, required=True, help="Concentration in Molar")
parser = argparse.ArgumentParser(
description="Calculate number of moles, concentration, box size or volume based on provided parameters."
)
parser.add_argument(
"--n_mol",
type=int,
required=False,
help="Number of molecules, to be combined with --conc or one from (--volume, --sphere_r, --cubic_side)",
)
parser.add_argument(
"--conc",
type=float,
required=False,
help="Concentration in Molar, to be combined with --n_mol or one from (--volume, --sphere_r, --cubic_side)",
)
parser.add_argument("--volume", type=float, required=False, help="Volume in nm^3, to be combined with --n_mol or --conc")
parser.add_argument("--sphere_r", type=float, required=False, help="Radius of a sphere in nm")
parser.add_argument("--cubic_side", type=float, required=False, help="Side of a cube in nm")

args = parser.parse_args()

# Print help if no arguments are provided
if not any(vars(args).values()):
parser.print_help()
exit(0)

n_mol = args.n_mol
conc = args.conc
box = 10**8 * (n_mol / (conc * 6.022 * 10**23)) ** (1 / 3)
print("cubic box of side: %.5f nm" % box)
volume = args.volume
sphere_r = args.sphere_r
cubic_side = args.cubic_side

if n_mol is not None and n_mol > 0 and conc is not None and conc > 0:
box = 10**8 * (n_mol / (conc * 6.022 * 10**23)) ** (1 / 3)
print("cubic box of side: %.5f nm" % box)
exit()

if sphere_r is not None and sphere_r > 0:
volume = (4 / 3) * np.pi * sphere_r**3

if cubic_side is not None and cubic_side > 0:
volume = cubic_side**3

if n_mol is not None and n_mol > 0 and volume is not None and volume > 0:
conc = 10**24 * (n_mol / (volume * 6.022 * 10**23))
print("concentration is: %.12f M" % conc)
exit()

if conc is not None and conc > 0 and volume is not None and volume > 0:
n_mol = int(round((volume * conc * 6.022 * 10**23) / 10**24))
print("n_mol is: %i" % n_mol)
exit()

if sphere_r is not None and sphere_r > 0:
volume = (4 / 3) * np.pi * sphere_r**3
side = volume ** (1 / 3)
print("volume is %.6f nm^3 and cubic side would be %.6f nm" % (volume, side))
exit()

parser.print_help()
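
The rewritten get_box.py accepts any sufficient combination of inputs: --n_mol with --conc prints the cubic box side, --n_mol with a volume (given directly or via --sphere_r or --cubic_side) prints the concentration, --conc with a volume prints the number of molecules, and --sphere_r alone prints the sphere volume and the equivalent cubic side. The unit conversions it relies on are sketched below with illustrative inputs (1 L = 10^24 nm^3, N_A = 6.022e23):

import numpy as np

N_A = 6.022e23        # Avogadro's number
NM3_PER_L = 1.0e24    # 1 litre = 10^24 nm^3

# Illustrative inputs: 2 molecules at 5 mM
n_mol, conc = 2, 0.005

# Cubic box side in nm; equivalent to the script's 10**8 * (n_mol / (conc * 6.022 * 10**23)) ** (1 / 3)
side = (NM3_PER_L * n_mol / (conc * N_A)) ** (1 / 3)
print("cubic box of side: %.5f nm" % side)

# Round trip: concentration in M from n_mol and a box volume in nm^3
volume = side**3
print("concentration is: %.12f M" % (NM3_PER_L * n_mol / (volume * N_A)))

# Sphere of equal volume, inverting volume = (4/3) * pi * r^3
radius = (3.0 * volume / (4.0 * np.pi)) ** (1 / 3)
print("equivalent sphere radius: %.5f nm" % radius)
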
2 changes: 1 addition & 1 deletion tools/cmdata/main.cpp
@@ -40,7 +40,7 @@ int main(int argc, const char** argv)
{"mol_cutoff", '\0', POPT_ARG_DOUBLE | POPT_ARGFLAG_OPTIONAL, &mol_cutoff, 0, "Molecule cutoff distance", "DOUBLE"},
{"nskip", '\0', POPT_ARG_INT | POPT_ARGFLAG_OPTIONAL, &nskip, 0, "Number of frames to skip", "INT"},
{"num_threads", '\0', POPT_ARG_INT | POPT_ARGFLAG_OPTIONAL, &num_threads, 0, "Number of threads", "INT"},
{"num_threads", '\0', POPT_ARG_INT | POPT_ARGFLAG_OPTIONAL, &mol_threads, 0, "Number of molecule threads", "INT"},
{"mol_threads", '\0', POPT_ARG_INT | POPT_ARGFLAG_OPTIONAL, &mol_threads, 0, "Number of molecule threads", "INT"},
{"mode", '\0', POPT_ARG_STRING | POPT_ARGFLAG_OPTIONAL, &p_mode, 0, "Mode of operation", "STRING"},
{"weights", '\0', POPT_ARG_STRING | POPT_ARGFLAG_OPTIONAL, &p_weights_path, 0, "Weights file", "FILE"},
{"no_pbc", '\0', POPT_ARG_NONE | POPT_ARGFLAG_OPTIONAL, &p_nopbc, 0, "Ignore pbcs", 0},
