From 7786126295ff7b0a37854131a78a93a72266c626 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 11 Jun 2024 09:40:35 -0400
Subject: [PATCH] breaking: use all sets for training and test (#3862)

Fix #3860.

Remove `train_dirs` and `test_dir` in `DeepmdData`.

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

- **New Features**
- All data sets are now used for both training and testing by default,
simplifying the training process.

- **Bug Fixes**
- Improved logic for handling training directories and test set merging.

- **Tests**
  - Added new test cases for the updated data handling methods.
- Updated existing tests to reflect changes in data set handling and
batch sizes.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 deepmd/utils/data.py                    |  61 +++++------
 source/tests/tf/test_deepmd_data.py     |  33 ++++--
 source/tests/tf/test_deepmd_data_sys.py | 132 +++++++++++++++++++++---
 source/tests/tf/test_gen_stat_data.py   |   4 +-
 4 files changed, 167 insertions(+), 63 deletions(-)

diff --git a/deepmd/utils/data.py b/deepmd/utils/data.py
index 91782d898f..86681ddb07 100644
--- a/deepmd/utils/data.py
+++ b/deepmd/utils/data.py
@@ -42,7 +42,7 @@ class DeepmdData:
     modifier
             Data modifier that has the method `modify_data`
     trn_all_set
-            Use all sets as training dataset. Otherwise, if the number of sets is more than 1, the last set is left for test.
+            [DEPRECATED] All sets are now used for both training and testing.
     sort_atoms : bool
             Sort atoms by atom types. Required to enable when the data is directly feeded to
             descriptors except mixed types.
@@ -109,15 +109,6 @@ def __init__(
         # make idx map
         self.sort_atoms = sort_atoms
         self.idx_map = self._make_idx_map(self.atom_type)
-        # train dirs
-        self.test_dir = self.dirs[-1]
-        if trn_all_set:
-            self.train_dirs = self.dirs
-        else:
-            if len(self.dirs) == 1:
-                self.train_dirs = self.dirs
-            else:
-                self.train_dirs = self.dirs[:-1]
         self.data_dict = {}
         # add box and coord
         self.add("box", 9, must=self.pbc)
@@ -225,7 +216,7 @@ def get_data_dict(self) -> dict:
 
     def check_batch_size(self, batch_size):
         """Check if the system can get a batch of data with `batch_size` frames."""
-        for ii in self.train_dirs:
+        for ii in self.dirs:
             if self.data_dict["coord"]["high_prec"]:
                 tmpe = (
                     (ii / "coord.npy").load_numpy().astype(GLOBAL_ENER_FLOAT_PRECISION)
@@ -240,24 +231,7 @@ def check_batch_size(self, batch_size):
 
     def check_test_size(self, test_size):
         """Check if the system can get a test dataset with `test_size` frames."""
-        if self.data_dict["coord"]["high_prec"]:
-            tmpe = (
-                (self.test_dir / "coord.npy")
-                .load_numpy()
-                .astype(GLOBAL_ENER_FLOAT_PRECISION)
-            )
-        else:
-            tmpe = (
-                (self.test_dir / "coord.npy")
-                .load_numpy()
-                .astype(GLOBAL_NP_FLOAT_PRECISION)
-            )
-        if tmpe.ndim == 1:
-            tmpe = tmpe.reshape([1, -1])
-        if tmpe.shape[0] < test_size:
-            return self.test_dir, tmpe.shape[0]
-        else:
-            return None
+        return self.check_batch_size(test_size)
 
     def get_item_torch(self, index: int) -> dict:
         """Get a single frame data . The frame is picked from the data system by index. The index is coded across all the sets.
@@ -287,7 +261,7 @@ def get_batch(self, batch_size: int) -> dict:
         else:
             set_size = 0
         if self.iterator + batch_size > set_size:
-            self._load_batch_set(self.train_dirs[self.set_count % self.get_numb_set()])
+            self._load_batch_set(self.dirs[self.set_count % self.get_numb_set()])
             self.set_count += 1
             set_size = self.batch_set["coord"].shape[0]
         iterator_1 = self.iterator + batch_size
@@ -307,7 +281,7 @@ def get_test(self, ntests: int = -1) -> dict:
             Size of the test data set. If `ntests` is -1, all test data will be get.
         """
         if not hasattr(self, "test_set"):
-            self._load_test_set(self.test_dir, self.shuffle_test)
+            self._load_test_set(self.shuffle_test)
         if ntests == -1:
             idx = None
         else:
@@ -340,11 +314,11 @@ def get_atom_type(self) -> List[int]:
 
     def get_numb_set(self) -> int:
         """Get number of training sets."""
-        return len(self.train_dirs)
+        return len(self.dirs)
 
     def get_numb_batch(self, batch_size: int, set_idx: int) -> int:
         """Get the number of batches in a set."""
-        data = self._load_set(self.train_dirs[set_idx])
+        data = self._load_set(self.dirs[set_idx])
         ret = data["coord"].shape[0] // batch_size
         if ret == 0:
             ret = 1
@@ -353,7 +327,7 @@ def get_numb_batch(self, batch_size: int, set_idx: int) -> int:
     def get_sys_numb_batch(self, batch_size: int) -> int:
         """Get the number of batches in the data system."""
         ret = 0
-        for ii in range(len(self.train_dirs)):
+        for ii in range(len(self.dirs)):
             ret += self.get_numb_batch(batch_size, ii)
         return ret
 
@@ -388,7 +362,7 @@ def avg(self, key):
         info = self.data_dict[key]
         ndof = info["ndof"]
         eners = []
-        for ii in self.train_dirs:
+        for ii in self.dirs:
             data = self._load_set(ii)
             ei = data[key].reshape([-1, ndof])
             eners.append(ei)
@@ -441,8 +415,21 @@ def _load_batch_set(self, set_name: DPPath):
     def reset_get_batch(self):
         self.iterator = 0
 
-    def _load_test_set(self, set_name: DPPath, shuffle_test):
-        self.test_set = self._load_set(set_name)
+    def _load_test_set(self, shuffle_test: bool):
+        test_sets = []
+        for ii in self.dirs:
+            test_set = self._load_set(ii)
+            test_sets.append(test_set)
+        # merge test sets
+        self.test_set = {}
+        assert len(test_sets) > 0
+        for kk in test_sets[0]:
+            if "find_" in kk:
+                self.test_set[kk] = test_sets[0][kk]
+            else:
+                self.test_set[kk] = np.concatenate(
+                    [test_set[kk] for test_set in test_sets], axis=0
+                )
         if shuffle_test:
             self.test_set, _ = self._shuffle_data(self.test_set)
 
diff --git a/source/tests/tf/test_deepmd_data.py b/source/tests/tf/test_deepmd_data.py
index 40bceb2d79..0969a9baf1 100644
--- a/source/tests/tf/test_deepmd_data.py
+++ b/source/tests/tf/test_deepmd_data.py
@@ -143,6 +143,9 @@ def setUp(self):
         path = os.path.join(self.data_name, "set.bar", "test_frame.npy")
         self.test_frame_bar = rng.random([self.nframes, 5])
         np.save(path, self.test_frame_bar)
+        path = os.path.join(self.data_name, "set.tar", "test_frame.npy")
+        self.test_frame_tar = rng.random([2, 5])
+        np.save(path, self.test_frame_tar)
         # t n
         self.test_null = np.zeros([self.nframes, 2 * self.natoms])
         # tensor shape
@@ -162,8 +165,9 @@ def test_init(self):
         self.assertEqual(dd.idx_map[0], 1)
         self.assertEqual(dd.idx_map[1], 0)
         self.assertEqual(dd.type_map, ["foo", "bar"])
-        self.assertEqual(dd.test_dir, "test_data/set.tar")
-        self.assertEqual(dd.train_dirs, ["test_data/set.bar", "test_data/set.foo"])
+        self.assertEqual(
+            dd.dirs, ["test_data/set.bar", "test_data/set.foo", "test_data/set.tar"]
+        )
 
     def test_init_type_map(self):
         dd = DeepmdData(self.data_name, type_map=["bar", "foo", "tar"])
@@ -182,7 +186,7 @@ def test_load_set(self):
         )
         data = dd._load_set(os.path.join(self.data_name, "set.foo"))
         nframes = data["coord"].shape[0]
-        self.assertEqual(dd.get_numb_set(), 2)
+        self.assertEqual(dd.get_numb_set(), 3)
         self.assertEqual(dd.get_type_map(), ["foo", "bar"])
         self.assertEqual(dd.get_natoms(), 2)
         self.assertEqual(list(dd.get_natoms_vec(3)), [2, 2, 1, 1, 0])
@@ -257,7 +261,10 @@ def test_avg(self):
         dd = DeepmdData(self.data_name).add("test_frame", 5, atomic=False, must=True)
         favg = dd.avg("test_frame")
         fcmp = np.average(
-            np.concatenate((self.test_frame, self.test_frame_bar), axis=0), axis=0
+            np.concatenate(
+                (self.test_frame, self.test_frame_bar, self.test_frame_tar), axis=0
+            ),
+            axis=0,
         )
         np.testing.assert_almost_equal(favg, fcmp, places)
 
@@ -266,13 +273,17 @@ def test_check_batch_size(self):
         ret = dd.check_batch_size(10)
         self.assertEqual(ret, (os.path.join(self.data_name, "set.bar"), 5))
         ret = dd.check_batch_size(5)
+        self.assertEqual(ret, (os.path.join(self.data_name, "set.tar"), 2))
+        ret = dd.check_batch_size(1)
         self.assertEqual(ret, None)
 
     def test_check_test_size(self):
         dd = DeepmdData(self.data_name)
         ret = dd.check_test_size(10)
+        self.assertEqual(ret, (os.path.join(self.data_name, "set.bar"), 5))
+        ret = dd.check_test_size(5)
         self.assertEqual(ret, (os.path.join(self.data_name, "set.tar"), 2))
-        ret = dd.check_test_size(2)
+        ret = dd.check_test_size(1)
         self.assertEqual(ret, None)
 
     def test_get_batch(self):
@@ -284,6 +295,10 @@ def test_get_batch(self):
         data = dd.get_batch(5)
         self._comp_np_mat2(np.sort(data["coord"], axis=0), np.sort(self.coord, axis=0))
         data = dd.get_batch(5)
+        self._comp_np_mat2(
+            np.sort(data["coord"], axis=0), np.sort(self.coord_tar, axis=0)
+        )
+        data = dd.get_batch(5)
         self._comp_np_mat2(
             np.sort(data["coord"], axis=0), np.sort(self.coord_bar, axis=0)
         )
@@ -293,8 +308,11 @@ def test_get_batch(self):
     def test_get_test(self):
         dd = DeepmdData(self.data_name)
         data = dd.get_test()
+        expected_coord = np.concatenate(
+            (self.coord_bar, self.coord, self.coord_tar), axis=0
+        )
         self._comp_np_mat2(
-            np.sort(data["coord"], axis=0), np.sort(self.coord_tar, axis=0)
+            np.sort(data["coord"], axis=0), np.sort(expected_coord, axis=0)
         )
 
     def test_get_nbatch(self):
@@ -368,8 +386,7 @@ def test_init(self):
         dd = DeepmdData(self.data_name)
         self.assertEqual(dd.idx_map[0], 0)
         self.assertEqual(dd.type_map, ["X"])
-        self.assertEqual(dd.test_dir, self.data_name + "#/set.000")
-        self.assertEqual(dd.train_dirs, [self.data_name + "#/set.000"])
+        self.assertEqual(dd.dirs[0], self.data_name + "#/set.000")
 
     def test_get_batch(self):
         dd = DeepmdData(self.data_name)
diff --git a/source/tests/tf/test_deepmd_data_sys.py b/source/tests/tf/test_deepmd_data_sys.py
index 710a6d0ac5..893177ac4f 100644
--- a/source/tests/tf/test_deepmd_data_sys.py
+++ b/source/tests/tf/test_deepmd_data_sys.py
@@ -68,7 +68,7 @@ def test_ntypes(self):
         ds.add("test", self.test_ndof, atomic=True, must=True)
         ds.add("null", self.test_ndof, atomic=True, must=False)
         self.assertEqual(ds.get_ntypes(), 3)
-        self.assertEqual(ds.get_nbatches(), [2, 4, 3, 2])
+        self.assertEqual(ds.get_nbatches(), [3, 6, 5, 4])
         self.assertEqual(ds.get_nsystems(), self.nsys)
         self.assertEqual(list(ds.get_batch_size()), [batch_size] * 4)
 
@@ -101,13 +101,27 @@ def test_get_test(self):
         data = ds.get_test(sys_idx=sys_idx)
         self.assertEqual(list(data["type"][0]), list(np.sort(self.atom_type[sys_idx])))
         self._in_array(
-            np.load("sys_0/set.002/coord.npy"),
+            np.concatenate(
+                [
+                    np.load("sys_0/set.000/coord.npy"),
+                    np.load("sys_0/set.001/coord.npy"),
+                    np.load("sys_0/set.002/coord.npy"),
+                ],
+                axis=0,
+            ),
             ds.get_sys(sys_idx).idx_map,
             3,
             data["coord"],
         )
         self._in_array(
-            np.load("sys_0/set.002/test.npy"),
+            np.concatenate(
+                [
+                    np.load("sys_0/set.000/test.npy"),
+                    np.load("sys_0/set.001/test.npy"),
+                    np.load("sys_0/set.002/test.npy"),
+                ],
+                axis=0,
+            ),
             ds.get_sys(sys_idx).idx_map,
             self.test_ndof,
             data["test"],
@@ -115,7 +129,10 @@ def test_get_test(self):
         self.assertAlmostEqual(
             np.linalg.norm(
                 np.zeros(
-                    [self.nframes[sys_idx] + 2, self.natoms[sys_idx] * self.test_ndof]
+                    [
+                        self.nframes[sys_idx] * self.nset + 0 + 1 + 2,
+                        self.natoms[sys_idx] * self.test_ndof,
+                    ]
                 )
                 - data["null"]
             ),
@@ -124,7 +141,10 @@ def test_get_test(self):
         self.assertAlmostEqual(
             np.linalg.norm(
                 np.ones(
-                    [self.nframes[sys_idx] + 2, self.natoms[sys_idx] * self.test_ndof]
+                    [
+                        self.nframes[sys_idx] * self.nset + 0 + 1 + 2,
+                        self.natoms[sys_idx] * self.test_ndof,
+                    ]
                 )
                 - data["ones"]
             ),
@@ -135,13 +155,27 @@ def test_get_test(self):
         data = ds.get_test(sys_idx=sys_idx)
         self.assertEqual(list(data["type"][0]), list(np.sort(self.atom_type[sys_idx])))
         self._in_array(
-            np.load("sys_2/set.002/coord.npy"),
+            np.concatenate(
+                [
+                    np.load("sys_2/set.000/coord.npy"),
+                    np.load("sys_2/set.001/coord.npy"),
+                    np.load("sys_2/set.002/coord.npy"),
+                ],
+                axis=0,
+            ),
             ds.get_sys(sys_idx).idx_map,
             3,
             data["coord"],
         )
         self._in_array(
-            np.load("sys_2/set.002/test.npy"),
+            np.concatenate(
+                [
+                    np.load("sys_2/set.000/test.npy"),
+                    np.load("sys_2/set.001/test.npy"),
+                    np.load("sys_2/set.002/test.npy"),
+                ],
+                axis=0,
+            ),
             ds.get_sys(sys_idx).idx_map,
             self.test_ndof,
             data["test"],
@@ -149,7 +183,10 @@ def test_get_test(self):
         self.assertAlmostEqual(
             np.linalg.norm(
                 np.zeros(
-                    [self.nframes[sys_idx] + 2, self.natoms[sys_idx] * self.test_ndof]
+                    [
+                        self.nframes[sys_idx] * self.nset + 0 + 1 + 2,
+                        self.natoms[sys_idx] * self.test_ndof,
+                    ]
                 )
                 - data["null"]
             ),
@@ -207,6 +244,27 @@ def test_get_batch(self):
         )
         data = ds.get_batch(sys_idx=sys_idx)
         self.assertEqual(list(data["type"][0]), list(np.sort(self.atom_type[sys_idx])))
+        self._in_array(
+            np.load("sys_0/set.002/coord.npy"),
+            ds.get_sys(sys_idx).idx_map,
+            3,
+            data["coord"],
+        )
+        self._in_array(
+            np.load("sys_0/set.002/test.npy"),
+            ds.get_sys(sys_idx).idx_map,
+            self.test_ndof,
+            data["test"],
+        )
+        self.assertAlmostEqual(
+            np.linalg.norm(
+                np.zeros([batch_size, self.natoms[sys_idx] * self.test_ndof])
+                - data["null"]
+            ),
+            0.0,
+        )
+        data = ds.get_batch(sys_idx=sys_idx)
+        self.assertEqual(list(data["type"][0]), list(np.sort(self.atom_type[sys_idx])))
         self._in_array(
             np.load("sys_0/set.000/coord.npy"),
             ds.get_sys(sys_idx).idx_map,
@@ -292,6 +350,48 @@ def test_get_batch(self):
         )
         data = ds.get_batch(sys_idx=sys_idx)
         self.assertEqual(list(data["type"][0]), list(np.sort(self.atom_type[sys_idx])))
+        self._in_array(
+            np.load("sys_2/set.002/coord.npy"),
+            ds.get_sys(sys_idx).idx_map,
+            3,
+            data["coord"],
+        )
+        self._in_array(
+            np.load("sys_2/set.002/test.npy"),
+            ds.get_sys(sys_idx).idx_map,
+            self.test_ndof,
+            data["test"],
+        )
+        self.assertAlmostEqual(
+            np.linalg.norm(
+                np.zeros([batch_size, self.natoms[sys_idx] * self.test_ndof])
+                - data["null"]
+            ),
+            0.0,
+        )
+        data = ds.get_batch(sys_idx=sys_idx)
+        self.assertEqual(list(data["type"][0]), list(np.sort(self.atom_type[sys_idx])))
+        self._in_array(
+            np.load("sys_2/set.002/coord.npy"),
+            ds.get_sys(sys_idx).idx_map,
+            3,
+            data["coord"],
+        )
+        self._in_array(
+            np.load("sys_2/set.002/test.npy"),
+            ds.get_sys(sys_idx).idx_map,
+            self.test_ndof,
+            data["test"],
+        )
+        self.assertAlmostEqual(
+            np.linalg.norm(
+                np.zeros([batch_size, self.natoms[sys_idx] * self.test_ndof])
+                - data["null"]
+            ),
+            0.0,
+        )
+        data = ds.get_batch(sys_idx=sys_idx)
+        self.assertEqual(list(data["type"][0]), list(np.sort(self.atom_type[sys_idx])))
         self._in_array(
             np.load("sys_2/set.000/coord.npy"),
             ds.get_sys(sys_idx).idx_map,
@@ -324,16 +424,16 @@ def test_prob_sys_size_1(self):
         self.assertAlmostEqual(np.sum(prob[2:4]), 0.8)
         # number of training set is self.nset-1
         # shift is the total number of set size shift...
-        shift = np.sum(np.arange(self.nset - 1))
+        shift = np.sum(np.arange(self.nset))
         self.assertAlmostEqual(
             prob[1] / prob[0],
-            float(self.nframes[1] * (self.nset - 1) + shift)
-            / float(self.nframes[0] * (self.nset - 1) + shift),
+            float(self.nframes[1] * (self.nset) + shift)
+            / float(self.nframes[0] * (self.nset) + shift),
         )
         self.assertAlmostEqual(
             prob[3] / prob[2],
-            float(self.nframes[3] * (self.nset - 1) + shift)
-            / float(self.nframes[2] * (self.nset - 1) + shift),
+            float(self.nframes[3] * (self.nset) + shift)
+            / float(self.nframes[2] * (self.nset) + shift),
         )
 
     def test_prob_sys_size_2(self):
@@ -348,13 +448,13 @@ def test_prob_sys_size_2(self):
         self.assertAlmostEqual(np.sum(prob[2:4]), 0.8)
         # number of training set is self.nset-1
         # shift is the total number of set size shift...
-        shift = np.sum(np.arange(self.nset - 1))
+        shift = np.sum(np.arange(self.nset))
         self.assertAlmostEqual(prob[0], 0.0)
         self.assertAlmostEqual(prob[1], 0.2)
         self.assertAlmostEqual(
             prob[3] / prob[2],
-            float(self.nframes[3] * (self.nset - 1) + shift)
-            / float(self.nframes[2] * (self.nset - 1) + shift),
+            float(self.nframes[3] * (self.nset) + shift)
+            / float(self.nframes[2] * (self.nset) + shift),
         )
 
     def _idx_map(self, target, idx_map, ndof):
diff --git a/source/tests/tf/test_gen_stat_data.py b/source/tests/tf/test_gen_stat_data.py
index c3f7e765f7..ebede15fbb 100644
--- a/source/tests/tf/test_gen_stat_data.py
+++ b/source/tests/tf/test_gen_stat_data.py
@@ -122,7 +122,7 @@ def test_ener_shift(self):
         data = DeepmdDataSystem(["system_0", "system_1"], 5, 10, 1.0)
         data.add("energy", 1, must=True)
         ener_shift0 = data.compute_energy_shift(rcond=1)
-        all_stat = make_stat_input(data, 4, merge_sys=False)
+        all_stat = make_stat_input(data, 6, merge_sys=False)
         descrpt = DescrptSeA(6.0, 5.8, [46, 92], neuron=[25, 50, 100], axis_neuron=16)
         fitting = EnerFitting(
             descrpt.get_ntypes(),
@@ -138,7 +138,7 @@ def test_ener_shift_assigned(self):
         ae0 = dp_random.random()
         data = DeepmdDataSystem(["system_0"], 5, 10, 1.0)
         data.add("energy", 1, must=True)
-        all_stat = make_stat_input(data, 4, merge_sys=False)
+        all_stat = make_stat_input(data, 6, merge_sys=False)
         descrpt = DescrptSeA(6.0, 5.8, [46, 92], neuron=[25, 50, 100], axis_neuron=16)
         fitting = EnerFitting(
             descrpt.get_ntypes(),