From 7786126295ff7b0a37854131a78a93a72266c626 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 11 Jun 2024 09:40:35 -0400 Subject: [PATCH] breaking: use all sets for training and test (#3862) Fix #3860. Remove `train_dirs` and `test_dir` in `DeepmdData`. ## Summary by CodeRabbit - **New Features** - All data sets are now trained and tested by default, simplifying the training process. - **Bug Fixes** - Improved logic for handling training directories and test set merging. - **Tests** - Added new test cases for the updated data handling methods. - Updated existing tests to reflect changes in data set handling and batch sizes. --------- Signed-off-by: Jinzhe Zeng --- deepmd/utils/data.py | 61 +++++------ source/tests/tf/test_deepmd_data.py | 33 ++++-- source/tests/tf/test_deepmd_data_sys.py | 132 +++++++++++++++++++++--- source/tests/tf/test_gen_stat_data.py | 4 +- 4 files changed, 167 insertions(+), 63 deletions(-) diff --git a/deepmd/utils/data.py b/deepmd/utils/data.py index 91782d898f..86681ddb07 100644 --- a/deepmd/utils/data.py +++ b/deepmd/utils/data.py @@ -42,7 +42,7 @@ class DeepmdData: modifier Data modifier that has the method `modify_data` trn_all_set - Use all sets as training dataset. Otherwise, if the number of sets is more than 1, the last set is left for test. + [DEPRECATED] Deprecated. Now all sets are trained and tested. sort_atoms : bool Sort atoms by atom types. Required to enable when the data is directly feeded to descriptors except mixed types. @@ -109,15 +109,6 @@ def __init__( # make idx map self.sort_atoms = sort_atoms self.idx_map = self._make_idx_map(self.atom_type) - # train dirs - self.test_dir = self.dirs[-1] - if trn_all_set: - self.train_dirs = self.dirs - else: - if len(self.dirs) == 1: - self.train_dirs = self.dirs - else: - self.train_dirs = self.dirs[:-1] self.data_dict = {} # add box and coord self.add("box", 9, must=self.pbc) @@ -225,7 +216,7 @@ def get_data_dict(self) -> dict: def check_batch_size(self, batch_size): """Check if the system can get a batch of data with `batch_size` frames.""" - for ii in self.train_dirs: + for ii in self.dirs: if self.data_dict["coord"]["high_prec"]: tmpe = ( (ii / "coord.npy").load_numpy().astype(GLOBAL_ENER_FLOAT_PRECISION) @@ -240,24 +231,7 @@ def check_batch_size(self, batch_size): def check_test_size(self, test_size): """Check if the system can get a test dataset with `test_size` frames.""" - if self.data_dict["coord"]["high_prec"]: - tmpe = ( - (self.test_dir / "coord.npy") - .load_numpy() - .astype(GLOBAL_ENER_FLOAT_PRECISION) - ) - else: - tmpe = ( - (self.test_dir / "coord.npy") - .load_numpy() - .astype(GLOBAL_NP_FLOAT_PRECISION) - ) - if tmpe.ndim == 1: - tmpe = tmpe.reshape([1, -1]) - if tmpe.shape[0] < test_size: - return self.test_dir, tmpe.shape[0] - else: - return None + return self.check_batch_size(test_size) def get_item_torch(self, index: int) -> dict: """Get a single frame data . The frame is picked from the data system by index. The index is coded across all the sets. @@ -287,7 +261,7 @@ def get_batch(self, batch_size: int) -> dict: else: set_size = 0 if self.iterator + batch_size > set_size: - self._load_batch_set(self.train_dirs[self.set_count % self.get_numb_set()]) + self._load_batch_set(self.dirs[self.set_count % self.get_numb_set()]) self.set_count += 1 set_size = self.batch_set["coord"].shape[0] iterator_1 = self.iterator + batch_size @@ -307,7 +281,7 @@ def get_test(self, ntests: int = -1) -> dict: Size of the test data set. If `ntests` is -1, all test data will be get. """ if not hasattr(self, "test_set"): - self._load_test_set(self.test_dir, self.shuffle_test) + self._load_test_set(self.shuffle_test) if ntests == -1: idx = None else: @@ -340,11 +314,11 @@ def get_atom_type(self) -> List[int]: def get_numb_set(self) -> int: """Get number of training sets.""" - return len(self.train_dirs) + return len(self.dirs) def get_numb_batch(self, batch_size: int, set_idx: int) -> int: """Get the number of batches in a set.""" - data = self._load_set(self.train_dirs[set_idx]) + data = self._load_set(self.dirs[set_idx]) ret = data["coord"].shape[0] // batch_size if ret == 0: ret = 1 @@ -353,7 +327,7 @@ def get_numb_batch(self, batch_size: int, set_idx: int) -> int: def get_sys_numb_batch(self, batch_size: int) -> int: """Get the number of batches in the data system.""" ret = 0 - for ii in range(len(self.train_dirs)): + for ii in range(len(self.dirs)): ret += self.get_numb_batch(batch_size, ii) return ret @@ -388,7 +362,7 @@ def avg(self, key): info = self.data_dict[key] ndof = info["ndof"] eners = [] - for ii in self.train_dirs: + for ii in self.dirs: data = self._load_set(ii) ei = data[key].reshape([-1, ndof]) eners.append(ei) @@ -441,8 +415,21 @@ def _load_batch_set(self, set_name: DPPath): def reset_get_batch(self): self.iterator = 0 - def _load_test_set(self, set_name: DPPath, shuffle_test): - self.test_set = self._load_set(set_name) + def _load_test_set(self, shuffle_test: bool): + test_sets = [] + for ii in self.dirs: + test_set = self._load_set(ii) + test_sets.append(test_set) + # merge test sets + self.test_set = {} + assert len(test_sets) > 0 + for kk in test_sets[0]: + if "find_" in kk: + self.test_set[kk] = test_sets[0][kk] + else: + self.test_set[kk] = np.concatenate( + [test_set[kk] for test_set in test_sets], axis=0 + ) if shuffle_test: self.test_set, _ = self._shuffle_data(self.test_set) diff --git a/source/tests/tf/test_deepmd_data.py b/source/tests/tf/test_deepmd_data.py index 40bceb2d79..0969a9baf1 100644 --- a/source/tests/tf/test_deepmd_data.py +++ b/source/tests/tf/test_deepmd_data.py @@ -143,6 +143,9 @@ def setUp(self): path = os.path.join(self.data_name, "set.bar", "test_frame.npy") self.test_frame_bar = rng.random([self.nframes, 5]) np.save(path, self.test_frame_bar) + path = os.path.join(self.data_name, "set.tar", "test_frame.npy") + self.test_frame_tar = rng.random([2, 5]) + np.save(path, self.test_frame_tar) # t n self.test_null = np.zeros([self.nframes, 2 * self.natoms]) # tensor shape @@ -162,8 +165,9 @@ def test_init(self): self.assertEqual(dd.idx_map[0], 1) self.assertEqual(dd.idx_map[1], 0) self.assertEqual(dd.type_map, ["foo", "bar"]) - self.assertEqual(dd.test_dir, "test_data/set.tar") - self.assertEqual(dd.train_dirs, ["test_data/set.bar", "test_data/set.foo"]) + self.assertEqual( + dd.dirs, ["test_data/set.bar", "test_data/set.foo", "test_data/set.tar"] + ) def test_init_type_map(self): dd = DeepmdData(self.data_name, type_map=["bar", "foo", "tar"]) @@ -182,7 +186,7 @@ def test_load_set(self): ) data = dd._load_set(os.path.join(self.data_name, "set.foo")) nframes = data["coord"].shape[0] - self.assertEqual(dd.get_numb_set(), 2) + self.assertEqual(dd.get_numb_set(), 3) self.assertEqual(dd.get_type_map(), ["foo", "bar"]) self.assertEqual(dd.get_natoms(), 2) self.assertEqual(list(dd.get_natoms_vec(3)), [2, 2, 1, 1, 0]) @@ -257,7 +261,10 @@ def test_avg(self): dd = DeepmdData(self.data_name).add("test_frame", 5, atomic=False, must=True) favg = dd.avg("test_frame") fcmp = np.average( - np.concatenate((self.test_frame, self.test_frame_bar), axis=0), axis=0 + np.concatenate( + (self.test_frame, self.test_frame_bar, self.test_frame_tar), axis=0 + ), + axis=0, ) np.testing.assert_almost_equal(favg, fcmp, places) @@ -266,13 +273,17 @@ def test_check_batch_size(self): ret = dd.check_batch_size(10) self.assertEqual(ret, (os.path.join(self.data_name, "set.bar"), 5)) ret = dd.check_batch_size(5) + self.assertEqual(ret, (os.path.join(self.data_name, "set.tar"), 2)) + ret = dd.check_batch_size(1) self.assertEqual(ret, None) def test_check_test_size(self): dd = DeepmdData(self.data_name) ret = dd.check_test_size(10) + self.assertEqual(ret, (os.path.join(self.data_name, "set.bar"), 5)) + ret = dd.check_test_size(5) self.assertEqual(ret, (os.path.join(self.data_name, "set.tar"), 2)) - ret = dd.check_test_size(2) + ret = dd.check_test_size(1) self.assertEqual(ret, None) def test_get_batch(self): @@ -284,6 +295,10 @@ def test_get_batch(self): data = dd.get_batch(5) self._comp_np_mat2(np.sort(data["coord"], axis=0), np.sort(self.coord, axis=0)) data = dd.get_batch(5) + self._comp_np_mat2( + np.sort(data["coord"], axis=0), np.sort(self.coord_tar, axis=0) + ) + data = dd.get_batch(5) self._comp_np_mat2( np.sort(data["coord"], axis=0), np.sort(self.coord_bar, axis=0) ) @@ -293,8 +308,11 @@ def test_get_batch(self): def test_get_test(self): dd = DeepmdData(self.data_name) data = dd.get_test() + expected_coord = np.concatenate( + (self.coord_bar, self.coord, self.coord_tar), axis=0 + ) self._comp_np_mat2( - np.sort(data["coord"], axis=0), np.sort(self.coord_tar, axis=0) + np.sort(data["coord"], axis=0), np.sort(expected_coord, axis=0) ) def test_get_nbatch(self): @@ -368,8 +386,7 @@ def test_init(self): dd = DeepmdData(self.data_name) self.assertEqual(dd.idx_map[0], 0) self.assertEqual(dd.type_map, ["X"]) - self.assertEqual(dd.test_dir, self.data_name + "#/set.000") - self.assertEqual(dd.train_dirs, [self.data_name + "#/set.000"]) + self.assertEqual(dd.dirs[0], self.data_name + "#/set.000") def test_get_batch(self): dd = DeepmdData(self.data_name) diff --git a/source/tests/tf/test_deepmd_data_sys.py b/source/tests/tf/test_deepmd_data_sys.py index 710a6d0ac5..893177ac4f 100644 --- a/source/tests/tf/test_deepmd_data_sys.py +++ b/source/tests/tf/test_deepmd_data_sys.py @@ -68,7 +68,7 @@ def test_ntypes(self): ds.add("test", self.test_ndof, atomic=True, must=True) ds.add("null", self.test_ndof, atomic=True, must=False) self.assertEqual(ds.get_ntypes(), 3) - self.assertEqual(ds.get_nbatches(), [2, 4, 3, 2]) + self.assertEqual(ds.get_nbatches(), [3, 6, 5, 4]) self.assertEqual(ds.get_nsystems(), self.nsys) self.assertEqual(list(ds.get_batch_size()), [batch_size] * 4) @@ -101,13 +101,27 @@ def test_get_test(self): data = ds.get_test(sys_idx=sys_idx) self.assertEqual(list(data["type"][0]), list(np.sort(self.atom_type[sys_idx]))) self._in_array( - np.load("sys_0/set.002/coord.npy"), + np.concatenate( + [ + np.load("sys_0/set.000/coord.npy"), + np.load("sys_0/set.001/coord.npy"), + np.load("sys_0/set.002/coord.npy"), + ], + axis=0, + ), ds.get_sys(sys_idx).idx_map, 3, data["coord"], ) self._in_array( - np.load("sys_0/set.002/test.npy"), + np.concatenate( + [ + np.load("sys_0/set.000/test.npy"), + np.load("sys_0/set.001/test.npy"), + np.load("sys_0/set.002/test.npy"), + ], + axis=0, + ), ds.get_sys(sys_idx).idx_map, self.test_ndof, data["test"], @@ -115,7 +129,10 @@ def test_get_test(self): self.assertAlmostEqual( np.linalg.norm( np.zeros( - [self.nframes[sys_idx] + 2, self.natoms[sys_idx] * self.test_ndof] + [ + self.nframes[sys_idx] * self.nset + 0 + 1 + 2, + self.natoms[sys_idx] * self.test_ndof, + ] ) - data["null"] ), @@ -124,7 +141,10 @@ def test_get_test(self): self.assertAlmostEqual( np.linalg.norm( np.ones( - [self.nframes[sys_idx] + 2, self.natoms[sys_idx] * self.test_ndof] + [ + self.nframes[sys_idx] * self.nset + 0 + 1 + 2, + self.natoms[sys_idx] * self.test_ndof, + ] ) - data["ones"] ), @@ -135,13 +155,27 @@ def test_get_test(self): data = ds.get_test(sys_idx=sys_idx) self.assertEqual(list(data["type"][0]), list(np.sort(self.atom_type[sys_idx]))) self._in_array( - np.load("sys_2/set.002/coord.npy"), + np.concatenate( + [ + np.load("sys_2/set.000/coord.npy"), + np.load("sys_2/set.001/coord.npy"), + np.load("sys_2/set.002/coord.npy"), + ], + axis=0, + ), ds.get_sys(sys_idx).idx_map, 3, data["coord"], ) self._in_array( - np.load("sys_2/set.002/test.npy"), + np.concatenate( + [ + np.load("sys_2/set.000/test.npy"), + np.load("sys_2/set.001/test.npy"), + np.load("sys_2/set.002/test.npy"), + ], + axis=0, + ), ds.get_sys(sys_idx).idx_map, self.test_ndof, data["test"], @@ -149,7 +183,10 @@ def test_get_test(self): self.assertAlmostEqual( np.linalg.norm( np.zeros( - [self.nframes[sys_idx] + 2, self.natoms[sys_idx] * self.test_ndof] + [ + self.nframes[sys_idx] * self.nset + 0 + 1 + 2, + self.natoms[sys_idx] * self.test_ndof, + ] ) - data["null"] ), @@ -207,6 +244,27 @@ def test_get_batch(self): ) data = ds.get_batch(sys_idx=sys_idx) self.assertEqual(list(data["type"][0]), list(np.sort(self.atom_type[sys_idx]))) + self._in_array( + np.load("sys_0/set.002/coord.npy"), + ds.get_sys(sys_idx).idx_map, + 3, + data["coord"], + ) + self._in_array( + np.load("sys_0/set.002/test.npy"), + ds.get_sys(sys_idx).idx_map, + self.test_ndof, + data["test"], + ) + self.assertAlmostEqual( + np.linalg.norm( + np.zeros([batch_size, self.natoms[sys_idx] * self.test_ndof]) + - data["null"] + ), + 0.0, + ) + data = ds.get_batch(sys_idx=sys_idx) + self.assertEqual(list(data["type"][0]), list(np.sort(self.atom_type[sys_idx]))) self._in_array( np.load("sys_0/set.000/coord.npy"), ds.get_sys(sys_idx).idx_map, @@ -292,6 +350,48 @@ def test_get_batch(self): ) data = ds.get_batch(sys_idx=sys_idx) self.assertEqual(list(data["type"][0]), list(np.sort(self.atom_type[sys_idx]))) + self._in_array( + np.load("sys_2/set.002/coord.npy"), + ds.get_sys(sys_idx).idx_map, + 3, + data["coord"], + ) + self._in_array( + np.load("sys_2/set.002/test.npy"), + ds.get_sys(sys_idx).idx_map, + self.test_ndof, + data["test"], + ) + self.assertAlmostEqual( + np.linalg.norm( + np.zeros([batch_size, self.natoms[sys_idx] * self.test_ndof]) + - data["null"] + ), + 0.0, + ) + data = ds.get_batch(sys_idx=sys_idx) + self.assertEqual(list(data["type"][0]), list(np.sort(self.atom_type[sys_idx]))) + self._in_array( + np.load("sys_2/set.002/coord.npy"), + ds.get_sys(sys_idx).idx_map, + 3, + data["coord"], + ) + self._in_array( + np.load("sys_2/set.002/test.npy"), + ds.get_sys(sys_idx).idx_map, + self.test_ndof, + data["test"], + ) + self.assertAlmostEqual( + np.linalg.norm( + np.zeros([batch_size, self.natoms[sys_idx] * self.test_ndof]) + - data["null"] + ), + 0.0, + ) + data = ds.get_batch(sys_idx=sys_idx) + self.assertEqual(list(data["type"][0]), list(np.sort(self.atom_type[sys_idx]))) self._in_array( np.load("sys_2/set.000/coord.npy"), ds.get_sys(sys_idx).idx_map, @@ -324,16 +424,16 @@ def test_prob_sys_size_1(self): self.assertAlmostEqual(np.sum(prob[2:4]), 0.8) # number of training set is self.nset-1 # shift is the total number of set size shift... - shift = np.sum(np.arange(self.nset - 1)) + shift = np.sum(np.arange(self.nset)) self.assertAlmostEqual( prob[1] / prob[0], - float(self.nframes[1] * (self.nset - 1) + shift) - / float(self.nframes[0] * (self.nset - 1) + shift), + float(self.nframes[1] * (self.nset) + shift) + / float(self.nframes[0] * (self.nset) + shift), ) self.assertAlmostEqual( prob[3] / prob[2], - float(self.nframes[3] * (self.nset - 1) + shift) - / float(self.nframes[2] * (self.nset - 1) + shift), + float(self.nframes[3] * (self.nset) + shift) + / float(self.nframes[2] * (self.nset) + shift), ) def test_prob_sys_size_2(self): @@ -348,13 +448,13 @@ def test_prob_sys_size_2(self): self.assertAlmostEqual(np.sum(prob[2:4]), 0.8) # number of training set is self.nset-1 # shift is the total number of set size shift... - shift = np.sum(np.arange(self.nset - 1)) + shift = np.sum(np.arange(self.nset)) self.assertAlmostEqual(prob[0], 0.0) self.assertAlmostEqual(prob[1], 0.2) self.assertAlmostEqual( prob[3] / prob[2], - float(self.nframes[3] * (self.nset - 1) + shift) - / float(self.nframes[2] * (self.nset - 1) + shift), + float(self.nframes[3] * (self.nset) + shift) + / float(self.nframes[2] * (self.nset) + shift), ) def _idx_map(self, target, idx_map, ndof): diff --git a/source/tests/tf/test_gen_stat_data.py b/source/tests/tf/test_gen_stat_data.py index c3f7e765f7..ebede15fbb 100644 --- a/source/tests/tf/test_gen_stat_data.py +++ b/source/tests/tf/test_gen_stat_data.py @@ -122,7 +122,7 @@ def test_ener_shift(self): data = DeepmdDataSystem(["system_0", "system_1"], 5, 10, 1.0) data.add("energy", 1, must=True) ener_shift0 = data.compute_energy_shift(rcond=1) - all_stat = make_stat_input(data, 4, merge_sys=False) + all_stat = make_stat_input(data, 6, merge_sys=False) descrpt = DescrptSeA(6.0, 5.8, [46, 92], neuron=[25, 50, 100], axis_neuron=16) fitting = EnerFitting( descrpt.get_ntypes(), @@ -138,7 +138,7 @@ def test_ener_shift_assigned(self): ae0 = dp_random.random() data = DeepmdDataSystem(["system_0"], 5, 10, 1.0) data.add("energy", 1, must=True) - all_stat = make_stat_input(data, 4, merge_sys=False) + all_stat = make_stat_input(data, 6, merge_sys=False) descrpt = DescrptSeA(6.0, 5.8, [46, 92], neuron=[25, 50, 100], axis_neuron=16) fitting = EnerFitting( descrpt.get_ntypes(),