From 7a55e36805133999b2f51d5b8b112577b00a68ec Mon Sep 17 00:00:00 2001 From: corochann Date: Mon, 18 Oct 2021 13:58:18 +0900 Subject: [PATCH 01/14] torch.jit.script support --- torch_dftd/functions/dftd2.py | 4 +- torch_dftd/functions/dftd3.py | 35 +++--- torch_dftd/functions/distance.py | 34 +++++- torch_dftd/functions/smoothing.py | 29 ++++- torch_dftd/functions/triplets.py | 152 +++++++++++++----------- torch_dftd/functions/triplets_kernel.py | 1 + torch_dftd/nn/dftd2_module.py | 17 ++- torch_dftd/nn/dftd3_module.py | 23 ++-- 8 files changed, 190 insertions(+), 105 deletions(-) diff --git a/torch_dftd/functions/dftd2.py b/torch_dftd/functions/dftd2.py index 13e9347..513bc3a 100644 --- a/torch_dftd/functions/dftd2.py +++ b/torch_dftd/functions/dftd2.py @@ -6,6 +6,7 @@ from torch_dftd.functions.smoothing import poly_smoothing +@torch.jit.script def edisp_d2( Z: Tensor, r: Tensor, @@ -44,7 +45,7 @@ def edisp_d2( r2 = r ** 2 r6 = r2 ** 3 - idx_i, idx_j = edge_index + idx_i, idx_j = edge_index[0], edge_index[1] # compute all necessary quantities Zi = Z[idx_i] # (n_edges,) Zj = Z[idx_j] @@ -71,6 +72,7 @@ def edisp_d2( # (1,) g = e6.sum()[None] else: + assert batch is not None # (n_graphs,) if batch.size()[0] == 0: n_graphs = 1 diff --git a/torch_dftd/functions/dftd3.py b/torch_dftd/functions/dftd3.py index dc5d61d..ef1c9fb 100644 --- a/torch_dftd/functions/dftd3.py +++ b/torch_dftd/functions/dftd3.py @@ -9,8 +9,8 @@ # conversion factors used in grimme d3 code -d3_autoang = 0.52917726 # for converting distance from bohr to angstrom -d3_autoev = 27.21138505 # for converting a.u. to eV +d3_autoang: float = 0.52917726 # for converting distance from bohr to angstrom +d3_autoev: float = 27.21138505 # for converting a.u. to eV d3_k1 = 16.000 d3_k2 = 4 / 3 @@ -18,6 +18,7 @@ d3_maxc = 5 # maximum number of coordination complexes +@torch.jit.script def _ncoord( Z: Tensor, r: Tensor, @@ -66,6 +67,7 @@ def _ncoord( return g # (n_atoms,) +@torch.jit.script def _getc6( Zi: Tensor, Zj: Tensor, nci: Tensor, ncj: Tensor, c6ab: Tensor, k3: float = d3_k3 ) -> Tensor: @@ -104,6 +106,7 @@ def _getc6( return c6 +@torch.jit.script def edisp( Z: Tensor, r: Tensor, @@ -120,12 +123,12 @@ def edisp( shift_pos: Optional[Tensor] = None, pos: Optional[Tensor] = None, cell: Optional[Tensor] = None, - r2=None, - r6=None, - r8=None, - k1=d3_k1, - k2=d3_k2, - k3=d3_k3, + r2: Optional[Tensor] = None, + r6: Optional[Tensor] = None, + r8: Optional[Tensor] = None, + k1: float = d3_k1, + k2: float = d3_k2, + k3: float = d3_k3, cutoff_smoothing: str = "none", damping: str = "zero", bidirectional: bool = False, @@ -171,7 +174,7 @@ def edisp( if r8 is None: r8 = r6 * r2 - idx_i, idx_j = edge_index + idx_i, idx_j = edge_index[0], edge_index[1] # compute all necessary quantities Zi = Z[idx_i] # (n_edges,) Zj = Z[idx_j] @@ -250,6 +253,7 @@ def edisp( g = e68.to(torch.float64).sum()[None] else: # (n_graphs,) + assert batch is not None if batch.size()[0] == 0: n_graphs = 1 else: @@ -261,6 +265,7 @@ def edisp( g *= 2.0 if abc: + assert cnthr is not None within_cutoff = r <= cnthr # r_abc = r[within_cutoff] # r2_abc = r2[within_cutoff] @@ -284,10 +289,11 @@ def edisp( with torch.no_grad(): # triplet_node_index, triplet_edge_index = calc_triplets_cycle(edge_index_abc, n_atoms, shift=shift_abc) # Type hinting - triplet_node_index: Tensor - multiplicity: Tensor - edge_jk: Tensor - batch_triplets: Optional[Tensor] + # triplet_node_index: Tensor + # multiplicity: Tensor + # edge_jk: Tensor + # batch_triplets: Optional[Tensor] + assert pos is not 
None triplet_node_index, multiplicity, edge_jk, batch_triplets = calc_triplets( edge_index_abc, shift_pos=shift_abc, @@ -303,7 +309,7 @@ def edisp( ) r_jk = calc_distances(pos, torch.stack([idx_j, idx_k], dim=0), cell, shift_jk) kj_within_cutoff = r_jk <= cnthr - del shift_jk + # del shift_jk triplet_node_index = triplet_node_index[kj_within_cutoff] multiplicity, edge_jk, batch_triplets = ( @@ -355,5 +361,6 @@ def edisp( e6abc = e3.to(torch.float64).sum() g += e6abc else: + assert batch_triplets is not None g.scatter_add_(0, batch_triplets, e3.to(torch.float64)) return g # (n_graphs,) diff --git a/torch_dftd/functions/distance.py b/torch_dftd/functions/distance.py index e177bdb..847cf26 100644 --- a/torch_dftd/functions/distance.py +++ b/torch_dftd/functions/distance.py @@ -4,12 +4,13 @@ from torch import Tensor +@torch.jit.script def calc_distances( pos: Tensor, edge_index: Tensor, cell: Optional[Tensor] = None, shift_pos: Optional[Tensor] = None, - eps=1e-20, + eps: float = 1e-20, ) -> Tensor: """Distance calculation function. @@ -17,6 +18,7 @@ def calc_distances( pos (Tensor): (n_atoms, 3) atom positions. edge_index (Tensor): (2, n_edges) edge_index for graph. cell (Tensor): cell size, None for non periodic system. + This it NOT USED now, it is left for backward compatibility. shift_pos (Tensor): (n_edges, 3) position shift vectors of edges owing to the periodic boundary. It should be length unit. eps (float): Small float value to avoid NaN in backward when the distance is 0. @@ -25,12 +27,38 @@ def calc_distances( """ - idx_i, idx_j = edge_index + idx_i, idx_j = edge_index[0], edge_index[1] # calculate interatomic distances Ri = pos[idx_i] Rj = pos[idx_j] - if cell is not None: + if shift_pos is not None: Rj += shift_pos # eps is to avoid Nan in backward when Dij = 0 with sqrt. 
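     # d(sqrt(x))/dx = 1 / (2 * sqrt(x)) diverges as x -> 0, so a zero-length
     # edge (e.g. i == j with no cell shift) would otherwise yield NaN gradients.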
Dij = torch.sqrt(torch.sum((Ri - Rj) ** 2, dim=-1) + eps) return Dij + + +if __name__ == '__main__': + num_runs = 50 + torch._C._jit_set_num_profiled_runs(num_runs) + print("profiled runs: ", torch._C._jit_get_num_profiled_runs()) + + device = "cuda:0" + # device = "cpu" + n_atoms = 1000 + pos = torch.randn(n_atoms, 3, device=device) * n_atoms + edge_index = torch.stack([torch.arange(n_atoms), torch.arange(n_atoms - 1, -1, -1)], dim=0).to(device) + print("edge_index", edge_index.shape) + calc_distances(pos, edge_index) + from time import perf_counter + + time_list = [] + for i in range(100): + torch.cuda.synchronize() + s0 = perf_counter() + calc_distances(pos, edge_index) + torch.cuda.synchronize() + e0 = perf_counter() + time_list.append(e0 - s0) + print("Time: ", time_list) + import IPython; IPython.embed() diff --git a/torch_dftd/functions/smoothing.py b/torch_dftd/functions/smoothing.py index 1f8595f..fe5f0ba 100644 --- a/torch_dftd/functions/smoothing.py +++ b/torch_dftd/functions/smoothing.py @@ -2,12 +2,13 @@ from torch import Tensor +@torch.jit.script def poly_smoothing(r: Tensor, cutoff: float) -> Tensor: """Computes a smooth step from 1 to 0 starting at 1 bohr before the cutoff Args: r (Tensor): (n_edges,) - cutoff (float): () + cutoff (float): cutoff length Returns: r (Tensor): Smoothed `r` @@ -23,3 +24,29 @@ def poly_smoothing(r: Tensor, cutoff: float) -> Tensor: torch.ones_like(x), torch.where(r >= cutoff, torch.zeros_like(x), 6 * x5 - 15 * x4 + 10 * x3), ) + + +if __name__ == '__main__': + from time import perf_counter + device = "cpu" + n_edges = 10000 + r = torch.rand(n_edges).to(device) * 10.0 + + time_list = [] + for i in range(200): + torch.cuda.synchronize() + s0 = perf_counter() + r2 = poly_smoothing(r, 5.0) + torch.cuda.synchronize() + e0 = perf_counter() + time_list.append(e0 - s0) + print("Time: ", time_list) + import numpy as np + + time_array = np.array(time_list) + print("time:", + np.mean(time_array), + np.mean(time_array[10:]), + np.mean(time_array[50:]), + np.mean(time_array[100:])) + import IPython; IPython.embed() diff --git a/torch_dftd/functions/triplets.py b/torch_dftd/functions/triplets.py index a21be85..61d537a 100644 --- a/torch_dftd/functions/triplets.py +++ b/torch_dftd/functions/triplets.py @@ -1,70 +1,13 @@ -from typing import Optional, Tuple +from typing import Optional, Tuple, List import torch from torch import Tensor from torch_dftd.functions.triplets_kernel import _calc_triplets_core_gpu -def calc_triplets( - edge_index: Tensor, - shift_pos: Optional[Tensor] = None, - dtype=torch.float32, - batch_edge: Optional[Tensor] = None, -) -> Tuple[Tensor, Tensor, Tensor, Tensor]: - """Calculate triplet edge index. - - Args: - edge_index (Tensor): (2, n_edges) edge_index for graph. It must be bidirectional edge. - shift_pos (Tensor or None): (n_edges, 3) used to calculate unique atoms when pbc=True. - dtype: dtype for `multiplicity` - batch_edge (Tensor or None): Specify batch indices for `edge_index`. - - Returns: - triplet_node_index (Tensor): (3, n_triplets) index for node `i`, `j`, `k` respectively. - i.e.: idx_i, idx_j, idx_k = triplet_node_index - multiplicity (Tensor): (n_triplets,) multiplicity indicates duplication of same triplet pair. - It only takes 1 in non-pbc, but it takes 2 or 3 in pbc case. dtype is specified in the argument. - edge_jk (Tensor): (n_triplet_edges, 2=(j, k)) edge indices for j and k. - i.e.: idx_j, idx_k = edge_jk[:, 0], edge_jk[:, 1] - batch_triplets (Tensor): (n_triplets,) batch indices for each triplets. 
- """ - dst, src = edge_index - is_larger = dst >= src - dst = dst[is_larger] - src = src[is_larger] - # sort `src` - sort_inds = torch.argsort(src) - src = src[sort_inds] - dst = dst[sort_inds] - - if shift_pos is None: - edge_indices = torch.arange(src.shape[0], dtype=torch.long, device=edge_index.device) - else: - edge_indices = torch.arange(shift_pos.shape[0], dtype=torch.long, device=edge_index.device) - edge_indices = edge_indices[is_larger][sort_inds] - - if batch_edge is None: - batch_edge = torch.zeros(src.shape[0], dtype=torch.long, device=edge_index.device) - else: - batch_edge = batch_edge[is_larger][sort_inds] - - unique, counts = torch.unique_consecutive(src, return_counts=True) - counts_cumsum = torch.cumsum(counts, dim=0) - counts_cumsum = torch.cat( - [torch.zeros((1,), device=counts.device, dtype=torch.long), counts_cumsum], dim=0 - ) - - if str(unique.device) == "cpu": - return _calc_triplets_core( - counts, unique, dst, edge_indices, batch_edge, counts_cumsum, dtype=dtype - ) - else: - return _calc_triplets_core_gpu( - counts, unique, dst, edge_indices, batch_edge, counts_cumsum, dtype=dtype - ) - - -def _calc_triplets_core(counts, unique, dst, edge_indices, batch_edge, counts_cumsum, dtype): +@torch.jit.script +def _calc_triplets_core(counts: Tensor, unique: Tensor, dst: Tensor, edge_indices: Tensor, batch_edge: Tensor, counts_cumsum: Tensor, + dtype: torch.dtype): device = unique.device n_triplets = torch.sum(counts * (counts - 1) / 2) if n_triplets == 0: @@ -78,20 +21,25 @@ def _calc_triplets_core(counts, unique, dst, edge_indices, batch_edge, counts_cu batch_triplets = torch.zeros((0,), dtype=torch.long, device=device) return triplet_node_index, multiplicity, edge_jk, batch_triplets - triplet_node_index_list = [] # (n_triplet_edges, 3) - edge_jk_list = [] # (n_triplet_edges, 2) represents j and k indices - multiplicity_list = [] # (n_triplet_edges) represents multiplicity - batch_triplets_list = [] # (n_triplet_edges) represents batch index for triplets + triplet_node_index_list: List[List[int]] = [] # (n_triplet_edges, 3) + edge_jk_list: List[List[int]] = [] # (n_triplet_edges, 2) represents j and k indices + multiplicity_list: List[float] = [] # (n_triplet_edges) represents multiplicity + batch_triplets_list: List[int] = [] # (n_triplet_edges) represents batch index for triplets + + unique_list: List[int] = unique.tolist() + dst_list: List[int] = dst.tolist() + counts_cumsum_list: List[int] = counts_cumsum.tolist() + batch_edge_list: List[int] = batch_edge.tolist() for i in range(len(unique)): - _src = unique[i].item() - _n_edges = counts[i].item() - _dst = dst[counts_cumsum[i] : counts_cumsum[i + 1]] - _offset = counts_cumsum[i].item() - _batch_index = batch_edge[counts_cumsum[i]].item() + _src: int = unique_list[i] + _n_edges = counts_cumsum_list[i] + _dst: List[int] = dst_list[counts_cumsum_list[i] : counts_cumsum_list[i + 1]] + _offset = counts_cumsum_list[i] + _batch_index = batch_edge_list[counts_cumsum_list[i]] for j in range(_n_edges - 1): for k in range(j + 1, _n_edges): - _dst0 = _dst[j].item() # _dst0 maybe swapped with _dst1, need to reset here. - _dst1 = _dst[k].item() + _dst0: int = _dst[j] # _dst0 maybe swapped with _dst1, need to reset here. + _dst1: int = _dst[k] batch_triplets_list.append(_batch_index) # --- triplet_node_index_list & shift_list in sorted way... 
--- # sort order to be _src <= _dst0 <= _dst1, and i <= _j <= _k @@ -135,3 +83,63 @@ def _calc_triplets_core(counts, unique, dst, edge_indices, batch_edge, counts_cu # (n_triplet_edges, 3=(ij, ik, jk), 3=(xyz) ) batch_triplets = torch.as_tensor(batch_triplets_list, dtype=torch.long, device=device) return triplet_node_index, multiplicity, edge_jk, batch_triplets + + +@torch.jit.script +def calc_triplets( + edge_index: Tensor, + shift_pos: Optional[Tensor] = None, + dtype: torch.dtype = torch.float32, + batch_edge: Optional[Tensor] = None, +) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + """Calculate triplet edge index. + + Args: + edge_index (Tensor): (2, n_edges) edge_index for graph. It must be bidirectional edge. + shift_pos (Tensor or None): (n_edges, 3) used to calculate unique atoms when pbc=True. + dtype: dtype for `multiplicity` + batch_edge (Tensor or None): Specify batch indices for `edge_index`. + + Returns: + triplet_node_index (Tensor): (3, n_triplets) index for node `i`, `j`, `k` respectively. + i.e.: idx_i, idx_j, idx_k = triplet_node_index + multiplicity (Tensor): (n_triplets,) multiplicity indicates duplication of same triplet pair. + It only takes 1 in non-pbc, but it takes 2 or 3 in pbc case. dtype is specified in the argument. + edge_jk (Tensor): (n_triplet_edges, 2=(j, k)) edge indices for j and k. + i.e.: idx_j, idx_k = edge_jk[:, 0], edge_jk[:, 1] + batch_triplets (Tensor): (n_triplets,) batch indices for each triplets. + """ + dst, src = edge_index[0], edge_index[1] + is_larger = dst >= src + dst = dst[is_larger] + src = src[is_larger] + # sort `src` + sort_inds = torch.argsort(src) + src = src[sort_inds] + dst = dst[sort_inds] + + if shift_pos is None: + edge_indices = torch.arange(src.shape[0], dtype=torch.long, device=edge_index.device) + else: + edge_indices = torch.arange(shift_pos.shape[0], dtype=torch.long, device=edge_index.device) + edge_indices = edge_indices[is_larger][sort_inds] + + if batch_edge is None: + batch_edge = torch.zeros(src.shape[0], dtype=torch.long, device=edge_index.device) + else: + batch_edge = batch_edge[is_larger][sort_inds] + + unique, counts = torch.unique_consecutive(src, return_counts=True) + counts_cumsum = torch.cumsum(counts, dim=0) + counts_cumsum = torch.cat( + [torch.zeros((1,), device=counts.device, dtype=torch.long), counts_cumsum], dim=0 + ) + + if str(unique.device) == "cpu": + return _calc_triplets_core( + counts, unique, dst, edge_indices, batch_edge, counts_cumsum, dtype=dtype + ) + else: + return _calc_triplets_core_gpu( + counts, unique, dst, edge_indices, batch_edge, counts_cumsum, dtype=dtype + ) diff --git a/torch_dftd/functions/triplets_kernel.py b/torch_dftd/functions/triplets_kernel.py index cf4d96a..34e24a2 100644 --- a/torch_dftd/functions/triplets_kernel.py +++ b/torch_dftd/functions/triplets_kernel.py @@ -113,6 +113,7 @@ def _cupy2torch(array: cp.ndarray) -> Tensor: _calc_triplets_core_gpu_kernel = None +@torch.jit.unused def _calc_triplets_core_gpu( counts: Tensor, unique: Tensor, diff --git a/torch_dftd/nn/dftd2_module.py b/torch_dftd/nn/dftd2_module.py index 57fd76d..8a70a07 100644 --- a/torch_dftd/nn/dftd2_module.py +++ b/torch_dftd/nn/dftd2_module.py @@ -20,6 +20,9 @@ class DFTD2Module(BaseDFTDModule): bidirectional (bool): calculated `edge_index` is bidirectional or not. 
""" + c6ab: Tensor + r0ab: Tensor + def __init__( self, params: Dict[str, float], @@ -52,24 +55,26 @@ def calc_energy_batch( batch: Optional[Tensor] = None, batch_edge: Optional[Tensor] = None, damping: str = "zero", + autoang: float = d3_autoang, + autoev: float = d3_autoev ) -> Tensor: """Forward computation to calculate atomic wise dispersion energy""" shift_pos = pos.new_zeros((edge_index.size()[1], 3, 3)) if shift_pos is None else shift_pos - pos_bohr = pos / d3_autoang # angstrom -> bohr + pos_bohr = pos / autoang # angstrom -> bohr if cell is None: cell_bohr: Optional[Tensor] = None else: - cell_bohr = cell / d3_autoang # angstrom -> bohr - shift_bohr = shift_pos / d3_autoang # angstrom -> bohr + cell_bohr = cell / autoang # angstrom -> bohr + shift_bohr = shift_pos / autoang # angstrom -> bohr r = calc_distances(pos_bohr, edge_index, cell_bohr, shift_bohr) # E_disp (n_graphs,): Energy in eV unit - E_disp = d3_autoev * edisp_d2( + E_disp = autoev * edisp_d2( Z, r, edge_index, - c6ab=self.c6ab, # type:ignore - r0ab=self.r0ab, # type:ignore + c6ab=self.c6ab, + r0ab=self.r0ab, params=self.params, damping=damping, bidirectional=self.bidirectional, diff --git a/torch_dftd/nn/dftd3_module.py b/torch_dftd/nn/dftd3_module.py index 6fe0a84..a40c0f6 100644 --- a/torch_dftd/nn/dftd3_module.py +++ b/torch_dftd/nn/dftd3_module.py @@ -24,6 +24,11 @@ class DFTD3Module(BaseDFTDModule): bidirectional (bool): calculated `edge_index` is bidirectional or not. """ + c6ab: Tensor + r0ab: Tensor + rcov: Tensor + r2r4: Tensor + def __init__( self, params: Dict[str, float], @@ -74,25 +79,27 @@ def calc_energy_batch( batch: Optional[Tensor] = None, batch_edge: Optional[Tensor] = None, damping: str = "zero", + autoang: float = d3_autoang, + autoev: float = d3_autoev, ) -> Tensor: """Forward computation to calculate atomic wise dispersion energy""" shift_pos = pos.new_zeros((edge_index.size()[1], 3, 3)) if shift_pos is None else shift_pos - pos_bohr = pos / d3_autoang # angstrom -> bohr + pos_bohr = pos / autoang # angstrom -> bohr if cell is None: cell_bohr: Optional[Tensor] = None else: - cell_bohr = cell / d3_autoang # angstrom -> bohr - shift_bohr = shift_pos / d3_autoang # angstrom -> bohr + cell_bohr = cell / autoang # angstrom -> bohr + shift_bohr = shift_pos / autoang # angstrom -> bohr r = calc_distances(pos_bohr, edge_index, cell_bohr, shift_bohr) # E_disp (n_graphs,): Energy in eV unit - E_disp = d3_autoev * edisp( + E_disp = autoev * edisp( Z, r, edge_index, - c6ab=self.c6ab, # type:ignore - r0ab=self.r0ab, # type:ignore - rcov=self.rcov, # type:ignore - r2r4=self.r2r4, # type:ignore + c6ab=self.c6ab, + r0ab=self.r0ab, + rcov=self.rcov, + r2r4=self.r2r4, params=self.params, cutoff=self.cutoff / Bohr, cnthr=self.cnthr / Bohr, From 5890892516b7a6918e25257f6cc73b251337eab8 Mon Sep 17 00:00:00 2001 From: corochann Date: Mon, 18 Oct 2021 14:01:09 +0900 Subject: [PATCH 02/14] black, isort --- torch_dftd/functions/distance.py | 10 +++++++--- torch_dftd/functions/smoothing.py | 19 ++++++++++++------- torch_dftd/functions/triplets.py | 13 ++++++++++--- torch_dftd/nn/dftd2_module.py | 2 +- 4 files changed, 30 insertions(+), 14 deletions(-) diff --git a/torch_dftd/functions/distance.py b/torch_dftd/functions/distance.py index 847cf26..614ad4c 100644 --- a/torch_dftd/functions/distance.py +++ b/torch_dftd/functions/distance.py @@ -38,7 +38,7 @@ def calc_distances( return Dij -if __name__ == '__main__': +if __name__ == "__main__": num_runs = 50 torch._C._jit_set_num_profiled_runs(num_runs) print("profiled runs: 
", torch._C._jit_get_num_profiled_runs()) @@ -47,7 +47,9 @@ def calc_distances( # device = "cpu" n_atoms = 1000 pos = torch.randn(n_atoms, 3, device=device) * n_atoms - edge_index = torch.stack([torch.arange(n_atoms), torch.arange(n_atoms - 1, -1, -1)], dim=0).to(device) + edge_index = torch.stack([torch.arange(n_atoms), torch.arange(n_atoms - 1, -1, -1)], dim=0).to( + device + ) print("edge_index", edge_index.shape) calc_distances(pos, edge_index) from time import perf_counter @@ -61,4 +63,6 @@ def calc_distances( e0 = perf_counter() time_list.append(e0 - s0) print("Time: ", time_list) - import IPython; IPython.embed() + import IPython + + IPython.embed() diff --git a/torch_dftd/functions/smoothing.py b/torch_dftd/functions/smoothing.py index fe5f0ba..2d431b1 100644 --- a/torch_dftd/functions/smoothing.py +++ b/torch_dftd/functions/smoothing.py @@ -26,8 +26,9 @@ def poly_smoothing(r: Tensor, cutoff: float) -> Tensor: ) -if __name__ == '__main__': +if __name__ == "__main__": from time import perf_counter + device = "cpu" n_edges = 10000 r = torch.rand(n_edges).to(device) * 10.0 @@ -44,9 +45,13 @@ def poly_smoothing(r: Tensor, cutoff: float) -> Tensor: import numpy as np time_array = np.array(time_list) - print("time:", - np.mean(time_array), - np.mean(time_array[10:]), - np.mean(time_array[50:]), - np.mean(time_array[100:])) - import IPython; IPython.embed() + print( + "time:", + np.mean(time_array), + np.mean(time_array[10:]), + np.mean(time_array[50:]), + np.mean(time_array[100:]), + ) + import IPython + + IPython.embed() diff --git a/torch_dftd/functions/triplets.py b/torch_dftd/functions/triplets.py index 61d537a..37b6dfd 100644 --- a/torch_dftd/functions/triplets.py +++ b/torch_dftd/functions/triplets.py @@ -1,4 +1,4 @@ -from typing import Optional, Tuple, List +from typing import List, Optional, Tuple import torch from torch import Tensor @@ -6,8 +6,15 @@ @torch.jit.script -def _calc_triplets_core(counts: Tensor, unique: Tensor, dst: Tensor, edge_indices: Tensor, batch_edge: Tensor, counts_cumsum: Tensor, - dtype: torch.dtype): +def _calc_triplets_core( + counts: Tensor, + unique: Tensor, + dst: Tensor, + edge_indices: Tensor, + batch_edge: Tensor, + counts_cumsum: Tensor, + dtype: torch.dtype, +): device = unique.device n_triplets = torch.sum(counts * (counts - 1) / 2) if n_triplets == 0: diff --git a/torch_dftd/nn/dftd2_module.py b/torch_dftd/nn/dftd2_module.py index 8a70a07..cc08a03 100644 --- a/torch_dftd/nn/dftd2_module.py +++ b/torch_dftd/nn/dftd2_module.py @@ -56,7 +56,7 @@ def calc_energy_batch( batch_edge: Optional[Tensor] = None, damping: str = "zero", autoang: float = d3_autoang, - autoev: float = d3_autoev + autoev: float = d3_autoev, ) -> Tensor: """Forward computation to calculate atomic wise dispersion energy""" shift_pos = pos.new_zeros((edge_index.size()[1], 3, 3)) if shift_pos is None else shift_pos From c45e19ea952fd962b87a490f79bac065c5651bd5 Mon Sep 17 00:00:00 2001 From: corochann Date: Mon, 18 Oct 2021 17:44:07 +0900 Subject: [PATCH 03/14] fix --- torch_dftd/functions/dftd3.py | 6 +++--- torch_dftd/functions/triplets_kernel.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/torch_dftd/functions/dftd3.py b/torch_dftd/functions/dftd3.py index ef1c9fb..85d80c1 100644 --- a/torch_dftd/functions/dftd3.py +++ b/torch_dftd/functions/dftd3.py @@ -54,7 +54,7 @@ def _ncoord( Zi = Z[idx_i] Zj = Z[idx_j] rco = rcov[Zi] + rcov[Zj] # (n_edges,) - rr = rco.type(r.dtype) / r + rr = rco.to(r.dtype) / r damp = 1.0 / (1.0 + torch.exp(-k1 * (rr - 1.0))) if 
cutoff is not None and cutoff_smoothing == "poly": damp *= poly_smoothing(r, cutoff) @@ -86,7 +86,7 @@ def _getc6( """ # gather the relevant entries from the table # c6ab (95, 95, 5, 5, 3) --> c6ab_ (n_edges, 5, 5, 3) - c6ab_ = c6ab[Zi, Zj].type(nci.dtype) + c6ab_ = c6ab[Zi, Zj].to(nci.dtype) # calculate c6 coefficients # cn0, cn1, cn2 (n_edges, 5, 5) @@ -195,7 +195,7 @@ def edisp( ncj = nc[idx_j] c6 = _getc6(Zi, Zj, nci, ncj, c6ab=c6ab, k3=k3) # c6 coefficients - c8 = 3 * c6 * r2r4[Zi].type(c6.dtype) * r2r4[Zj].type(c6.dtype) # c8 coefficient + c8 = 3 * c6 * r2r4[Zi].to(c6.dtype) * r2r4[Zj].to(c6.dtype) # c8 coefficient s6 = params["s6"] s8 = params["s18"] diff --git a/torch_dftd/functions/triplets_kernel.py b/torch_dftd/functions/triplets_kernel.py index 34e24a2..edf5f75 100644 --- a/torch_dftd/functions/triplets_kernel.py +++ b/torch_dftd/functions/triplets_kernel.py @@ -113,7 +113,7 @@ def _cupy2torch(array: cp.ndarray) -> Tensor: _calc_triplets_core_gpu_kernel = None -@torch.jit.unused +@torch.jit.ignore def _calc_triplets_core_gpu( counts: Tensor, unique: Tensor, From e2f94e812758da75836110ab714faa850b1c6c1d Mon Sep 17 00:00:00 2001 From: corochann Date: Mon, 18 Oct 2021 18:07:58 +0900 Subject: [PATCH 04/14] tmp --- torch_dftd/functions/distance.py | 12 +++++++++++- torch_dftd/functions/smoothing.py | 9 +++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/torch_dftd/functions/distance.py b/torch_dftd/functions/distance.py index 614ad4c..25a85f1 100644 --- a/torch_dftd/functions/distance.py +++ b/torch_dftd/functions/distance.py @@ -40,7 +40,13 @@ def calc_distances( if __name__ == "__main__": num_runs = 50 - torch._C._jit_set_num_profiled_runs(num_runs) + + old_prof_exec_state = torch._C._jit_set_profiling_executor(False) + old_prof_mode_state = torch._C._jit_set_profiling_mode(False) + old_num_prof_runs = torch._C._jit_set_num_profiled_runs(num_runs) + print("old_prof_exec_state", old_prof_exec_state, + "old_prof_mode_state", old_prof_mode_state, + "old_num_prof_runs", old_num_prof_runs) print("profiled runs: ", torch._C._jit_get_num_profiled_runs()) device = "cuda:0" @@ -59,6 +65,10 @@ def calc_distances( torch.cuda.synchronize() s0 = perf_counter() calc_distances(pos, edge_index) + # if i < 10: + # print(f"----- {i} ------------------------") + # print(torch.jit.last_executed_optimized_graph()) + torch.cuda.synchronize() e0 = perf_counter() time_list.append(e0 - s0) diff --git a/torch_dftd/functions/smoothing.py b/torch_dftd/functions/smoothing.py index 2d431b1..9f230f3 100644 --- a/torch_dftd/functions/smoothing.py +++ b/torch_dftd/functions/smoothing.py @@ -29,6 +29,15 @@ def poly_smoothing(r: Tensor, cutoff: float) -> Tensor: if __name__ == "__main__": from time import perf_counter + num_runs = 50 + old_prof_exec_state = torch._C._jit_set_profiling_executor(False) + old_prof_mode_state = torch._C._jit_set_profiling_mode(False) + old_num_prof_runs = torch._C._jit_set_num_profiled_runs(num_runs) + print("old_prof_exec_state", old_prof_exec_state, + "old_prof_mode_state", old_prof_mode_state, + "old_num_prof_runs", old_num_prof_runs) + print("profiled runs: ", torch._C._jit_get_num_profiled_runs()) + device = "cpu" n_edges = 10000 r = torch.rand(n_edges).to(device) * 10.0 From c9c3a3490dfba049bc29155600f2998c58ad249b Mon Sep 17 00:00:00 2001 From: corochann Date: Mon, 18 Oct 2021 21:44:45 +0900 Subject: [PATCH 05/14] update --- torch_dftd/dftd3_xc_params.py | 2 +- torch_dftd/functions/distance.py | 11 +- torch_dftd/functions/smoothing.py | 11 +- 
torch_dftd/nn/base_dftd_module.py | 152 ++++++++++++++++++++------- torch_dftd/nn/dftd2_module.py | 3 +- torch_dftd/nn/dftd3_module.py | 5 +- torch_dftd/torch_dftd3_calculator.py | 6 +- 7 files changed, 138 insertions(+), 52 deletions(-) diff --git a/torch_dftd/dftd3_xc_params.py b/torch_dftd/dftd3_xc_params.py index 00c806c..6789deb 100644 --- a/torch_dftd/dftd3_xc_params.py +++ b/torch_dftd/dftd3_xc_params.py @@ -535,7 +535,7 @@ def get_dftd3_default_params( rs6 = 1.1 s18 = 0.0 alp = 20.0 - rs18 = None # Not used. + rs18 = 0.0 # It is DUMMY value. Not used. if xc == "b-lyp": s6 = 1.2 elif xc == "b-p": diff --git a/torch_dftd/functions/distance.py b/torch_dftd/functions/distance.py index 25a85f1..033b2ca 100644 --- a/torch_dftd/functions/distance.py +++ b/torch_dftd/functions/distance.py @@ -44,9 +44,14 @@ def calc_distances( old_prof_exec_state = torch._C._jit_set_profiling_executor(False) old_prof_mode_state = torch._C._jit_set_profiling_mode(False) old_num_prof_runs = torch._C._jit_set_num_profiled_runs(num_runs) - print("old_prof_exec_state", old_prof_exec_state, - "old_prof_mode_state", old_prof_mode_state, - "old_num_prof_runs", old_num_prof_runs) + print( + "old_prof_exec_state", + old_prof_exec_state, + "old_prof_mode_state", + old_prof_mode_state, + "old_num_prof_runs", + old_num_prof_runs, + ) print("profiled runs: ", torch._C._jit_get_num_profiled_runs()) device = "cuda:0" diff --git a/torch_dftd/functions/smoothing.py b/torch_dftd/functions/smoothing.py index 9f230f3..b006dab 100644 --- a/torch_dftd/functions/smoothing.py +++ b/torch_dftd/functions/smoothing.py @@ -33,9 +33,14 @@ def poly_smoothing(r: Tensor, cutoff: float) -> Tensor: old_prof_exec_state = torch._C._jit_set_profiling_executor(False) old_prof_mode_state = torch._C._jit_set_profiling_mode(False) old_num_prof_runs = torch._C._jit_set_num_profiled_runs(num_runs) - print("old_prof_exec_state", old_prof_exec_state, - "old_prof_mode_state", old_prof_mode_state, - "old_num_prof_runs", old_num_prof_runs) + print( + "old_prof_exec_state", + old_prof_exec_state, + "old_prof_mode_state", + old_prof_mode_state, + "old_num_prof_runs", + old_num_prof_runs, + ) print("profiled runs: ", torch._C._jit_get_num_profiled_runs()) device = "cpu" diff --git a/torch_dftd/nn/base_dftd_module.py b/torch_dftd/nn/base_dftd_module.py index 48248ff..57f4c40 100644 --- a/torch_dftd/nn/base_dftd_module.py +++ b/torch_dftd/nn/base_dftd_module.py @@ -1,15 +1,14 @@ -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union -import numpy as np import torch -from ase.neighborlist import primitive_neighbor_list -from ase.units import Bohr from torch import Tensor, nn +from torch_dftd.functions.dftd3 import d3_autoang, d3_autoev class BaseDFTDModule(nn.Module): """BaseDFTDModule""" + @torch.jit.ignore def calc_energy_batch( self, Z: Tensor, @@ -21,6 +20,8 @@ def calc_energy_batch( batch: Optional[Tensor] = None, batch_edge: Optional[Tensor] = None, damping: str = "zero", + autoang: float = d3_autoang, + autoev: float = d3_autoev, ) -> Tensor: """Forward computation to calculate atomic wise dispersion energy. 
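
A note on the decorators this file now leans on: `@torch.jit.ignore` tells the
compiler to leave a method as plain Python, while `@torch.jit.export` (added to
`calc_energy` below) forces TorchScript to compile a method even though
`forward` never reaches it. A self-contained sketch of the pattern; the class
and values here are made up for illustration:

    import torch
    from torch import Tensor, nn

    class Sketch(nn.Module):
        @torch.jit.export
        def doubled(self, x: Tensor) -> Tensor:
            # Compiled and callable on the scripted module even though
            # forward() never reaches it.
            return x * 2.0

        @torch.jit.ignore
        def debug_repr(self, x: Tensor) -> str:
            # Left as Python; may use features TorchScript cannot compile.
            return f"mean={x.mean().item():.3f}"

        def forward(self, x: Tensor) -> Tensor:
            return self.doubled(x) + 1.0

    scripted = torch.jit.script(Sketch())
    print(scripted(torch.ones(3)))          # tensor([3., 3., 3.])
    print(scripted.doubled(torch.ones(3)))  # exported entry point
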
@@ -36,12 +37,15 @@ def calc_energy_batch( batch (Tensor): (n_atoms,) Specify which graph this atom belongs to batch_edge (Tensor): (n_edges, 3) Specify which graph this edge belongs to damping (str): + autoang (float): + autoev (float): Returns: energy (Tensor): (n_atoms,) """ raise NotImplementedError() + @torch.jit.export def calc_energy( self, Z: Tensor, @@ -53,7 +57,9 @@ def calc_energy( batch: Optional[Tensor] = None, batch_edge: Optional[Tensor] = None, damping: str = "zero", - ) -> List[Dict[str, Any]]: + autoang: float = d3_autoang, + autoev: float = d3_autoev, + ) -> List[Dict[str, float]]: """Forward computation of dispersion energy Backward computation is skipped for fast computation of only energy. @@ -68,24 +74,38 @@ def calc_energy( batch (Tensor): batch_edge (Tensor): damping (str): damping method. "zero", "bj", "zerom", "bjm" + autoang (float): + autoev (float): Returns: results_list (list): calculated result. It contains calculate energy in "energy" key. """ with torch.no_grad(): E_disp = self.calc_energy_batch( - Z, pos, edge_index, cell, pbc, shift_pos, batch, batch_edge, damping=damping + Z, + pos, + edge_index, + cell, + pbc, + shift_pos, + batch, + batch_edge, + damping=damping, + autoang=autoang, + autoev=autoev, ) + E_disp_list: List[float] = E_disp.tolist() if batch is None: - return [{"energy": E_disp.item()}] + return [{"energy": E_disp_list[0]}] else: if batch.size()[0] == 0: n_graphs = 1 else: n_graphs = int(batch[-1]) + 1 - return [{"energy": E_disp[i].item()} for i in range(n_graphs)] + return [{"energy": E_disp_list[i]} for i in range(n_graphs)] - def calc_energy_and_forces( + @torch.jit.export + def _calc_energy_and_forces_core( self, Z: Tensor, pos: Tensor, @@ -96,23 +116,9 @@ def calc_energy_and_forces( batch: Optional[Tensor] = None, batch_edge: Optional[Tensor] = None, damping: str = "zero", - ) -> List[Dict[str, Any]]: - """Forward computation of dispersion energy, force and stress - - Args: - Z (Tensor): (n_atoms,) atomic numbers. - pos (Tensor): atom positions in angstrom - cell (Tensor): cell size in angstrom, None for non periodic system. - pbc (Tensor): pbc condition, None for non periodic system. - shift_pos (Tensor): (n_atoms, 3) shift vector (length unit). - damping (str): damping method. "zero", "bj", "zerom", "bjm" - - Returns: - results (list): calculated results. 
Contains following: - "energy": () - "forces": (n_atoms, 3) - "stress": (6,) - """ + autoang: float = d3_autoang, + autoev: float = d3_autoev, + ) -> Tuple[Tensor, Tensor, Optional[Tensor]]: pos.requires_grad_(True) if cell is not None: # pos is depending on `cell` size @@ -123,21 +129,21 @@ def calc_energy_and_forces( shift_pos.requires_grad_(True) E_disp = self.calc_energy_batch( - Z, pos, edge_index, cell, pbc, shift_pos, batch, batch_edge, damping=damping + Z, + pos, + edge_index, + cell, + pbc, + shift_pos, + batch, + batch_edge, + damping=damping, + autoang=autoang, + autoev=autoev, ) E_disp.sum().backward() - forces = -pos.grad # [eV/angstrom] - if batch is None: - results_list = [{"energy": E_disp.item(), "forces": forces.cpu().numpy()}] - else: - if batch.size()[0] == 0: - n_graphs = 1 - else: - n_graphs = int(batch[-1]) + 1 - results_list = [{"energy": E_disp[i].item()} for i in range(n_graphs)] - for i in range(n_graphs): - results_list[i]["forces"] = forces[batch == i].cpu().numpy() + pos_grad = pos.grad if cell is not None: # stress = torch.mm(cell_grad, cell.T) / cell_volume @@ -155,9 +161,15 @@ def calc_energy_and_forces( dim=0, ) stress = cell_grad.to(cell.dtype) / cell_volume - results_list[0]["stress"] = stress.detach().cpu().numpy() + # results_list[0]["stress"] = stress.detach().cpu().numpy() else: + assert isinstance(batch, Tensor) assert isinstance(batch_edge, Tensor) + if batch.size()[0] == 0: + n_graphs = 1 + else: + n_graphs = int(batch[-1]) + 1 + # cell (bs, 3, 3) cell_volume = torch.det(cell).abs() cell_grad = pos.new_zeros((n_graphs, 6), dtype=torch.float64) @@ -172,6 +184,68 @@ def calc_energy_and_forces( (shift_pos[:, voigt_left] * shift_pos.grad[:, voigt_right]).to(torch.float64), ) stress = cell_grad.to(cell.dtype) / cell_volume[:, None] + # stress = stress.detach().cpu().numpy() + stress = stress.detach().cpu() + else: + stress = None + return E_disp, pos_grad, stress + + @torch.jit.ignore + def calc_energy_and_forces( + self, + Z: Tensor, + pos: Tensor, + edge_index: Tensor, + cell: Optional[Tensor] = None, + pbc: Optional[Tensor] = None, + shift_pos: Optional[Tensor] = None, + batch: Optional[Tensor] = None, + batch_edge: Optional[Tensor] = None, + damping: str = "zero", + autoang: float = d3_autoang, + autoev: float = d3_autoev, + ) -> List[Dict[str, Any]]: + """Forward computation of dispersion energy, force and stress + + Args: + Z (Tensor): (n_atoms,) atomic numbers. + pos (Tensor): atom positions in angstrom + cell (Tensor): cell size in angstrom, None for non periodic system. + pbc (Tensor): pbc condition, None for non periodic system. + shift_pos (Tensor): (n_atoms, 3) shift vector (length unit). + damping (str): damping method. "zero", "bj", "zerom", "bjm" + autoang (float): + autoev (float): + + Returns: + results (list): calculated results. Contains following: + "energy": () + "forces": (n_atoms, 3) + "stress": (6,) + """ + E_disp, pos_grad, stress = self._calc_energy_and_forces_core( + Z, pos, edge_index, cell, pbc, shift_pos, batch, batch_edge, damping, autoang, autoev + ) + + forces = pos_grad + n_graphs = 0 # Just to declare for torch.jit.script. 
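+        # (TorchScript requires a variable to keep one static type on every
+        # code path, so `n_graphs` is bound to an int ahead of the branch.)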
+ if batch is None: + results_list = [{"energy": E_disp.item(), "forces": forces.cpu().numpy()}] + else: + if batch.size()[0] == 0: + n_graphs = 1 + else: + n_graphs = int(batch[-1]) + 1 + results_list = [{"energy": E_disp[i].item()} for i in range(n_graphs)] + for i in range(n_graphs): + results_list[i]["forces"] = forces[batch == i].cpu().numpy() + + if stress is not None: + # stress = torch.mm(cell_grad, cell.T) / cell_volume + # Get stress in Voigt notation (xx, yy, zz, yz, xz, xy) + if batch is None: + results_list[0]["stress"] = stress.detach().cpu().numpy() + else: stress = stress.detach().cpu().numpy() for i in range(n_graphs): results_list[i]["stress"] = stress[i] diff --git a/torch_dftd/nn/dftd2_module.py b/torch_dftd/nn/dftd2_module.py index cc08a03..5940af2 100644 --- a/torch_dftd/nn/dftd2_module.py +++ b/torch_dftd/nn/dftd2_module.py @@ -44,6 +44,7 @@ def __init__( # atom pair distance (95, 95) self.register_buffer("r0ab", r0ab) + @torch.jit.export def calc_energy_batch( self, Z: Tensor, @@ -78,7 +79,7 @@ def calc_energy_batch( params=self.params, damping=damping, bidirectional=self.bidirectional, - cutoff=self.cutoff / Bohr, + cutoff=self.cutoff / autoang, batch=batch, batch_edge=batch_edge, cutoff_smoothing=self.cutoff_smoothing, diff --git a/torch_dftd/nn/dftd3_module.py b/torch_dftd/nn/dftd3_module.py index a40c0f6..5259086 100644 --- a/torch_dftd/nn/dftd3_module.py +++ b/torch_dftd/nn/dftd3_module.py @@ -68,6 +68,7 @@ def __init__( self.bidirectional = bidirectional self.cutoff_smoothing = cutoff_smoothing + @torch.jit.export def calc_energy_batch( self, Z: Tensor, @@ -101,8 +102,8 @@ def calc_energy_batch( rcov=self.rcov, r2r4=self.r2r4, params=self.params, - cutoff=self.cutoff / Bohr, - cnthr=self.cnthr / Bohr, + cutoff=self.cutoff / autoang, + cnthr=self.cnthr / autoang, batch=batch, batch_edge=batch_edge, shift_pos=shift_bohr, diff --git a/torch_dftd/torch_dftd3_calculator.py b/torch_dftd/torch_dftd3_calculator.py index 5c050ad..fa68456 100644 --- a/torch_dftd/torch_dftd3_calculator.py +++ b/torch_dftd/torch_dftd3_calculator.py @@ -58,7 +58,7 @@ def __init__( self.old = old self.device = torch.device(device) if old: - self.dftd_module: torch.nn.Module = DFTD2Module( + dftd_module: torch.nn.Module = DFTD2Module( self.params, cutoff=cutoff, dtype=dtype, @@ -66,7 +66,7 @@ def __init__( cutoff_smoothing=cutoff_smoothing, ) else: - self.dftd_module = DFTD3Module( + dftd_module = DFTD3Module( self.params, cutoff=cutoff, cnthr=cnthr, @@ -75,7 +75,7 @@ def __init__( bidirectional=bidirectional, cutoff_smoothing=cutoff_smoothing, ) - self.dftd_module.to(device) + self.dftd_module = torch.jit.script(dftd_module.to(device)) self.dtype = dtype self.cutoff = cutoff self.bidirectional = bidirectional From f3d0f54fff8807fd605711aa0f5b480a2f187a5b Mon Sep 17 00:00:00 2001 From: corochann Date: Tue, 19 Oct 2021 13:32:57 +0900 Subject: [PATCH 06/14] Use original conversion. --- torch_dftd/nn/dftd2_module.py | 3 ++- torch_dftd/nn/dftd3_module.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/torch_dftd/nn/dftd2_module.py b/torch_dftd/nn/dftd2_module.py index 5940af2..d6269a9 100644 --- a/torch_dftd/nn/dftd2_module.py +++ b/torch_dftd/nn/dftd2_module.py @@ -38,6 +38,7 @@ def __init__( self.dtype = dtype self.bidirectional = bidirectional self.cutoff_smoothing = cutoff_smoothing + self._bohr = Bohr # For torch.jit, `Bohr` must be local variable. 
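+        # (i.e. an instance attribute: the scripted methods then read
+        # self._bohr instead of resolving the ase.units.Bohr global.)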
r0ab, c6ab = get_dftd2_params() # atom pair coefficient (87, 87) self.register_buffer("c6ab", c6ab) @@ -79,7 +80,7 @@ def calc_energy_batch( params=self.params, damping=damping, bidirectional=self.bidirectional, - cutoff=self.cutoff / autoang, + cutoff=self.cutoff / self._bohr, batch=batch, batch_edge=batch_edge, cutoff_smoothing=self.cutoff_smoothing, diff --git a/torch_dftd/nn/dftd3_module.py b/torch_dftd/nn/dftd3_module.py index 5259086..0003dd4 100644 --- a/torch_dftd/nn/dftd3_module.py +++ b/torch_dftd/nn/dftd3_module.py @@ -67,6 +67,7 @@ def __init__( self.dtype = dtype self.bidirectional = bidirectional self.cutoff_smoothing = cutoff_smoothing + self._bohr = Bohr # For torch.jit, `Bohr` must be local variable. @torch.jit.export def calc_energy_batch( @@ -102,8 +103,8 @@ def calc_energy_batch( rcov=self.rcov, r2r4=self.r2r4, params=self.params, - cutoff=self.cutoff / autoang, - cnthr=self.cnthr / autoang, + cutoff=self.cutoff / self._bohr, + cnthr=self.cnthr / self._bohr, batch=batch, batch_edge=batch_edge, shift_pos=shift_bohr, From ab17f589d8f7802d854980b2ff55ea924f68bc3c Mon Sep 17 00:00:00 2001 From: corochann Date: Tue, 19 Oct 2021 13:44:49 +0900 Subject: [PATCH 07/14] refactor --- torch_dftd/functions/dftd3.py | 7 ----- torch_dftd/functions/distance.py | 45 ------------------------------- torch_dftd/functions/smoothing.py | 45 ------------------------------- torch_dftd/nn/base_dftd_module.py | 13 +++++---- 4 files changed, 6 insertions(+), 104 deletions(-) diff --git a/torch_dftd/functions/dftd3.py b/torch_dftd/functions/dftd3.py index 85d80c1..f13a47d 100644 --- a/torch_dftd/functions/dftd3.py +++ b/torch_dftd/functions/dftd3.py @@ -287,12 +287,6 @@ def edisp( # (n_edges, ) -> (n_edges * 2, ) shift_abc = None if shift_abc is None else torch.cat([shift_abc, -shift_abc], dim=0) with torch.no_grad(): - # triplet_node_index, triplet_edge_index = calc_triplets_cycle(edge_index_abc, n_atoms, shift=shift_abc) - # Type hinting - # triplet_node_index: Tensor - # multiplicity: Tensor - # edge_jk: Tensor - # batch_triplets: Optional[Tensor] assert pos is not None triplet_node_index, multiplicity, edge_jk, batch_triplets = calc_triplets( edge_index_abc, @@ -309,7 +303,6 @@ def edisp( ) r_jk = calc_distances(pos, torch.stack([idx_j, idx_k], dim=0), cell, shift_jk) kj_within_cutoff = r_jk <= cnthr - # del shift_jk triplet_node_index = triplet_node_index[kj_within_cutoff] multiplicity, edge_jk, batch_triplets = ( diff --git a/torch_dftd/functions/distance.py b/torch_dftd/functions/distance.py index 033b2ca..90a4f0b 100644 --- a/torch_dftd/functions/distance.py +++ b/torch_dftd/functions/distance.py @@ -36,48 +36,3 @@ def calc_distances( # eps is to avoid Nan in backward when Dij = 0 with sqrt. 
Dij = torch.sqrt(torch.sum((Ri - Rj) ** 2, dim=-1) + eps) return Dij - - -if __name__ == "__main__": - num_runs = 50 - - old_prof_exec_state = torch._C._jit_set_profiling_executor(False) - old_prof_mode_state = torch._C._jit_set_profiling_mode(False) - old_num_prof_runs = torch._C._jit_set_num_profiled_runs(num_runs) - print( - "old_prof_exec_state", - old_prof_exec_state, - "old_prof_mode_state", - old_prof_mode_state, - "old_num_prof_runs", - old_num_prof_runs, - ) - print("profiled runs: ", torch._C._jit_get_num_profiled_runs()) - - device = "cuda:0" - # device = "cpu" - n_atoms = 1000 - pos = torch.randn(n_atoms, 3, device=device) * n_atoms - edge_index = torch.stack([torch.arange(n_atoms), torch.arange(n_atoms - 1, -1, -1)], dim=0).to( - device - ) - print("edge_index", edge_index.shape) - calc_distances(pos, edge_index) - from time import perf_counter - - time_list = [] - for i in range(100): - torch.cuda.synchronize() - s0 = perf_counter() - calc_distances(pos, edge_index) - # if i < 10: - # print(f"----- {i} ------------------------") - # print(torch.jit.last_executed_optimized_graph()) - - torch.cuda.synchronize() - e0 = perf_counter() - time_list.append(e0 - s0) - print("Time: ", time_list) - import IPython - - IPython.embed() diff --git a/torch_dftd/functions/smoothing.py b/torch_dftd/functions/smoothing.py index b006dab..e993e39 100644 --- a/torch_dftd/functions/smoothing.py +++ b/torch_dftd/functions/smoothing.py @@ -24,48 +24,3 @@ def poly_smoothing(r: Tensor, cutoff: float) -> Tensor: torch.ones_like(x), torch.where(r >= cutoff, torch.zeros_like(x), 6 * x5 - 15 * x4 + 10 * x3), ) - - -if __name__ == "__main__": - from time import perf_counter - - num_runs = 50 - old_prof_exec_state = torch._C._jit_set_profiling_executor(False) - old_prof_mode_state = torch._C._jit_set_profiling_mode(False) - old_num_prof_runs = torch._C._jit_set_num_profiled_runs(num_runs) - print( - "old_prof_exec_state", - old_prof_exec_state, - "old_prof_mode_state", - old_prof_mode_state, - "old_num_prof_runs", - old_num_prof_runs, - ) - print("profiled runs: ", torch._C._jit_get_num_profiled_runs()) - - device = "cpu" - n_edges = 10000 - r = torch.rand(n_edges).to(device) * 10.0 - - time_list = [] - for i in range(200): - torch.cuda.synchronize() - s0 = perf_counter() - r2 = poly_smoothing(r, 5.0) - torch.cuda.synchronize() - e0 = perf_counter() - time_list.append(e0 - s0) - print("Time: ", time_list) - import numpy as np - - time_array = np.array(time_list) - print( - "time:", - np.mean(time_array), - np.mean(time_array[10:]), - np.mean(time_array[50:]), - np.mean(time_array[100:]), - ) - import IPython - - IPython.embed() diff --git a/torch_dftd/nn/base_dftd_module.py b/torch_dftd/nn/base_dftd_module.py index 57f4c40..590418e 100644 --- a/torch_dftd/nn/base_dftd_module.py +++ b/torch_dftd/nn/base_dftd_module.py @@ -161,7 +161,6 @@ def _calc_energy_and_forces_core( dim=0, ) stress = cell_grad.to(cell.dtype) / cell_volume - # results_list[0]["stress"] = stress.detach().cpu().numpy() else: assert isinstance(batch, Tensor) assert isinstance(batch_edge, Tensor) @@ -184,8 +183,7 @@ def _calc_energy_and_forces_core( (shift_pos[:, voigt_left] * shift_pos.grad[:, voigt_right]).to(torch.float64), ) stress = cell_grad.to(cell.dtype) / cell_volume[:, None] - # stress = stress.detach().cpu().numpy() - stress = stress.detach().cpu() + stress = stress else: stress = None return E_disp, pos_grad, stress @@ -227,18 +225,19 @@ def calc_energy_and_forces( Z, pos, edge_index, cell, pbc, shift_pos, batch, batch_edge, 
damping, autoang, autoev ) - forces = pos_grad + forces = (-pos_grad).cpu().numpy() n_graphs = 0 # Just to declare for torch.jit.script. if batch is None: - results_list = [{"energy": E_disp.item(), "forces": forces.cpu().numpy()}] + results_list = [{"energy": E_disp.item(), "forces": forces}] else: if batch.size()[0] == 0: n_graphs = 1 else: n_graphs = int(batch[-1]) + 1 - results_list = [{"energy": E_disp[i].item()} for i in range(n_graphs)] + E_disp_list = E_disp.tolist() + results_list = [{"energy": E_disp_list[i]} for i in range(n_graphs)] for i in range(n_graphs): - results_list[i]["forces"] = forces[batch == i].cpu().numpy() + results_list[i]["forces"] = forces[batch == i] if stress is not None: # stress = torch.mm(cell_grad, cell.T) / cell_volume From 0ae6e75a02db754e272818e45c54ca8bed2a901f Mon Sep 17 00:00:00 2001 From: corochann Date: Wed, 20 Oct 2021 05:14:23 +0900 Subject: [PATCH 08/14] update --- .flexci/pytest_script.sh | 2 +- torch_dftd/functions/triplets.py | 9 +++++---- torch_dftd/functions/triplets_kernel.py | 22 ++++++++++++++++++++-- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/.flexci/pytest_script.sh b/.flexci/pytest_script.sh index 0a0960b..fa395e4 100644 --- a/.flexci/pytest_script.sh +++ b/.flexci/pytest_script.sh @@ -20,7 +20,7 @@ main() { docker run --runtime=nvidia --rm --volume="$(pwd)":/workspace -w /workspace \ ${IMAGE} \ bash -x -c "pip install flake8 pytest pytest-cov pytest-xdist pytest-benchmark && \ - pip install cupy-cuda102 pytorch-pfn-extras!=0.5.0 && \ + pip install cupy-cuda102 pytorch-pfn-extras<0.5.0 && \ pip install -e .[develop] && \ pysen run lint && \ pytest --cov=torch_dftd -n $(nproc) -m 'not slow' tests && diff --git a/torch_dftd/functions/triplets.py b/torch_dftd/functions/triplets.py index 37b6dfd..50c3297 100644 --- a/torch_dftd/functions/triplets.py +++ b/torch_dftd/functions/triplets.py @@ -13,10 +13,10 @@ def _calc_triplets_core( edge_indices: Tensor, batch_edge: Tensor, counts_cumsum: Tensor, - dtype: torch.dtype, -): + dtype: torch.dtype = torch.float32, +) -> Tuple[Tensor, Tensor, Tensor, Tensor]: device = unique.device - n_triplets = torch.sum(counts * (counts - 1) / 2) + n_triplets = int(torch.sum(counts * (counts - 1) / 2)) if n_triplets == 0: # (n_triplet_edges, 3) triplet_node_index = torch.zeros((0, 3), dtype=torch.long, device=device) @@ -35,11 +35,12 @@ def _calc_triplets_core( unique_list: List[int] = unique.tolist() dst_list: List[int] = dst.tolist() + counts_list: List[int] = counts.tolist() counts_cumsum_list: List[int] = counts_cumsum.tolist() batch_edge_list: List[int] = batch_edge.tolist() for i in range(len(unique)): _src: int = unique_list[i] - _n_edges = counts_cumsum_list[i] + _n_edges: int = counts_list[i] _dst: List[int] = dst_list[counts_cumsum_list[i] : counts_cumsum_list[i + 1]] _offset = counts_cumsum_list[i] _batch_index = batch_edge_list[counts_cumsum_list[i]] diff --git a/torch_dftd/functions/triplets_kernel.py b/torch_dftd/functions/triplets_kernel.py index edf5f75..52c5edc 100644 --- a/torch_dftd/functions/triplets_kernel.py +++ b/torch_dftd/functions/triplets_kernel.py @@ -114,15 +114,15 @@ def _cupy2torch(array: cp.ndarray) -> Tensor: @torch.jit.ignore -def _calc_triplets_core_gpu( +def _calc_triplets_core_gpu_run_kernel( counts: Tensor, unique: Tensor, dst: Tensor, edge_indices: Tensor, batch_edge: Tensor, counts_cumsum: Tensor, - dtype: torch.dtype = torch.float32, ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + dtype: torch.dtype = torch.float32 if not _ppe_available: raise 
ImportError("Please install pytorch_pfn_extras to use `_calc_triplets_core_gpu`!") if not _cupy_available: @@ -157,3 +157,21 @@ def _calc_triplets_core_gpu( ) # torch tensor buffer is already modified in above cupy functions. return triplet_node_index, multiplicity, edge_jk, batch_triplets + + +@torch.jit.script +def _calc_triplets_core_gpu( + counts: Tensor, + unique: Tensor, + dst: Tensor, + edge_indices: Tensor, + batch_edge: Tensor, + counts_cumsum: Tensor, + dtype: torch.dtype = torch.float32, +) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + # dtype cannot be used inside @torch.jit.ignore function... + # https://github.com/pytorch/pytorch/issues/51941 + triplet_node_index, multiplicity, edge_jk, batch_triplets = _calc_triplets_core_gpu_run_kernel( + counts, unique, dst, edge_indices, batch_edge, counts_cumsum + ) + return triplet_node_index, multiplicity.to(dtype), edge_jk, batch_triplets From 5e2c452f80ae7c4954c6e2258337353e5baad979 Mon Sep 17 00:00:00 2001 From: corochann Date: Wed, 20 Oct 2021 05:21:41 +0900 Subject: [PATCH 09/14] update --- .flexci/pytest_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.flexci/pytest_script.sh b/.flexci/pytest_script.sh index fa395e4..cae4544 100644 --- a/.flexci/pytest_script.sh +++ b/.flexci/pytest_script.sh @@ -20,7 +20,7 @@ main() { docker run --runtime=nvidia --rm --volume="$(pwd)":/workspace -w /workspace \ ${IMAGE} \ bash -x -c "pip install flake8 pytest pytest-cov pytest-xdist pytest-benchmark && \ - pip install cupy-cuda102 pytorch-pfn-extras<0.5.0 && \ + pip install cupy-cuda102 'pytorch-pfn-extras<0.5.0' && \ pip install -e .[develop] && \ pysen run lint && \ pytest --cov=torch_dftd -n $(nproc) -m 'not slow' tests && From d80c8231c3bc4d8f24c366c6606af9f2c1f84ca4 Mon Sep 17 00:00:00 2001 From: corochann Date: Wed, 20 Oct 2021 05:43:24 +0900 Subject: [PATCH 10/14] update test env --- .flexci/build_and_push.sh | 6 +++--- .flexci/pytest_script.sh | 2 +- README.md | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.flexci/build_and_push.sh b/.flexci/build_and_push.sh index 2e8a018..75a00a5 100644 --- a/.flexci/build_and_push.sh +++ b/.flexci/build_and_push.sh @@ -32,10 +32,10 @@ docker_build_and_push() { WAIT_PIDS="" # PyTorch 1.5 + Python 3.6 -docker_build_and_push torch15 \ +docker_build_and_push torch19 \ --build-arg base_image="nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04" \ - --build-arg python_version="3.6.12" \ - --build-arg pip_packages="torch==1.5.* torchvision==0.6.* ${TEST_PIP_PACKAGES}" & + --build-arg python_version="3.7.9" \ + --build-arg pip_packages="torch==1.9.1 ${TEST_PIP_PACKAGES}" & WAIT_PIDS="$! ${WAIT_PIDS}" # Wait until the build complete. diff --git a/.flexci/pytest_script.sh b/.flexci/pytest_script.sh index cae4544..25deb85 100644 --- a/.flexci/pytest_script.sh +++ b/.flexci/pytest_script.sh @@ -20,7 +20,7 @@ main() { docker run --runtime=nvidia --rm --volume="$(pwd)":/workspace -w /workspace \ ${IMAGE} \ bash -x -c "pip install flake8 pytest pytest-cov pytest-xdist pytest-benchmark && \ - pip install cupy-cuda102 'pytorch-pfn-extras<0.5.0' && \ + pip install cupy-cuda102 pytorch-pfn-extras==0.4.2 && \ pip install -e .[develop] && \ pysen run lint && \ pytest --cov=torch_dftd -n $(nproc) -m 'not slow' tests && diff --git a/README.md b/README.md index 50ecbea..8a5ba44 100644 --- a/README.md +++ b/README.md @@ -34,14 +34,14 @@ print(f"forces {forces}") ## Dependency The library is tested under following environment. 
- - python: 3.6 + - python: 3.7 - CUDA: 10.2 ```bash -torch==1.5.1 +torch==1.9.1 ase==3.21.1 # Below is only for 3-body term -cupy-cuda102==8.6.0 -pytorch-pfn-extras==0.3.2 +cupy-cuda102==9.5.0 +pytorch-pfn-extras==0.4.2 ``` ## Development tips From ba0b172fd06829dc2ba5776d83d3c1142ae58134 Mon Sep 17 00:00:00 2001 From: corochann Date: Wed, 20 Oct 2021 09:45:21 +0900 Subject: [PATCH 11/14] update --- .flexci/pytest_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.flexci/pytest_script.sh b/.flexci/pytest_script.sh index 25deb85..8ffba7f 100644 --- a/.flexci/pytest_script.sh +++ b/.flexci/pytest_script.sh @@ -2,7 +2,7 @@ set -eu #IMAGE=pytorch/pytorch:1.5.1-cuda10.1-cudnn7-devel -IMAGE=asia.gcr.io/pfn-public-ci/torch-dftd-ci:torch15 +IMAGE=asia.gcr.io/pfn-public-ci/torch-dftd-ci:torch19 main() { From ca6d235381185e274359712b3c2687d745a674c3 Mon Sep 17 00:00:00 2001 From: corochann Date: Wed, 20 Oct 2021 11:41:59 +0900 Subject: [PATCH 12/14] Use numpy array --- torch_dftd/nn/base_dftd_module.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torch_dftd/nn/base_dftd_module.py b/torch_dftd/nn/base_dftd_module.py index 590418e..dee3679 100644 --- a/torch_dftd/nn/base_dftd_module.py +++ b/torch_dftd/nn/base_dftd_module.py @@ -236,8 +236,9 @@ def calc_energy_and_forces( n_graphs = int(batch[-1]) + 1 E_disp_list = E_disp.tolist() results_list = [{"energy": E_disp_list[i]} for i in range(n_graphs)] + batch_array = batch.cpu().numpy() for i in range(n_graphs): - results_list[i]["forces"] = forces[batch == i] + results_list[i]["forces"] = forces[batch_array == i] if stress is not None: # stress = torch.mm(cell_grad, cell.T) / cell_volume From 75b5a9e729d420d0e27d2d93009e25c3a02e6eb9 Mon Sep 17 00:00:00 2001 From: corochann Date: Wed, 20 Oct 2021 12:21:57 +0900 Subject: [PATCH 13/14] bug fix? 
--- torch_dftd/nn/dftd2_module.py | 2 +- torch_dftd/nn/dftd3_module.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_dftd/nn/dftd2_module.py b/torch_dftd/nn/dftd2_module.py index d6269a9..af9862b 100644 --- a/torch_dftd/nn/dftd2_module.py +++ b/torch_dftd/nn/dftd2_module.py @@ -61,7 +61,7 @@ def calc_energy_batch( autoev: float = d3_autoev, ) -> Tensor: """Forward computation to calculate atomic wise dispersion energy""" - shift_pos = pos.new_zeros((edge_index.size()[1], 3, 3)) if shift_pos is None else shift_pos + shift_pos = pos.new_zeros((edge_index.size()[1], 3)) if shift_pos is None else shift_pos pos_bohr = pos / autoang # angstrom -> bohr if cell is None: cell_bohr: Optional[Tensor] = None diff --git a/torch_dftd/nn/dftd3_module.py b/torch_dftd/nn/dftd3_module.py index 0003dd4..e5fbaac 100644 --- a/torch_dftd/nn/dftd3_module.py +++ b/torch_dftd/nn/dftd3_module.py @@ -85,7 +85,7 @@ def calc_energy_batch( autoev: float = d3_autoev, ) -> Tensor: """Forward computation to calculate atomic wise dispersion energy""" - shift_pos = pos.new_zeros((edge_index.size()[1], 3, 3)) if shift_pos is None else shift_pos + shift_pos = pos.new_zeros((edge_index.size()[1], 3)) if shift_pos is None else shift_pos pos_bohr = pos / autoang # angstrom -> bohr if cell is None: cell_bohr: Optional[Tensor] = None From 14f6dfcdd717c15446cccb4a873c0e8240cdec6c Mon Sep 17 00:00:00 2001 From: corochann Date: Wed, 20 Oct 2021 15:31:08 +0900 Subject: [PATCH 14/14] fix docstring --- torch_dftd/functions/dftd3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_dftd/functions/dftd3.py b/torch_dftd/functions/dftd3.py index f13a47d..f69667f 100644 --- a/torch_dftd/functions/dftd3.py +++ b/torch_dftd/functions/dftd3.py @@ -149,7 +149,7 @@ def edisp( cnthr (float or None): cutoff distance for coordination number calculation in **bohr** batch (Tensor or None): (n_atoms,) batch_edge (Tensor or None): (n_edges,) - shift_pos (Tensor or None): (n_atoms,) used to calculate 3-body term when abc=True + shift_pos (Tensor or None): (n_edges, 3) used to calculate 3-body term when abc=True pos (Tensor): (n_atoms, 3) position in **bohr** cell (Tensor): (3, 3) cell size in **bohr** r2 (Tensor or None):
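
Across the series, the calculator ends up driving its DFTD module through
TorchScript (patch 05 wraps it in `torch.jit.script`), with the JIT profiling
executor tuned in the benchmark snippets. A minimal end-to-end sketch of the
scripted path, following the usage already shown in the README; the molecule
name and device are illustrative:

    import torch
    from ase.build import molecule
    from torch_dftd.torch_dftd3_calculator import TorchDFTD3Calculator

    # Optional: patches 04-05 switch off the JIT profiling executor before
    # timing; the same internal toggles can be applied here.
    torch._C._jit_set_profiling_executor(False)
    torch._C._jit_set_profiling_mode(False)

    atoms = molecule("CH3CH2OCH3")
    calculator = TorchDFTD3Calculator(atoms=atoms, device="cpu", damping="bj")
    energy = atoms.get_potential_energy()  # dispersion energy in eV
    forces = atoms.get_forces()            # eV/angstrom
    print(f"energy {energy}")
    print(f"forces {forces.shape}")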