[Cylon] Fix Cylon CI system and add slice operation on scaling test

Signed-off-by: Arup Sarker <[email protected]>
cylondata · Jul 25, 2023 · 283de82 · 283de82
1 parent b0b15f0
commit 283de82
Show file tree

Hide file tree

Showing 4 changed files with 57 additions and 6 deletions.
diff --git a/conda/environments/cylon_MacOS.yml b/conda/environments/cylon_MacOS.yml
@@ -10,8 +10,8 @@ dependencies:
   - glog
   - openmpi>=4.1.2
   - cython>=0.29,<0.30
-  - numpy
-  - pandas>=1.0,<1.6.0dev0
+  - numpy<1.24.4
+  - pandas>=1.0,<2.0.0
   - fsspec>=0.6.0
   - setuptools
   # they are not needed for using pygcylon or compiling it

diff --git a/conda/environments/gcylon.yml b/conda/environments/gcylon.yml
@@ -15,8 +15,8 @@ dependencies:
   - glog
   - openmpi=4.1.3=ha1ae619_105
   - ucx>=1.12.1
-  - numpy
-  - pandas>=1.0,<1.6.0dev0
+  - numpy<1.24.4
+  - pandas>=1.0,<2.0.0
   - fsspec>=0.6.0
   - setuptools
   # these are for running tests only,

diff --git a/conda/environments/windows.yml b/conda/environments/windows.yml
@@ -10,8 +10,8 @@ dependencies:
   - glog
   - msmpi
   - cython>=0.29,<0.30
-  - numpy
-  - pandas>=1.0,<1.6.0dev0
+  - numpy<1.24.4
+  - pandas>=1.0,<2.0.0
   - fsspec>=0.6.0
   - setuptools
   # they are not needed for using pygcylon or compiling it

diff --git a/rivanna/scripts/cylon_scaling.py b/rivanna/scripts/cylon_scaling.py
@@ -61,6 +61,57 @@ def join(data=None):
 
     env.finalize()
 
+def slice(data=None):  # Benchmark a distributed DataFrame slice. NOTE(review): shadows builtin `slice`; data=None fails at the first data[...] lookup
+    StopWatch.start(f"slice_total_{data['host']}_{data['rows']}_{data['it']}")  # total timer, started on every rank
+
+    comm = MPI.COMM_WORLD
+
+    config = MPIConfig(comm)
+    env = CylonEnv(config=config, distributed=True)
+
+    u = data['unique']  # scales max_val to form the upper bound handed to rng.integers below
+
+    if data['scaling'] == 'w':  # weak: every rank generates data['rows'] rows
+        num_rows = data['rows']
+        max_val = num_rows * env.world_size
+    else:  # 's' strong (any value other than 'w' also lands here): data['rows'] is split across ranks
+        max_val = data['rows']
+        num_rows = int(data['rows'] / env.world_size)  # int() truncates when rows is not a multiple of world_size
+
+    rng = default_rng(seed=env.rank)  # seeded with the rank, so each rank draws its own reproducible stream
+    data1 = rng.integers(0, int(max_val * u), size=(num_rows, 2))
+    data2 = rng.integers(0, int(max_val * u), size=(num_rows, 2))  # NOTE(review): only feeds df2, which is never used below
+
+    df1 = DataFrame(pd.DataFrame(data1).add_prefix("col"))
+    df2 = DataFrame(pd.DataFrame(data2).add_prefix("col"))  # NOTE(review): unused in this function
+
+    if env.rank == 0:
+        print("Task# ", data['task'])
+
+    for i in range(data['it']):
+        env.barrier()  # line ranks up before the per-iteration timers start
+        StopWatch.start(f"slice_{i}_{data['host']}_{data['rows']}_{data['it']}")
+        t1 = time.time()
+        df3 = df1[0:20000000, env] # distributed slice; NOTE(review): end index 20000000 is hard-coded, independent of num_rows
+        #print(df3)
+        # NOTE(review): disabled alternative operation, presumably copied from join(): df3 = df1.merge(df2, on=[0], algorithm='sort', env=env)
+        env.barrier()  # t2 is taken only after every rank has reached this point
+        t2 = time.time()
+        t = (t2 - t1)
+        sum_t = comm.reduce(t)  # reduced value is consumed only on rank 0 below
+        tot_l = comm.reduce(len(df3))  # reduced row count, printed only on rank 0
+
+        if env.rank == 0:
+            avg_t = sum_t / env.world_size
+            print("### ", data['scaling'], env.world_size, num_rows, max_val, i, avg_t, tot_l)
+            StopWatch.stop(f"slice_{i}_{data['host']}_{data['rows']}_{data['it']}")  # NOTE(review): timer is started on every rank above but stopped only on rank 0
+
+    StopWatch.stop(f"slice_total_{data['host']}_{data['rows']}_{data['it']}")  # total timer, stopped on every rank
+
+    if env.rank == 0:
+        StopWatch.benchmark(tag=str(data))  # benchmark report is emitted by rank 0 only
+
+    env.finalize()
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="weak scaling")