From 450b2757e87af8ca8fad4fefc1fa31a0c4d99960 Mon Sep 17 00:00:00 2001 From: raunaks13 Date: Thu, 23 May 2024 01:08:27 +0000 Subject: [PATCH 1/7] initial benchmarking structure for encodings --- python/python/benchmarks/test_v2_reader.py | 219 +++++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 python/python/benchmarks/test_v2_reader.py diff --git a/python/python/benchmarks/test_v2_reader.py b/python/python/benchmarks/test_v2_reader.py new file mode 100644 index 0000000000..c95e5b3a9e --- /dev/null +++ b/python/python/benchmarks/test_v2_reader.py @@ -0,0 +1,219 @@ +import csv +import os +import shutil +from datetime import datetime + +import pyarrow as pa +import pyarrow.dataset as ds +import pyarrow.parquet as pq +from lance.file import LanceFileReader, LanceFileWriter +from lance.tracing import trace_to_chrome + +trace_to_chrome(level="trace") + +""" +Data generation (in-memory) +import duckdb +import pyarrow.parquet as pq + +con = duckdb.connect(database=':memory:') +con.execute("INSTALL tpch; LOAD tpch") +con.execute("CALL dbgen(sf=10)") + +tables = ["lineitem"] +for t in tables: + res = con.query("SELECT * FROM " + t) + pq.write_table(res.to_arrow_table(), t + ".parquet") +""" + + +def write_lance_file(dataset, dir_path="/tmp/tpch.lancev2"): + """ + Takes a pyarrow dataset object as input, and writes a lance + file to disk in a directory + """ + + with LanceFileWriter(dir_path, dataset.schema) as writer: + for batch in dataset.to_batches(): + writer.write_batch(batch) + + +def measure_pyarrow_read_time(path, num_trials, verbose=False): + """ + Measures the time required to read a parquet file using pyarrow, + averaged over multiple trials + """ + parquet_read_time = 0 + for trial in range(1, num_trials + 1): + start = datetime.now() + pa.parquet.read_table(path) + end = datetime.now() + parquet_read_time += (end - start).total_seconds() + + if verbose: + print(f"Parquet Read Time: {parquet_read_time/trial}s") + + avg_read_time = parquet_read_time / num_trials + print(f"Parquet Read Time: {avg_read_time}s") + return avg_read_time + + +def measure_lance_read_time(path, num_trials, batch_size, verbose=False): + """ + Measures the time required to read a lance file using lance, + averaged over multiple trials + """ + lance_read_time = 0 + for trial in range(1, num_trials + 1): + start = datetime.now() + reader = LanceFileReader(path) + for batch in reader.read_all(batch_size=batch_size).to_batches(): + pass + end = datetime.now() + + lance_read_time += (end - start).total_seconds() + + if verbose: + print(f"V2 Read Time: {lance_read_time/trial}s") + + avg_read_time = lance_read_time / num_trials + print(f"V2 Read Time: {avg_read_time}s") + return avg_read_time + + +def benchmark_tpch_lineitem(dataset_dir, num_trials=10, verbose=False): + name = benchmark_tpch_lineitem.__name__ + dataset_path = dataset_dir + "lineitem.parquet" + output = csv.writer(open(name + ".csv", "w")) + output.writerow( + [ + "name", + "dataset", + "num_trials", + "num_rows", + "parquet_size", + "pyarrow_read_time", + "lance_size", + "lance_read_time", + ] + ) + + dataset = ds.dataset(dataset_path) + num_rows = dataset.count_rows() + parquet_size = os.path.getsize(dataset_path) / (1024**2) + + pyarrow_read_time = measure_pyarrow_read_time(dataset_path, num_trials=10) + + dataset = ds.dataset(dataset_path) + lance_path = "/tmp/tpch.lancev2" + write_lance_file(dataset, dir_path=lance_path) + lance_size = os.path.getsize(lance_path) / (1024**2) + + lance_read_time = measure_lance_read_time( + 
lance_path, num_trials=10, batch_size=1024 * 8 + ) + + output.writerow( + [ + name, + "tpch_lineitem", + num_trials, + num_rows, + parquet_size, + pyarrow_read_time, + lance_size, + lance_read_time, + ] + ) + + +def benchmark_tpch_numeric_encodings(dataset_dir, num_trials=10, verbose=False): + """ + Loads numeric columns from TPCH tables and benchmarks the read times for + parquet and lance files + """ + + name = benchmark_tpch_numeric_encodings.__name__ + output = csv.writer(open(name + ".csv", "w")) + output.writerow( + [ + "name", + "dataset", + "num_trials", + "num_rows", + "parquet_size", + "pyarrow_read_time", + "lance_size", + "lance_read_time", + ] + ) + + tables = [ + "nation", + "region", + "part", + "supplier", + "customer", + "partsupp", + "orders", + "lineitem", + ] + + for table_name in tables: + print("Table: {}".format(table_name)) + PATH = dataset_dir + table_name + ".parquet" + + orig_dataset = ds.dataset(PATH) + orig_schema = orig_dataset.schema + + # Only specified the numeric types relevant to TPCH. We can expand it + # if we generalize this later + int_columns = [ + field.name + for field in orig_schema + if field.type + in (pa.int8(), pa.int16(), pa.int32(), pa.int64(), pa.decimal128(15, 2)) + ] + + # Write table with only numeric columns to disk + table = orig_dataset.to_table(columns=int_columns) + parquet_path = "/tmp/numeric_ds.parquet" + pq.write_table(table, parquet_path) + dataset = ds.dataset(parquet_path) + + num_rows = dataset.count_rows() + parquet_size = os.path.getsize(PATH) / (1024**2) + + pyarrow_read_time = measure_pyarrow_read_time(parquet_path, num_trials=10) + + lance_path = "/tmp/tpch.lancev2" + write_lance_file(dataset, dir_path=lance_path) + lance_size = os.path.getsize(lance_path) / (1024**2) + + lance_read_time = measure_lance_read_time( + lance_path, num_trials=10, batch_size=1024 * 8 + ) + + output.writerow( + [ + name, + "tpch_" + table_name, + num_trials, + num_rows, + parquet_size, + pyarrow_read_time, + lance_size, + lance_read_time, + ] + ) + + os.remove(parquet_path) + + if os.path.isdir(lance_path): + shutil.rmtree(lance_path) + else: + os.remove(lance_path) + + +# benchmark_tpch_lineitem("/home/ubuntu/test/TPCH_SF1/") +benchmark_tpch_numeric_encodings("/home/ubuntu/test/TPCH_SF1/") From 3dd59c5c64f35349cfbb7613c167ad1ed0eb713f Mon Sep 17 00:00:00 2001 From: raunaks13 Date: Thu, 23 May 2024 21:56:28 +0000 Subject: [PATCH 2/7] added benchmarking for writes, plain non-numeric, and plain vector encodings --- python/python/benchmarks/test_v2_reader.py | 245 +++++++++++++++++---- 1 file changed, 208 insertions(+), 37 deletions(-) diff --git a/python/python/benchmarks/test_v2_reader.py b/python/python/benchmarks/test_v2_reader.py index c95e5b3a9e..31607e709c 100644 --- a/python/python/benchmarks/test_v2_reader.py +++ b/python/python/benchmarks/test_v2_reader.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors import csv import os import shutil @@ -27,17 +29,6 @@ """ -def write_lance_file(dataset, dir_path="/tmp/tpch.lancev2"): - """ - Takes a pyarrow dataset object as input, and writes a lance - file to disk in a directory - """ - - with LanceFileWriter(dir_path, dataset.schema) as writer: - for batch in dataset.to_batches(): - writer.write_batch(batch) - - def measure_pyarrow_read_time(path, num_trials, verbose=False): """ Measures the time required to read a parquet file using pyarrow, @@ -58,6 +49,28 @@ def measure_pyarrow_read_time(path, num_trials, verbose=False): return 
avg_read_time
 
 
+def measure_parquet_write_time(table, path, num_trials, verbose=False):
+    """
+    Measures the time required to write a parquet file using pyarrow, averaged over multiple trials
+    """
+    if os.path.exists(path):
+        if os.path.isdir(path):
+            shutil.rmtree(path)
+        else:
+            os.remove(path)
+
+    parquet_write_time = 0
+    for trial in range(1, num_trials + 1):
+        start = datetime.now()
+        pq.write_table(table, path, compression="snappy")
+        end = datetime.now()
+        parquet_write_time += (end - start).total_seconds()
+
+    avg_write_time = parquet_write_time / num_trials
+    print(f"Parquet Write Time: {avg_write_time}s")
+    return avg_write_time
+
+
 def measure_lance_read_time(path, num_trials, batch_size, verbose=False):
     """
     Measures the time required to read a lance file using lance,
@@ -81,6 +94,32 @@ def measure_lance_read_time(path, num_trials, batch_size, verbose=False):
     return avg_read_time
 
 
+def measure_lance_write_time(dataset, path, num_trials=10, verbose=False):
+    """
+    Measures the time required to write a pyarrow dataset to disk as a
+    lance file, averaged over multiple trials
+    """
+
+    if os.path.exists(path):
+        if os.path.isdir(path):
+            shutil.rmtree(path)
+        else:
+            os.remove(path)
+
+    lance_write_time = 0
+    for trial in range(1, num_trials + 1):
+        start = datetime.now()
+        with LanceFileWriter(path, dataset.schema) as writer:
+            for batch in dataset.to_batches():
+                writer.write_batch(batch)
+        end = datetime.now()
+        lance_write_time += (end - start).total_seconds()
+
+    avg_write_time = lance_write_time / num_trials
+    print(f"Lance Write Time: {avg_write_time}s")
+    return avg_write_time
+
+
 def benchmark_tpch_lineitem(dataset_dir, num_trials=10, verbose=False):
     name = benchmark_tpch_lineitem.__name__
     dataset_path = dataset_dir + "lineitem.parquet"
@@ -106,7 +145,7 @@ def benchmark_tpch_lineitem(dataset_dir, num_trials=10, verbose=False):
 
     dataset = ds.dataset(dataset_path)
     lance_path = "/tmp/tpch.lancev2"
-    write_lance_file(dataset, dir_path=lance_path)
+    measure_lance_write_time(dataset, path=lance_path, num_trials=1)
     lance_size = os.path.getsize(lance_path) / (1024**2)
 
     lance_read_time = measure_lance_read_time(
@@ -127,23 +166,28 @@ def benchmark_tpch_lineitem(dataset_dir, num_trials=10, verbose=False):
     )
 
 
-def benchmark_tpch_numeric_encodings(dataset_dir, num_trials=10, verbose=False):
+def benchmark_tpch_encodings(
+    dataset_dir, dataset_name, encoding_type, num_trials=10, verbose=False
+):
     """
-    Loads numeric columns from TPCH tables and benchmarks the read times for
-    parquet and lance files
+    Loads columns of the given encoding type from TPCH tables and benchmarks
+    the read and write times for parquet and lance files
     """
 
-    name = benchmark_tpch_numeric_encodings.__name__
-    output = csv.writer(open(name + ".csv", "w"))
+    benchmark_name = benchmark_tpch_encodings.__name__
+    output = csv.writer(open(benchmark_name + "_" + encoding_type + ".csv", "w"))
     output.writerow(
         [
             "name",
             "dataset",
+            "lance_encoding_type",
            "num_trials",
             "num_rows",
             "parquet_size",
-            "pyarrow_read_time",
+            "parquet_write_time",
+            "parquet_read_time",
             "lance_size",
+            "lance_write_time",
             "lance_read_time",
         ]
     )
@@ -166,43 +210,55 @@ def benchmark_tpch_numeric_encodings(dataset_dir, num_trials=10, verbose=False):
 
         orig_dataset = ds.dataset(PATH)
         orig_schema = orig_dataset.schema
 
-        # Only specified the numeric types relevant to TPCH. 
We can expand it - # if we generalize this later - int_columns = [ - field.name - for field in orig_schema - if field.type - in (pa.int8(), pa.int16(), pa.int32(), pa.int64(), pa.decimal128(15, 2)) + # for field in orig_schema: + # print(field.name, field.type) + + if encoding_type == "plain_numeric": + target_types = [pa.int32(), pa.int64(), pa.float32(), pa.float64()] + elif encoding_type == "plain_non_numeric": + target_types = [pa.string(), pa.date32()] + + target_columns = [ + field.name for field in orig_schema if field.type in target_types ] - # Write table with only numeric columns to disk - table = orig_dataset.to_table(columns=int_columns) - parquet_path = "/tmp/numeric_ds.parquet" - pq.write_table(table, parquet_path) - dataset = ds.dataset(parquet_path) + table = orig_dataset.to_table(columns=target_columns) + parquet_path = "/tmp/parquet_ds.parquet" + parquet_write_time = measure_parquet_write_time( + table, parquet_path, num_trials=num_trials + ) + + dataset = ds.dataset(parquet_path) num_rows = dataset.count_rows() - parquet_size = os.path.getsize(PATH) / (1024**2) + parquet_size = os.path.getsize(parquet_path) / (1024**2) - pyarrow_read_time = measure_pyarrow_read_time(parquet_path, num_trials=10) + parquet_read_time = measure_pyarrow_read_time( + parquet_path, num_trials=num_trials + ) lance_path = "/tmp/tpch.lancev2" - write_lance_file(dataset, dir_path=lance_path) + lance_write_time = measure_lance_write_time( + table, lance_path, num_trials=num_trials + ) lance_size = os.path.getsize(lance_path) / (1024**2) lance_read_time = measure_lance_read_time( - lance_path, num_trials=10, batch_size=1024 * 8 + lance_path, num_trials=num_trials, batch_size=1024 * 8 ) output.writerow( [ - name, - "tpch_" + table_name, + benchmark_name, + dataset_name + "_" + table_name, + encoding_type, num_trials, num_rows, parquet_size, - pyarrow_read_time, + parquet_write_time, + parquet_read_time, lance_size, + lance_write_time, lance_read_time, ] ) @@ -215,5 +271,120 @@ def benchmark_tpch_numeric_encodings(dataset_dir, num_trials=10, verbose=False): os.remove(lance_path) -# benchmark_tpch_lineitem("/home/ubuntu/test/TPCH_SF1/") -benchmark_tpch_numeric_encodings("/home/ubuntu/test/TPCH_SF1/") +""" +Data generation (sift) + nvecs = 10000 + ndims = 128 + with open("../sift/sift_base.fvecs", mode="rb") as fobj: + buf = fobj.read() + data = np.array(struct.unpack("<1280000f", + buf[4 : 4 + 4 * nvecs * ndims])).reshape((nvecs, ndims)) + dd = dict(zip(range(nvecs), data)) + + table = vec_to_table(dd) + parquet_path = "/home/ubuntu/test/sift1m.parquet" + +""" + + +def benchmark_sift_vector_encodings( + dataset_dir, dataset_name, encoding_type, num_trials=10, verbose=False +): + benchmark_name = benchmark_sift_vector_encodings.__name__ + output = csv.writer(open(benchmark_name + "_" + encoding_type + ".csv", "w")) + output.writerow( + [ + "name", + "dataset", + "lance_encoding_type", + "num_trials", + "num_rows", + "parquet_size", + "parquet_write_time", + "parquet_read_time", + "lance_size", + "lance_write_time", + "lance_read_time", + ] + ) + + PATH = dataset_dir + "sift1m.parquet" + orig_dataset = ds.dataset(PATH) + # orig_schema = orig_dataset.schema + + # for field in orig_schema: + # print(field.name, field.type) + + # target_columns = [ + # field.name for field in orig_schema if is_fixed_size_list(field.type) + # ] + # print(target_columns) + + table = orig_dataset.to_table() + parquet_path = "/tmp/parquet_ds.parquet" + + parquet_write_time = measure_parquet_write_time( + table, parquet_path, 
num_trials=num_trials + ) + + dataset = ds.dataset(parquet_path) + num_rows = dataset.count_rows() + parquet_size = os.path.getsize(parquet_path) / (1024**2) + + parquet_read_time = measure_pyarrow_read_time(parquet_path, num_trials=num_trials) + + lance_path = "/tmp/tpch.lancev2" + lance_write_time = measure_lance_write_time( + table, lance_path, num_trials=num_trials + ) + lance_size = os.path.getsize(lance_path) / (1024**2) + + lance_read_time = measure_lance_read_time( + lance_path, num_trials=num_trials, batch_size=1024 * 8 + ) + + output.writerow( + [ + benchmark_name, + dataset_name, + encoding_type, + num_trials, + num_rows, + parquet_size, + parquet_write_time, + parquet_read_time, + lance_size, + lance_write_time, + lance_read_time, + ] + ) + + # os.remove(parquet_path) + + # if os.path.isdir(lance_path): + # shutil.rmtree(lance_path) + # else: + # os.remove(lance_path) + + +if __name__ == "__main__": + if os.path.exists("benchmark_tpch_encodings.csv"): + os.remove("benchmark_tpch_encodings.csv") + + # benchmark_tpch_lineitem("/home/ubuntu/test/TPCH_SF1/") + benchmark_tpch_encodings( + "/home/ubuntu/test/TPCH_SF1/", + dataset_name="tpch_sf1", + encoding_type="plain_numeric", + ) + benchmark_tpch_encodings( + "/home/ubuntu/test/TPCH_SF1/", + dataset_name="tpch_sf1", + encoding_type="plain_non_numeric", + ) + benchmark_sift_vector_encodings( + "/home/ubuntu/test/", + dataset_name="sift1m", + encoding_type="plain_fixed_size_list", + num_trials=3, + ) From c9a9c4a3d1ec406a027e51e1ddc2ed8ae21ea9e3 Mon Sep 17 00:00:00 2001 From: raunaks13 Date: Thu, 23 May 2024 22:16:23 +0000 Subject: [PATCH 3/7] updated verbose messages --- python/python/benchmarks/test_v2_reader.py | 39 +++++++++++----------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/python/python/benchmarks/test_v2_reader.py b/python/python/benchmarks/test_v2_reader.py index 31607e709c..2ceb31bafd 100644 --- a/python/python/benchmarks/test_v2_reader.py +++ b/python/python/benchmarks/test_v2_reader.py @@ -42,7 +42,9 @@ def measure_pyarrow_read_time(path, num_trials, verbose=False): parquet_read_time += (end - start).total_seconds() if verbose: - print(f"Parquet Read Time: {parquet_read_time/trial}s") + print( + f"Parquet Read Time for trial {trial}: {(end - start).total_seconds()}s" + ) avg_read_time = parquet_read_time / num_trials print(f"Parquet Read Time: {avg_read_time}s") @@ -66,6 +68,12 @@ def measure_parquet_write_time(table, path, num_trials, verbose=False): end = datetime.now() parquet_write_time += (end - start).total_seconds() + if verbose: + print( + f"Parquet Write Time for trial {trial}: \ + {(end - start).total_seconds()}s" + ) + avg_write_time = parquet_write_time / num_trials print(f"Parquet Write Time: {avg_write_time}s") return avg_write_time @@ -87,10 +95,12 @@ def measure_lance_read_time(path, num_trials, batch_size, verbose=False): lance_read_time += (end - start).total_seconds() if verbose: - print(f"V2 Read Time: {lance_read_time/trial}s") + print( + f"Lance Read Time for trial {trial}: {(end - start).total_seconds()}s" + ) avg_read_time = lance_read_time / num_trials - print(f"V2 Read Time: {avg_read_time}s") + print(f"Lance Read Time: {avg_read_time}s") return avg_read_time @@ -115,6 +125,11 @@ def measure_lance_write_time(dataset, path, num_trials=10, verbose=False): end = datetime.now() lance_write_time += (end - start).total_seconds() + if verbose: + print( + f"Lance Write Time for trial {trial}: {(end - start).total_seconds()}s" + ) + avg_write_time = lance_write_time / num_trials 
print(f"Lance Write Time: {avg_write_time}s") return avg_write_time @@ -310,15 +325,6 @@ def benchmark_sift_vector_encodings( PATH = dataset_dir + "sift1m.parquet" orig_dataset = ds.dataset(PATH) - # orig_schema = orig_dataset.schema - - # for field in orig_schema: - # print(field.name, field.type) - - # target_columns = [ - # field.name for field in orig_schema if is_fixed_size_list(field.type) - # ] - # print(target_columns) table = orig_dataset.to_table() parquet_path = "/tmp/parquet_ds.parquet" @@ -359,13 +365,6 @@ def benchmark_sift_vector_encodings( ] ) - # os.remove(parquet_path) - - # if os.path.isdir(lance_path): - # shutil.rmtree(lance_path) - # else: - # os.remove(lance_path) - if __name__ == "__main__": if os.path.exists("benchmark_tpch_encodings.csv"): @@ -386,5 +385,5 @@ def benchmark_sift_vector_encodings( "/home/ubuntu/test/", dataset_name="sift1m", encoding_type="plain_fixed_size_list", - num_trials=3, + num_trials=5, ) From ebc36045761e8c9eb3439ecc60e8aca37ebaa79b Mon Sep 17 00:00:00 2001 From: raunaks13 Date: Sat, 25 May 2024 00:50:18 +0000 Subject: [PATCH 4/7] added timestamp encodings, increased dataset size --- python/python/benchmarks/test_v2_reader.py | 67 ++++++++++++++++------ 1 file changed, 49 insertions(+), 18 deletions(-) diff --git a/python/python/benchmarks/test_v2_reader.py b/python/python/benchmarks/test_v2_reader.py index 2ceb31bafd..a5730cf071 100644 --- a/python/python/benchmarks/test_v2_reader.py +++ b/python/python/benchmarks/test_v2_reader.py @@ -190,7 +190,9 @@ def benchmark_tpch_encodings( """ benchmark_name = benchmark_tpch_encodings.__name__ - output = csv.writer(open(benchmark_name + "_" + encoding_type + ".csv", "w")) + output = csv.writer( + open(benchmark_name + "_" + dataset_name + "_" + encoding_type + ".csv", "w") + ) output.writerow( [ "name", @@ -231,11 +233,16 @@ def benchmark_tpch_encodings( if encoding_type == "plain_numeric": target_types = [pa.int32(), pa.int64(), pa.float32(), pa.float64()] elif encoding_type == "plain_non_numeric": - target_types = [pa.string(), pa.date32()] + target_types = [pa.string()] + elif encoding_type == "plain_timestamp": + target_types = [pa.date32()] target_columns = [ field.name for field in orig_schema if field.type in target_types ] + print(target_columns) + if len(target_columns) == 0: + continue table = orig_dataset.to_table(columns=target_columns) parquet_path = "/tmp/parquet_ds.parquet" @@ -246,7 +253,7 @@ def benchmark_tpch_encodings( dataset = ds.dataset(parquet_path) num_rows = dataset.count_rows() - parquet_size = os.path.getsize(parquet_path) / (1024**2) + parquet_size = os.path.getsize(parquet_path) / (1024**3) parquet_read_time = measure_pyarrow_read_time( parquet_path, num_trials=num_trials @@ -256,7 +263,7 @@ def benchmark_tpch_encodings( lance_write_time = measure_lance_write_time( table, lance_path, num_trials=num_trials ) - lance_size = os.path.getsize(lance_path) / (1024**2) + lance_size = os.path.getsize(lance_path) / (1024**3) lance_read_time = measure_lance_read_time( lance_path, num_trials=num_trials, batch_size=1024 * 8 @@ -288,11 +295,11 @@ def benchmark_tpch_encodings( """ Data generation (sift) - nvecs = 10000 + nvecs = 1000000 ndims = 128 with open("../sift/sift_base.fvecs", mode="rb") as fobj: buf = fobj.read() - data = np.array(struct.unpack("<1280000f", + data = np.array(struct.unpack("<128000000f", buf[4 : 4 + 4 * nvecs * ndims])).reshape((nvecs, ndims)) dd = dict(zip(range(nvecs), data)) @@ -335,7 +342,7 @@ def benchmark_sift_vector_encodings( dataset = 
ds.dataset(parquet_path) num_rows = dataset.count_rows() - parquet_size = os.path.getsize(parquet_path) / (1024**2) + parquet_size = os.path.getsize(parquet_path) / (1024**3) parquet_read_time = measure_pyarrow_read_time(parquet_path, num_trials=num_trials) @@ -343,7 +350,7 @@ def benchmark_sift_vector_encodings( lance_write_time = measure_lance_write_time( table, lance_path, num_trials=num_trials ) - lance_size = os.path.getsize(lance_path) / (1024**2) + lance_size = os.path.getsize(lance_path) / (1024**3) lance_read_time = measure_lance_read_time( lance_path, num_trials=num_trials, batch_size=1024 * 8 @@ -367,23 +374,47 @@ def benchmark_sift_vector_encodings( if __name__ == "__main__": - if os.path.exists("benchmark_tpch_encodings.csv"): - os.remove("benchmark_tpch_encodings.csv") + # if os.path.exists("benchmark_tpch_encodings_plain_non_numeric.csv"): + # os.remove("benchmark_tpch_encodings_plain_non_numeric.csv") # benchmark_tpch_lineitem("/home/ubuntu/test/TPCH_SF1/") + # benchmark_tpch_encodings( + # "/home/ubuntu/test/TPCH_SF1/", + # dataset_name="tpch_sf1", + # encoding_type="plain_numeric", + # ) + # benchmark_tpch_encodings( + # "/home/ubuntu/test/TPCH_SF1/", + # dataset_name="tpch_sf1", + # encoding_type="plain_non_numeric", + # ) + # benchmark_tpch_encodings( + # "/home/ubuntu/test/TPCH_SF1/", + # dataset_name="tpch_sf1", + # encoding_type="plain_timestamp", + # ) + benchmark_tpch_encodings( - "/home/ubuntu/test/TPCH_SF1/", - dataset_name="tpch_sf1", + "/home/ubuntu/test/TPCH_SF10/", + dataset_name="tpch_sf10", encoding_type="plain_numeric", + num_trials=5, ) benchmark_tpch_encodings( - "/home/ubuntu/test/TPCH_SF1/", - dataset_name="tpch_sf1", + "/home/ubuntu/test/TPCH_SF10/", + dataset_name="tpch_sf10", encoding_type="plain_non_numeric", + num_trials=5, ) - benchmark_sift_vector_encodings( - "/home/ubuntu/test/", - dataset_name="sift1m", - encoding_type="plain_fixed_size_list", + benchmark_tpch_encodings( + "/home/ubuntu/test/TPCH_SF10/", + dataset_name="tpch_sf10", + encoding_type="plain_timestamp", num_trials=5, ) + # benchmark_sift_vector_encodings( + # "/home/ubuntu/test/", + # dataset_name="sift1m", + # encoding_type="plain_fixed_size_list", + # num_trials=5, + # ) From 91cb5f1c7695c4ff4c798b67417a1e69c2a20ae7 Mon Sep 17 00:00:00 2001 From: raunaks13 Date: Thu, 1 Aug 2024 00:35:23 +0000 Subject: [PATCH 5/7] updates --- ...est_v2_reader.py => test_v2_read_write.py} | 49 ++++++------------- 1 file changed, 16 insertions(+), 33 deletions(-) rename python/python/benchmarks/{test_v2_reader.py => test_v2_read_write.py} (90%) diff --git a/python/python/benchmarks/test_v2_reader.py b/python/python/benchmarks/test_v2_read_write.py similarity index 90% rename from python/python/benchmarks/test_v2_reader.py rename to python/python/benchmarks/test_v2_read_write.py index a5730cf071..40d22f23ef 100644 --- a/python/python/benchmarks/test_v2_reader.py +++ b/python/python/benchmarks/test_v2_read_write.py @@ -374,47 +374,30 @@ def benchmark_sift_vector_encodings( if __name__ == "__main__": - # if os.path.exists("benchmark_tpch_encodings_plain_non_numeric.csv"): - # os.remove("benchmark_tpch_encodings_plain_non_numeric.csv") - - # benchmark_tpch_lineitem("/home/ubuntu/test/TPCH_SF1/") - # benchmark_tpch_encodings( - # "/home/ubuntu/test/TPCH_SF1/", - # dataset_name="tpch_sf1", - # encoding_type="plain_numeric", - # ) - # benchmark_tpch_encodings( - # "/home/ubuntu/test/TPCH_SF1/", - # dataset_name="tpch_sf1", - # encoding_type="plain_non_numeric", - # ) - # benchmark_tpch_encodings( - # 
"/home/ubuntu/test/TPCH_SF1/", - # dataset_name="tpch_sf1", - # encoding_type="plain_timestamp", - # ) + # Should contain all table files + tpch_dir_path = "/tmp/TPCH_SF1/" + dataset_name = "tpch_sf1" + benchmark_tpch_lineitem(tpch_dir_path) benchmark_tpch_encodings( - "/home/ubuntu/test/TPCH_SF10/", - dataset_name="tpch_sf10", + tpch_dir_path, + dataset_name=dataset_name, encoding_type="plain_numeric", - num_trials=5, ) benchmark_tpch_encodings( - "/home/ubuntu/test/TPCH_SF10/", - dataset_name="tpch_sf10", + tpch_dir_path, + dataset_name=dataset_name, encoding_type="plain_non_numeric", - num_trials=5, ) benchmark_tpch_encodings( - "/home/ubuntu/test/TPCH_SF10/", - dataset_name="tpch_sf10", + tpch_dir_path, + dataset_name=dataset_name, encoding_type="plain_timestamp", + ) + + benchmark_sift_vector_encodings( + "/home/ubuntu/test/", + dataset_name="sift1m", + encoding_type="plain_fixed_size_list", num_trials=5, ) - # benchmark_sift_vector_encodings( - # "/home/ubuntu/test/", - # dataset_name="sift1m", - # encoding_type="plain_fixed_size_list", - # num_trials=5, - # ) From 413f3e92a0f0a204a4385f44d8b889f98bdfc04f Mon Sep 17 00:00:00 2001 From: raunaks13 Date: Fri, 2 Aug 2024 16:19:55 +0000 Subject: [PATCH 6/7] addressed comments --- python/python/benchmarks/test_v2_read_write.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/python/benchmarks/test_v2_read_write.py b/python/python/benchmarks/test_v2_read_write.py index 40d22f23ef..2d99b4b5b2 100644 --- a/python/python/benchmarks/test_v2_read_write.py +++ b/python/python/benchmarks/test_v2_read_write.py @@ -11,7 +11,7 @@ from lance.file import LanceFileReader, LanceFileWriter from lance.tracing import trace_to_chrome -trace_to_chrome(level="trace") +# trace_to_chrome(level="trace") """ Data generation (in-memory) @@ -227,9 +227,6 @@ def benchmark_tpch_encodings( orig_dataset = ds.dataset(PATH) orig_schema = orig_dataset.schema - # for field in orig_schema: - # print(field.name, field.type) - if encoding_type == "plain_numeric": target_types = [pa.int32(), pa.int64(), pa.float32(), pa.float64()] elif encoding_type == "plain_non_numeric": @@ -240,7 +237,6 @@ def benchmark_tpch_encodings( target_columns = [ field.name for field in orig_schema if field.type in target_types ] - print(target_columns) if len(target_columns) == 0: continue From 344a40c73b824eb23ef2b9b07416180b0d07a21e Mon Sep 17 00:00:00 2001 From: raunaks13 Date: Fri, 2 Aug 2024 16:20:39 +0000 Subject: [PATCH 7/7] lint --- python/python/benchmarks/test_v2_read_write.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/python/benchmarks/test_v2_read_write.py b/python/python/benchmarks/test_v2_read_write.py index 2d99b4b5b2..ecb1acee1e 100644 --- a/python/python/benchmarks/test_v2_read_write.py +++ b/python/python/benchmarks/test_v2_read_write.py @@ -9,7 +9,8 @@ import pyarrow.dataset as ds import pyarrow.parquet as pq from lance.file import LanceFileReader, LanceFileWriter -from lance.tracing import trace_to_chrome + +# from lance.tracing import trace_to_chrome # trace_to_chrome(level="trace")