From 450b2757e87af8ca8fad4fefc1fa31a0c4d99960 Mon Sep 17 00:00:00 2001 From: raunaks13 Date: Thu, 23 May 2024 01:08:27 +0000 Subject: [PATCH 1/7] initial benchmarking structure for encodings --- python/python/benchmarks/test_v2_reader.py | 219 +++++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 python/python/benchmarks/test_v2_reader.py diff --git a/python/python/benchmarks/test_v2_reader.py b/python/python/benchmarks/test_v2_reader.py new file mode 100644 index 0000000000..c95e5b3a9e --- /dev/null +++ b/python/python/benchmarks/test_v2_reader.py @@ -0,0 +1,219 @@ +import csv +import os +import shutil +from datetime import datetime + +import pyarrow as pa +import pyarrow.dataset as ds +import pyarrow.parquet as pq +from lance.file import LanceFileReader, LanceFileWriter +from lance.tracing import trace_to_chrome + +trace_to_chrome(level="trace") + +""" +Data generation (in-memory) +import duckdb +import pyarrow.parquet as pq + +con = duckdb.connect(database=':memory:') +con.execute("INSTALL tpch; LOAD tpch") +con.execute("CALL dbgen(sf=10)") + +tables = ["lineitem"] +for t in tables: + res = con.query("SELECT * FROM " + t) + pq.write_table(res.to_arrow_table(), t + ".parquet") +""" + + +def write_lance_file(dataset, dir_path="/tmp/tpch.lancev2"): + """ + Takes a pyarrow dataset object as input, and writes a lance + file to disk in a directory + """ + + with LanceFileWriter(dir_path, dataset.schema) as writer: + for batch in dataset.to_batches(): + writer.write_batch(batch) + + +def measure_pyarrow_read_time(path, num_trials, verbose=False): + """ + Measures the time required to read a parquet file using pyarrow, + averaged over multiple trials + """ + parquet_read_time = 0 + for trial in range(1, num_trials + 1): + start = datetime.now() + pa.parquet.read_table(path) + end = datetime.now() + parquet_read_time += (end - start).total_seconds() + + if verbose: + print(f"Parquet Read Time: {parquet_read_time/trial}s") + + avg_read_time = parquet_read_time / num_trials + print(f"Parquet Read Time: {avg_read_time}s") + return avg_read_time + + +def measure_lance_read_time(path, num_trials, batch_size, verbose=False): + """ + Measures the time required to read a lance file using lance, + averaged over multiple trials + """ + lance_read_time = 0 + for trial in range(1, num_trials + 1): + start = datetime.now() + reader = LanceFileReader(path) + for batch in reader.read_all(batch_size=batch_size).to_batches(): + pass + end = datetime.now() + + lance_read_time += (end - start).total_seconds() + + if verbose: + print(f"V2 Read Time: {lance_read_time/trial}s") + + avg_read_time = lance_read_time / num_trials + print(f"V2 Read Time: {avg_read_time}s") + return avg_read_time + + +def benchmark_tpch_lineitem(dataset_dir, num_trials=10, verbose=False): + name = benchmark_tpch_lineitem.__name__ + dataset_path = dataset_dir + "lineitem.parquet" + output = csv.writer(open(name + ".csv", "w")) + output.writerow( + [ + "name", + "dataset", + "num_trials", + "num_rows", + "parquet_size", + "pyarrow_read_time", + "lance_size", + "lance_read_time", + ] + ) + + dataset = ds.dataset(dataset_path) + num_rows = dataset.count_rows() + parquet_size = os.path.getsize(dataset_path) / (1024**2) + + pyarrow_read_time = measure_pyarrow_read_time(dataset_path, num_trials=10) + + dataset = ds.dataset(dataset_path) + lance_path = "/tmp/tpch.lancev2" + write_lance_file(dataset, dir_path=lance_path) + lance_size = os.path.getsize(lance_path) / (1024**2) + + lance_read_time = measure_lance_read_time( + 
lance_path, num_trials=10, batch_size=1024 * 8 + ) + + output.writerow( + [ + name, + "tpch_lineitem", + num_trials, + num_rows, + parquet_size, + pyarrow_read_time, + lance_size, + lance_read_time, + ] + ) + + +def benchmark_tpch_numeric_encodings(dataset_dir, num_trials=10, verbose=False): + """ + Loads numeric columns from TPCH tables and benchmarks the read times for + parquet and lance files + """ + + name = benchmark_tpch_numeric_encodings.__name__ + output = csv.writer(open(name + ".csv", "w")) + output.writerow( + [ + "name", + "dataset", + "num_trials", + "num_rows", + "parquet_size", + "pyarrow_read_time", + "lance_size", + "lance_read_time", + ] + ) + + tables = [ + "nation", + "region", + "part", + "supplier", + "customer", + "partsupp", + "orders", + "lineitem", + ] + + for table_name in tables: + print("Table: {}".format(table_name)) + PATH = dataset_dir + table_name + ".parquet" + + orig_dataset = ds.dataset(PATH) + orig_schema = orig_dataset.schema + + # Only specified the numeric types relevant to TPCH. We can expand it + # if we generalize this later + int_columns = [ + field.name + for field in orig_schema + if field.type + in (pa.int8(), pa.int16(), pa.int32(), pa.int64(), pa.decimal128(15, 2)) + ] + + # Write table with only numeric columns to disk + table = orig_dataset.to_table(columns=int_columns) + parquet_path = "/tmp/numeric_ds.parquet" + pq.write_table(table, parquet_path) + dataset = ds.dataset(parquet_path) + + num_rows = dataset.count_rows() + parquet_size = os.path.getsize(PATH) / (1024**2) + + pyarrow_read_time = measure_pyarrow_read_time(parquet_path, num_trials=10) + + lance_path = "/tmp/tpch.lancev2" + write_lance_file(dataset, dir_path=lance_path) + lance_size = os.path.getsize(lance_path) / (1024**2) + + lance_read_time = measure_lance_read_time( + lance_path, num_trials=10, batch_size=1024 * 8 + ) + + output.writerow( + [ + name, + "tpch_" + table_name, + num_trials, + num_rows, + parquet_size, + pyarrow_read_time, + lance_size, + lance_read_time, + ] + ) + + os.remove(parquet_path) + + if os.path.isdir(lance_path): + shutil.rmtree(lance_path) + else: + os.remove(lance_path) + + +# benchmark_tpch_lineitem("/home/ubuntu/test/TPCH_SF1/") +benchmark_tpch_numeric_encodings("/home/ubuntu/test/TPCH_SF1/") From 3dd59c5c64f35349cfbb7613c167ad1ed0eb713f Mon Sep 17 00:00:00 2001 From: raunaks13 Date: Thu, 23 May 2024 21:56:28 +0000 Subject: [PATCH 2/7] added benchmarking for writes, plain non-numeric, and plain vector encodings --- python/python/benchmarks/test_v2_reader.py | 245 +++++++++++++++++---- 1 file changed, 208 insertions(+), 37 deletions(-) diff --git a/python/python/benchmarks/test_v2_reader.py b/python/python/benchmarks/test_v2_reader.py index c95e5b3a9e..31607e709c 100644 --- a/python/python/benchmarks/test_v2_reader.py +++ b/python/python/benchmarks/test_v2_reader.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors import csv import os import shutil @@ -27,17 +29,6 @@ """ -def write_lance_file(dataset, dir_path="/tmp/tpch.lancev2"): - """ - Takes a pyarrow dataset object as input, and writes a lance - file to disk in a directory - """ - - with LanceFileWriter(dir_path, dataset.schema) as writer: - for batch in dataset.to_batches(): - writer.write_batch(batch) - - def measure_pyarrow_read_time(path, num_trials, verbose=False): """ Measures the time required to read a parquet file using pyarrow, @@ -58,6 +49,28 @@ def measure_pyarrow_read_time(path, num_trials, verbose=False): return 
avg_read_time
 
 
+def measure_parquet_write_time(table, path, num_trials, verbose=False):
+    """
+    Measures the time required to write a parquet file using pyarrow, averaged over multiple trials
+    """
+    if os.path.exists(path):
+        if os.path.isdir(path):
+            shutil.rmtree(path)
+        else:
+            os.remove(path)
+
+    parquet_write_time = 0
+    for trial in range(1, num_trials + 1):
+        start = datetime.now()
+        pq.write_table(table, path, compression="snappy")
+        end = datetime.now()
+        parquet_write_time += (end - start).total_seconds()
+
+    avg_write_time = parquet_write_time / num_trials
+    print(f"Parquet Write Time: {avg_write_time}s")
+    return avg_write_time
+
+
 def measure_lance_read_time(path, num_trials, batch_size, verbose=False):
     """
     Measures the time required to read a lance file using lance,
@@ -81,6 +94,32 @@ def measure_lance_read_time(path, num_trials, batch_size, verbose=False):
     return avg_read_time
 
 
+def measure_lance_write_time(dataset, path, num_trials=10, verbose=False):
+    """
+    Measures the time required to write a pyarrow dataset to disk as a
+    lance file, averaged over multiple trials
+    """
+
+    if os.path.exists(path):
+        if os.path.isdir(path):
+            shutil.rmtree(path)
+        else:
+            os.remove(path)
+
+    lance_write_time = 0
+    for trial in range(1, num_trials + 1):
+        start = datetime.now()
+        with LanceFileWriter(path, dataset.schema) as writer:
+            for batch in dataset.to_batches():
+                writer.write_batch(batch)
+        end = datetime.now()
+        lance_write_time += (end - start).total_seconds()
+
+    avg_write_time = lance_write_time / num_trials
+    print(f"Lance Write Time: {avg_write_time}s")
+    return avg_write_time
+
+
 def benchmark_tpch_lineitem(dataset_dir, num_trials=10, verbose=False):
     name = benchmark_tpch_lineitem.__name__
     dataset_path = dataset_dir + "lineitem.parquet"
@@ -106,7 +145,7 @@ def benchmark_tpch_lineitem(dataset_dir, num_trials=10, verbose=False):
 
     dataset = ds.dataset(dataset_path)
     lance_path = "/tmp/tpch.lancev2"
-    write_lance_file(dataset, dir_path=lance_path)
+    measure_lance_write_time(dataset, path=lance_path, num_trials=1)
     lance_size = os.path.getsize(lance_path) / (1024**2)
 
     lance_read_time = measure_lance_read_time(
@@ -127,23 +166,28 @@ def benchmark_tpch_lineitem(dataset_dir, num_trials=10, verbose=False):
     )
 
 
-def benchmark_tpch_numeric_encodings(dataset_dir, num_trials=10, verbose=False):
+def benchmark_tpch_encodings(
+    dataset_dir, dataset_name, encoding_type, num_trials=10, verbose=False
+):
     """
-    Loads numeric columns from TPCH tables and benchmarks the read times for
-    parquet and lance files
+    Loads columns of the given encoding type from TPCH tables and benchmarks
+    the read and write times for parquet and lance files
     """
 
-    name = benchmark_tpch_numeric_encodings.__name__
-    output = csv.writer(open(name + ".csv", "w"))
+    benchmark_name = benchmark_tpch_encodings.__name__
+    output = csv.writer(open(benchmark_name + "_" + encoding_type + ".csv", "w"))
     output.writerow(
         [
             "name",
             "dataset",
+            "lance_encoding_type",
            "num_trials",
             "num_rows",
             "parquet_size",
-            "pyarrow_read_time",
+            "parquet_write_time",
+            "parquet_read_time",
             "lance_size",
+            "lance_write_time",
             "lance_read_time",
         ]
     )
@@ -166,43 +210,55 @@ def benchmark_tpch_numeric_encodings(dataset_dir, num_trials=10, verbose=False):
 
         orig_dataset = ds.dataset(PATH)
         orig_schema = orig_dataset.schema
 
-        # Only specified the numeric types relevant to TPCH. 
We can expand it - # if we generalize this later - int_columns = [ - field.name - for field in orig_schema - if field.type - in (pa.int8(), pa.int16(), pa.int32(), pa.int64(), pa.decimal128(15, 2)) + # for field in orig_schema: + # print(field.name, field.type) + + if encoding_type == "plain_numeric": + target_types = [pa.int32(), pa.int64(), pa.float32(), pa.float64()] + elif encoding_type == "plain_non_numeric": + target_types = [pa.string(), pa.date32()] + + target_columns = [ + field.name for field in orig_schema if field.type in target_types ] - # Write table with only numeric columns to disk - table = orig_dataset.to_table(columns=int_columns) - parquet_path = "/tmp/numeric_ds.parquet" - pq.write_table(table, parquet_path) - dataset = ds.dataset(parquet_path) + table = orig_dataset.to_table(columns=target_columns) + parquet_path = "/tmp/parquet_ds.parquet" + parquet_write_time = measure_parquet_write_time( + table, parquet_path, num_trials=num_trials + ) + + dataset = ds.dataset(parquet_path) num_rows = dataset.count_rows() - parquet_size = os.path.getsize(PATH) / (1024**2) + parquet_size = os.path.getsize(parquet_path) / (1024**2) - pyarrow_read_time = measure_pyarrow_read_time(parquet_path, num_trials=10) + parquet_read_time = measure_pyarrow_read_time( + parquet_path, num_trials=num_trials + ) lance_path = "/tmp/tpch.lancev2" - write_lance_file(dataset, dir_path=lance_path) + lance_write_time = measure_lance_write_time( + table, lance_path, num_trials=num_trials + ) lance_size = os.path.getsize(lance_path) / (1024**2) lance_read_time = measure_lance_read_time( - lance_path, num_trials=10, batch_size=1024 * 8 + lance_path, num_trials=num_trials, batch_size=1024 * 8 ) output.writerow( [ - name, - "tpch_" + table_name, + benchmark_name, + dataset_name + "_" + table_name, + encoding_type, num_trials, num_rows, parquet_size, - pyarrow_read_time, + parquet_write_time, + parquet_read_time, lance_size, + lance_write_time, lance_read_time, ] ) @@ -215,5 +271,120 @@ def benchmark_tpch_numeric_encodings(dataset_dir, num_trials=10, verbose=False): os.remove(lance_path) -# benchmark_tpch_lineitem("/home/ubuntu/test/TPCH_SF1/") -benchmark_tpch_numeric_encodings("/home/ubuntu/test/TPCH_SF1/") +""" +Data generation (sift) + nvecs = 10000 + ndims = 128 + with open("../sift/sift_base.fvecs", mode="rb") as fobj: + buf = fobj.read() + data = np.array(struct.unpack("<1280000f", + buf[4 : 4 + 4 * nvecs * ndims])).reshape((nvecs, ndims)) + dd = dict(zip(range(nvecs), data)) + + table = vec_to_table(dd) + parquet_path = "/home/ubuntu/test/sift1m.parquet" + +""" + + +def benchmark_sift_vector_encodings( + dataset_dir, dataset_name, encoding_type, num_trials=10, verbose=False +): + benchmark_name = benchmark_sift_vector_encodings.__name__ + output = csv.writer(open(benchmark_name + "_" + encoding_type + ".csv", "w")) + output.writerow( + [ + "name", + "dataset", + "lance_encoding_type", + "num_trials", + "num_rows", + "parquet_size", + "parquet_write_time", + "parquet_read_time", + "lance_size", + "lance_write_time", + "lance_read_time", + ] + ) + + PATH = dataset_dir + "sift1m.parquet" + orig_dataset = ds.dataset(PATH) + # orig_schema = orig_dataset.schema + + # for field in orig_schema: + # print(field.name, field.type) + + # target_columns = [ + # field.name for field in orig_schema if is_fixed_size_list(field.type) + # ] + # print(target_columns) + + table = orig_dataset.to_table() + parquet_path = "/tmp/parquet_ds.parquet" + + parquet_write_time = measure_parquet_write_time( + table, parquet_path, 
num_trials=num_trials + ) + + dataset = ds.dataset(parquet_path) + num_rows = dataset.count_rows() + parquet_size = os.path.getsize(parquet_path) / (1024**2) + + parquet_read_time = measure_pyarrow_read_time(parquet_path, num_trials=num_trials) + + lance_path = "/tmp/tpch.lancev2" + lance_write_time = measure_lance_write_time( + table, lance_path, num_trials=num_trials + ) + lance_size = os.path.getsize(lance_path) / (1024**2) + + lance_read_time = measure_lance_read_time( + lance_path, num_trials=num_trials, batch_size=1024 * 8 + ) + + output.writerow( + [ + benchmark_name, + dataset_name, + encoding_type, + num_trials, + num_rows, + parquet_size, + parquet_write_time, + parquet_read_time, + lance_size, + lance_write_time, + lance_read_time, + ] + ) + + # os.remove(parquet_path) + + # if os.path.isdir(lance_path): + # shutil.rmtree(lance_path) + # else: + # os.remove(lance_path) + + +if __name__ == "__main__": + if os.path.exists("benchmark_tpch_encodings.csv"): + os.remove("benchmark_tpch_encodings.csv") + + # benchmark_tpch_lineitem("/home/ubuntu/test/TPCH_SF1/") + benchmark_tpch_encodings( + "/home/ubuntu/test/TPCH_SF1/", + dataset_name="tpch_sf1", + encoding_type="plain_numeric", + ) + benchmark_tpch_encodings( + "/home/ubuntu/test/TPCH_SF1/", + dataset_name="tpch_sf1", + encoding_type="plain_non_numeric", + ) + benchmark_sift_vector_encodings( + "/home/ubuntu/test/", + dataset_name="sift1m", + encoding_type="plain_fixed_size_list", + num_trials=3, + ) From c9a9c4a3d1ec406a027e51e1ddc2ed8ae21ea9e3 Mon Sep 17 00:00:00 2001 From: raunaks13 Date: Thu, 23 May 2024 22:16:23 +0000 Subject: [PATCH 3/7] updated verbose messages --- python/python/benchmarks/test_v2_reader.py | 39 +++++++++++----------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/python/python/benchmarks/test_v2_reader.py b/python/python/benchmarks/test_v2_reader.py index 31607e709c..2ceb31bafd 100644 --- a/python/python/benchmarks/test_v2_reader.py +++ b/python/python/benchmarks/test_v2_reader.py @@ -42,7 +42,9 @@ def measure_pyarrow_read_time(path, num_trials, verbose=False): parquet_read_time += (end - start).total_seconds() if verbose: - print(f"Parquet Read Time: {parquet_read_time/trial}s") + print( + f"Parquet Read Time for trial {trial}: {(end - start).total_seconds()}s" + ) avg_read_time = parquet_read_time / num_trials print(f"Parquet Read Time: {avg_read_time}s") @@ -66,6 +68,12 @@ def measure_parquet_write_time(table, path, num_trials, verbose=False): end = datetime.now() parquet_write_time += (end - start).total_seconds() + if verbose: + print( + f"Parquet Write Time for trial {trial}: \ + {(end - start).total_seconds()}s" + ) + avg_write_time = parquet_write_time / num_trials print(f"Parquet Write Time: {avg_write_time}s") return avg_write_time @@ -87,10 +95,12 @@ def measure_lance_read_time(path, num_trials, batch_size, verbose=False): lance_read_time += (end - start).total_seconds() if verbose: - print(f"V2 Read Time: {lance_read_time/trial}s") + print( + f"Lance Read Time for trial {trial}: {(end - start).total_seconds()}s" + ) avg_read_time = lance_read_time / num_trials - print(f"V2 Read Time: {avg_read_time}s") + print(f"Lance Read Time: {avg_read_time}s") return avg_read_time @@ -115,6 +125,11 @@ def measure_lance_write_time(dataset, path, num_trials=10, verbose=False): end = datetime.now() lance_write_time += (end - start).total_seconds() + if verbose: + print( + f"Lance Write Time for trial {trial}: {(end - start).total_seconds()}s" + ) + avg_write_time = lance_write_time / num_trials 
print(f"Lance Write Time: {avg_write_time}s") return avg_write_time @@ -310,15 +325,6 @@ def benchmark_sift_vector_encodings( PATH = dataset_dir + "sift1m.parquet" orig_dataset = ds.dataset(PATH) - # orig_schema = orig_dataset.schema - - # for field in orig_schema: - # print(field.name, field.type) - - # target_columns = [ - # field.name for field in orig_schema if is_fixed_size_list(field.type) - # ] - # print(target_columns) table = orig_dataset.to_table() parquet_path = "/tmp/parquet_ds.parquet" @@ -359,13 +365,6 @@ def benchmark_sift_vector_encodings( ] ) - # os.remove(parquet_path) - - # if os.path.isdir(lance_path): - # shutil.rmtree(lance_path) - # else: - # os.remove(lance_path) - if __name__ == "__main__": if os.path.exists("benchmark_tpch_encodings.csv"): @@ -386,5 +385,5 @@ def benchmark_sift_vector_encodings( "/home/ubuntu/test/", dataset_name="sift1m", encoding_type="plain_fixed_size_list", - num_trials=3, + num_trials=5, ) From ebc36045761e8c9eb3439ecc60e8aca37ebaa79b Mon Sep 17 00:00:00 2001 From: raunaks13 Date: Sat, 25 May 2024 00:50:18 +0000 Subject: [PATCH 4/7] added timestamp encodings, increased dataset size --- python/python/benchmarks/test_v2_reader.py | 67 ++++++++++++++++------ 1 file changed, 49 insertions(+), 18 deletions(-) diff --git a/python/python/benchmarks/test_v2_reader.py b/python/python/benchmarks/test_v2_reader.py index 2ceb31bafd..a5730cf071 100644 --- a/python/python/benchmarks/test_v2_reader.py +++ b/python/python/benchmarks/test_v2_reader.py @@ -190,7 +190,9 @@ def benchmark_tpch_encodings( """ benchmark_name = benchmark_tpch_encodings.__name__ - output = csv.writer(open(benchmark_name + "_" + encoding_type + ".csv", "w")) + output = csv.writer( + open(benchmark_name + "_" + dataset_name + "_" + encoding_type + ".csv", "w") + ) output.writerow( [ "name", @@ -231,11 +233,16 @@ def benchmark_tpch_encodings( if encoding_type == "plain_numeric": target_types = [pa.int32(), pa.int64(), pa.float32(), pa.float64()] elif encoding_type == "plain_non_numeric": - target_types = [pa.string(), pa.date32()] + target_types = [pa.string()] + elif encoding_type == "plain_timestamp": + target_types = [pa.date32()] target_columns = [ field.name for field in orig_schema if field.type in target_types ] + print(target_columns) + if len(target_columns) == 0: + continue table = orig_dataset.to_table(columns=target_columns) parquet_path = "/tmp/parquet_ds.parquet" @@ -246,7 +253,7 @@ def benchmark_tpch_encodings( dataset = ds.dataset(parquet_path) num_rows = dataset.count_rows() - parquet_size = os.path.getsize(parquet_path) / (1024**2) + parquet_size = os.path.getsize(parquet_path) / (1024**3) parquet_read_time = measure_pyarrow_read_time( parquet_path, num_trials=num_trials @@ -256,7 +263,7 @@ def benchmark_tpch_encodings( lance_write_time = measure_lance_write_time( table, lance_path, num_trials=num_trials ) - lance_size = os.path.getsize(lance_path) / (1024**2) + lance_size = os.path.getsize(lance_path) / (1024**3) lance_read_time = measure_lance_read_time( lance_path, num_trials=num_trials, batch_size=1024 * 8 @@ -288,11 +295,11 @@ def benchmark_tpch_encodings( """ Data generation (sift) - nvecs = 10000 + nvecs = 1000000 ndims = 128 with open("../sift/sift_base.fvecs", mode="rb") as fobj: buf = fobj.read() - data = np.array(struct.unpack("<1280000f", + data = np.array(struct.unpack("<128000000f", buf[4 : 4 + 4 * nvecs * ndims])).reshape((nvecs, ndims)) dd = dict(zip(range(nvecs), data)) @@ -335,7 +342,7 @@ def benchmark_sift_vector_encodings( dataset = 
ds.dataset(parquet_path) num_rows = dataset.count_rows() - parquet_size = os.path.getsize(parquet_path) / (1024**2) + parquet_size = os.path.getsize(parquet_path) / (1024**3) parquet_read_time = measure_pyarrow_read_time(parquet_path, num_trials=num_trials) @@ -343,7 +350,7 @@ def benchmark_sift_vector_encodings( lance_write_time = measure_lance_write_time( table, lance_path, num_trials=num_trials ) - lance_size = os.path.getsize(lance_path) / (1024**2) + lance_size = os.path.getsize(lance_path) / (1024**3) lance_read_time = measure_lance_read_time( lance_path, num_trials=num_trials, batch_size=1024 * 8 @@ -367,23 +374,47 @@ def benchmark_sift_vector_encodings( if __name__ == "__main__": - if os.path.exists("benchmark_tpch_encodings.csv"): - os.remove("benchmark_tpch_encodings.csv") + # if os.path.exists("benchmark_tpch_encodings_plain_non_numeric.csv"): + # os.remove("benchmark_tpch_encodings_plain_non_numeric.csv") # benchmark_tpch_lineitem("/home/ubuntu/test/TPCH_SF1/") + # benchmark_tpch_encodings( + # "/home/ubuntu/test/TPCH_SF1/", + # dataset_name="tpch_sf1", + # encoding_type="plain_numeric", + # ) + # benchmark_tpch_encodings( + # "/home/ubuntu/test/TPCH_SF1/", + # dataset_name="tpch_sf1", + # encoding_type="plain_non_numeric", + # ) + # benchmark_tpch_encodings( + # "/home/ubuntu/test/TPCH_SF1/", + # dataset_name="tpch_sf1", + # encoding_type="plain_timestamp", + # ) + benchmark_tpch_encodings( - "/home/ubuntu/test/TPCH_SF1/", - dataset_name="tpch_sf1", + "/home/ubuntu/test/TPCH_SF10/", + dataset_name="tpch_sf10", encoding_type="plain_numeric", + num_trials=5, ) benchmark_tpch_encodings( - "/home/ubuntu/test/TPCH_SF1/", - dataset_name="tpch_sf1", + "/home/ubuntu/test/TPCH_SF10/", + dataset_name="tpch_sf10", encoding_type="plain_non_numeric", + num_trials=5, ) - benchmark_sift_vector_encodings( - "/home/ubuntu/test/", - dataset_name="sift1m", - encoding_type="plain_fixed_size_list", + benchmark_tpch_encodings( + "/home/ubuntu/test/TPCH_SF10/", + dataset_name="tpch_sf10", + encoding_type="plain_timestamp", num_trials=5, ) + # benchmark_sift_vector_encodings( + # "/home/ubuntu/test/", + # dataset_name="sift1m", + # encoding_type="plain_fixed_size_list", + # num_trials=5, + # ) From 91cb5f1c7695c4ff4c798b67417a1e69c2a20ae7 Mon Sep 17 00:00:00 2001 From: raunaks13 Date: Thu, 1 Aug 2024 00:35:23 +0000 Subject: [PATCH 5/7] updates --- ...est_v2_reader.py => test_v2_read_write.py} | 49 ++++++------------- 1 file changed, 16 insertions(+), 33 deletions(-) rename python/python/benchmarks/{test_v2_reader.py => test_v2_read_write.py} (90%) diff --git a/python/python/benchmarks/test_v2_reader.py b/python/python/benchmarks/test_v2_read_write.py similarity index 90% rename from python/python/benchmarks/test_v2_reader.py rename to python/python/benchmarks/test_v2_read_write.py index a5730cf071..40d22f23ef 100644 --- a/python/python/benchmarks/test_v2_reader.py +++ b/python/python/benchmarks/test_v2_read_write.py @@ -374,47 +374,30 @@ def benchmark_sift_vector_encodings( if __name__ == "__main__": - # if os.path.exists("benchmark_tpch_encodings_plain_non_numeric.csv"): - # os.remove("benchmark_tpch_encodings_plain_non_numeric.csv") - - # benchmark_tpch_lineitem("/home/ubuntu/test/TPCH_SF1/") - # benchmark_tpch_encodings( - # "/home/ubuntu/test/TPCH_SF1/", - # dataset_name="tpch_sf1", - # encoding_type="plain_numeric", - # ) - # benchmark_tpch_encodings( - # "/home/ubuntu/test/TPCH_SF1/", - # dataset_name="tpch_sf1", - # encoding_type="plain_non_numeric", - # ) - # benchmark_tpch_encodings( - # 
"/home/ubuntu/test/TPCH_SF1/", - # dataset_name="tpch_sf1", - # encoding_type="plain_timestamp", - # ) + # Should contain all table files + tpch_dir_path = "/tmp/TPCH_SF1/" + dataset_name = "tpch_sf1" + benchmark_tpch_lineitem(tpch_dir_path) benchmark_tpch_encodings( - "/home/ubuntu/test/TPCH_SF10/", - dataset_name="tpch_sf10", + tpch_dir_path, + dataset_name=dataset_name, encoding_type="plain_numeric", - num_trials=5, ) benchmark_tpch_encodings( - "/home/ubuntu/test/TPCH_SF10/", - dataset_name="tpch_sf10", + tpch_dir_path, + dataset_name=dataset_name, encoding_type="plain_non_numeric", - num_trials=5, ) benchmark_tpch_encodings( - "/home/ubuntu/test/TPCH_SF10/", - dataset_name="tpch_sf10", + tpch_dir_path, + dataset_name=dataset_name, encoding_type="plain_timestamp", + ) + + benchmark_sift_vector_encodings( + "/home/ubuntu/test/", + dataset_name="sift1m", + encoding_type="plain_fixed_size_list", num_trials=5, ) - # benchmark_sift_vector_encodings( - # "/home/ubuntu/test/", - # dataset_name="sift1m", - # encoding_type="plain_fixed_size_list", - # num_trials=5, - # ) From 413f3e92a0f0a204a4385f44d8b889f98bdfc04f Mon Sep 17 00:00:00 2001 From: raunaks13 Date: Fri, 2 Aug 2024 16:19:55 +0000 Subject: [PATCH 6/7] addressed comments --- python/python/benchmarks/test_v2_read_write.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/python/benchmarks/test_v2_read_write.py b/python/python/benchmarks/test_v2_read_write.py index 40d22f23ef..2d99b4b5b2 100644 --- a/python/python/benchmarks/test_v2_read_write.py +++ b/python/python/benchmarks/test_v2_read_write.py @@ -11,7 +11,7 @@ from lance.file import LanceFileReader, LanceFileWriter from lance.tracing import trace_to_chrome -trace_to_chrome(level="trace") +# trace_to_chrome(level="trace") """ Data generation (in-memory) @@ -227,9 +227,6 @@ def benchmark_tpch_encodings( orig_dataset = ds.dataset(PATH) orig_schema = orig_dataset.schema - # for field in orig_schema: - # print(field.name, field.type) - if encoding_type == "plain_numeric": target_types = [pa.int32(), pa.int64(), pa.float32(), pa.float64()] elif encoding_type == "plain_non_numeric": @@ -240,7 +237,6 @@ def benchmark_tpch_encodings( target_columns = [ field.name for field in orig_schema if field.type in target_types ] - print(target_columns) if len(target_columns) == 0: continue From 344a40c73b824eb23ef2b9b07416180b0d07a21e Mon Sep 17 00:00:00 2001 From: raunaks13 Date: Fri, 2 Aug 2024 16:20:39 +0000 Subject: [PATCH 7/7] lint --- python/python/benchmarks/test_v2_read_write.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/python/benchmarks/test_v2_read_write.py b/python/python/benchmarks/test_v2_read_write.py index 2d99b4b5b2..ecb1acee1e 100644 --- a/python/python/benchmarks/test_v2_read_write.py +++ b/python/python/benchmarks/test_v2_read_write.py @@ -9,7 +9,8 @@ import pyarrow.dataset as ds import pyarrow.parquet as pq from lance.file import LanceFileReader, LanceFileWriter -from lance.tracing import trace_to_chrome + +# from lance.tracing import trace_to_chrome # trace_to_chrome(level="trace")