diff --git a/.gitignore b/.gitignore index 6a84e2b8..60ec308d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.bak .idea .clickbench +*.parquet hits.csv diff --git a/chdb-dataframe/benchmark.sh b/chdb-dataframe/benchmark.sh new file mode 100755 index 00000000..1ac70ad0 --- /dev/null +++ b/chdb-dataframe/benchmark.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# Install + +sudo apt-get update +sudo apt-get install -y python3-pip +pip install pandas chdb + +# Download the data +wget --no-verbose --continue https://datasets.clickhouse.com/hits_compatible/athena/hits.parquet + +# Run the queries + +./run.sh 2>&1 | tee log.txt diff --git a/chdb-dataframe/queries.sql b/chdb-dataframe/queries.sql new file mode 100644 index 00000000..67789487 --- /dev/null +++ b/chdb-dataframe/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM Python(hits); +SELECT COUNT(*) FROM Python(hits) WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM Python(hits); +SELECT AVG(UserID) FROM Python(hits); +SELECT COUNT(DISTINCT UserID) FROM Python(hits); +SELECT COUNT(DISTINCT SearchPhrase) FROM Python(hits); +SELECT MIN(EventDate), MAX(EventDate) FROM Python(hits); +SELECT AdvEngineID, COUNT(*) FROM Python(hits) WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM Python(hits) GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM Python(hits) GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM Python(hits) WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM Python(hits) WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchPhrase 
ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM Python(hits) GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM Python(hits) GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM Python(hits) GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM Python(hits) GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM Python(hits) WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM Python(hits) WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM Python(hits) WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM Python(hits) WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM Python(hits) WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM Python(hits) WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM Python(hits) WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM Python(hits) WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM Python(hits) WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM 
Python(hits) WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth 
+ 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM Python(hits); +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM Python(hits) GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM Python(hits) GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM Python(hits) GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM Python(hits) GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate 
<= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
+SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
+SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
+SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
+SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000
\ No newline at end of file
diff --git a/chdb-dataframe/query.py b/chdb-dataframe/query.py
new file mode 100755
index 00000000..aba1cd21
--- /dev/null
+++ b/chdb-dataframe/query.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+"""Benchmark chDB running the ClickBench queries over a pandas DataFrame.
+
+Loads hits.parquet into pandas, runs every statement in queries.sql three
+times through chdb.query, and writes the timings as JSON under results/.
+"""
+
+import pandas as pd
+import timeit
+import datetime
+import json
+import chdb
+
+# Time only the Parquet -> DataFrame read.
+start = timeit.default_timer()
+hits = pd.read_parquet("hits.parquet")
+end = timeit.default_timer()
+load_time = end - start
+
+# Taken before the type fix-ups below; memory_usage() is called without
+# deep=True, so object columns are presumably counted shallowly -- verify.
+dataframe_size = hits.memory_usage().sum()
+
+# fix some types
+hits["EventTime"] = pd.to_datetime(hits["EventTime"], unit="s")
+hits["EventDate"] = pd.to_datetime(hits["EventDate"], unit="D")
+
+# fix all object columns to string
+for col in hits.columns:
+    if hits[col].dtype == "O":
+        hits[col] = hits[col].astype(str)
+
+# One statement per line. Blank lines are dropped so that a trailing or stray
+# empty line in queries.sql is never sent to the engine as a query.
+with open("queries.sql") as f:
+    queries = [line.strip() for line in f if line.strip()]
+
+# The statements refer to the DataFrame as Python(hits); presumably chDB
+# resolves that name from this scope, so keep `hits` and this loop at module
+# level -- TODO confirm before moving either into a function.
+queries_times = []
+for q in queries:
+    times = []
+    for _ in range(3):
+        start = timeit.default_timer()
+        chdb.query(q, "Null")  # return value is not needed, only the timing
+        end = timeit.default_timer()
+        times.append(end - start)
+    queries_times.append(times)
+
+result_json = {
+    "system": "chDB (DataFrame)",
+    "date": datetime.date.today().strftime("%Y-%m-%d"),
+    "machine": "c6a.metal, 500gb gp2",
+    "cluster_size": 1,
+    "comment": "",
+    "tags": [
+        "C++",
+        "column-oriented",
+        "embedded",
+        "stateless",
+        "serverless",
+        "dataframe",
+        "ClickHouse derivative",
+    ],
+    # NOTE(review): load_time is measured above but 0 is reported -- confirm
+    # that this is intended.
+    "load_time": 0,
+    "data_size": int(dataframe_size),
+    "result": queries_times,
+}
+
+# Choose the machine label and output file from the CPU model. The file is
+# read inside a with-block so the handle is closed instead of leaked.
+with open("/proc/cpuinfo") as f:
+    cpuinfo = f.read()
+
+if "AMD EPYC 9654" in cpuinfo:
+    result_json["machine"] = "EPYC 9654, 384G"
+    result_path = "results/epyc-9654.json"
+else:
+    result_path = "results/c6a.metal.json"
+
+with open(result_path, "w") as f:
+    json.dump(result_json, f, indent=4)
diff --git a/chdb-dataframe/results/c6a.metal.json b/chdb-dataframe/results/c6a.metal.json
new file mode 100644
index 00000000..bb4da068
--- /dev/null
+++ b/chdb-dataframe/results/c6a.metal.json
@@ -0,0 +1,235 @@
+{
+    "system": "chDB (DataFrame)",
+    "date": "2024-09-09",
+    "machine": "c6a.metal, 500gb gp2",
+    "cluster_size": 1,
+    "comment": "",
+    "tags": [
+        "C++",
+        "column-oriented",
+        "embedded",
+        "stateless",
+        "serverless",
+        "dataframe",
+        "ClickHouse derivative"
+    ],
+    "load_time": 0,
+    "data_size": 46998823722,
+    "result": [
+        [
+
0.05626875200005088, + 0.03751970800021809, + 0.03710983100017984 + ], + [ + 0.06512281899995287, + 0.05849275699983991, + 0.0585379379999722 + ], + [ + 0.044165491000057955, + 0.041299934000107896, + 0.040226853000149276 + ], + [ + 0.08134618700023566, + 0.040427908999845386, + 0.04047265000008338 + ], + [ + 0.24362793399996008, + 0.2186527310004749, + 0.2025154820003081 + ], + [ + 0.2937789709999379, + 0.29077527099980216, + 0.2973619129998042 + ], + [ + 0.04860631699921214, + 0.044442590999096865, + 0.04185208899980353 + ], + [ + 0.07148085600010745, + 0.087940534000154, + 0.07362535899983413 + ], + [ + 0.32479551800133777, + 0.31021195799985435, + 0.3000845909991767 + ], + [ + 0.3567556970001533, + 0.3498798099999476, + 0.36433636799984015 + ], + [ + 0.1430718550000165, + 0.14249916199969448, + 0.13584852000030878 + ], + [ + 0.13416774699999223, + 0.12045774299986078, + 0.126722227999835 + ], + [ + 0.3129409140001371, + 0.39802245999999286, + 0.27576033200011807 + ], + [ + 0.30749968300006003, + 0.30776443899958394, + 0.2933942620002199 + ], + [ + 0.2840204679996532, + 0.29736796199995297, + 0.2656048210001245 + ], + [ + 0.16236161799997717, + 0.14227574799997456, + 0.1577743770003508 + ], + [ + 0.41774886599978345, + 0.45844158400041124, + 0.4236037320001742 + ], + [ + 0.343798914999752, + 0.34234521600001244, + 0.3242654260002382 + ], + [ + 0.8645628289996239, + 0.7159582540002702, + 0.701609888999883 + ], + [ + 0.07303793199980646, + 0.0385745369999313, + 0.039382633000059286 + ], + [ + 0.7685791720014095, + 0.7017175380005938, + 0.7004018799998448 + ], + [ + 0.7086737089998678, + 0.7338668900001721, + 0.726541903999987 + ], + [ + 1.0143638360000296, + 1.025565608999841, + 0.9419111859997429 + ], + [ + 4.812288134000028, + 2.167691587000263, + 2.528547120999974 + ], + [ + 0.155205888999717, + 0.14651630200023646, + 0.15077497899983427 + ], + [ + 0.1408380660000148, + 0.14518392399986624, + 0.13928737500009447 + ], + [ + 0.1470085519999884, + 
0.15105727499985733, + 0.1438814180000918 + ], + [ + 0.8403685159992165, + 0.7649107939996611, + 0.8209212080000725 + ], + [ + 1.9075914280001598, + 1.8646036999998614, + 1.885373637000157 + ], + [ + 0.07688073700001041, + 0.07453166799996325, + 0.07560217999980523 + ], + [ + 0.20158837400003904, + 0.20706678800024747, + 0.19952737999983583 + ], + [ + 0.2738630439998815, + 0.2761814330001471, + 0.2812519489998522 + ], + [ + 0.7870267199996306, + 0.709162213000036, + 0.7544576690002032 + ], + [ + 1.2057435320002696, + 1.1144656840000607, + 1.1528750570003467 + ], + [ + 1.2744926979999036, + 1.227631830000064, + 1.2613582039998619 + ], + [ + 0.1441288920000261, + 0.1581041039999036, + 0.1318917770004191 + ], + [ + 0.7185039579999284, + 0.6852749829999993, + 0.6816314070001681 + ], + [ + 0.49516889899996386, + 0.46560566200014364, + 0.4326922939999349 + ], + [ + 0.7060461809996923, + 0.2074475480003457, + 0.2127349039997171 + ], + [ + 0.8890890540001237, + 0.4336082350000652, + 0.7149529159999474 + ], + [ + 0.09202534199994261, + 0.08829310200007967, + 0.08950286199979018 + ], + [ + 0.08598820699990029, + 0.08675313599997025, + 0.08428598599994075 + ], + [ + 0.08334934399999838, + 0.08340355500013175, + 0.0796592659999078 + ] + ] +} \ No newline at end of file diff --git a/chdb-dataframe/run.sh b/chdb-dataframe/run.sh new file mode 100755 index 00000000..bced9948 --- /dev/null +++ b/chdb-dataframe/run.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +./query.py diff --git a/duckdb-dataframe/benchmark.sh b/duckdb-dataframe/benchmark.sh new file mode 100755 index 00000000..75933a26 --- /dev/null +++ b/duckdb-dataframe/benchmark.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# Install + +sudo apt-get update +sudo apt-get install -y python3-pip +pip install pandas duckdb + +# Download the data +wget --no-verbose --continue https://datasets.clickhouse.com/hits_compatible/athena/hits.parquet + +# Run the queries + +./run.sh 2>&1 | tee log.txt diff --git a/duckdb-dataframe/queries.sql 
b/duckdb-dataframe/queries.sql new file mode 100644 index 00000000..b4115ee3 --- /dev/null +++ b/duckdb-dataframe/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits 
WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), 
SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, 
ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' 
AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
+SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;
diff --git a/duckdb-dataframe/query.py b/duckdb-dataframe/query.py
new file mode 100755
index 00000000..d1715495
--- /dev/null
+++ b/duckdb-dataframe/query.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+"""Benchmark DuckDB running the ClickBench queries over a pandas DataFrame.
+
+Loads hits.parquet into pandas, runs every statement in queries.sql three
+times on one DuckDB connection, and writes the timings as JSON under results/.
+"""
+
+import pandas as pd
+import timeit
+import datetime
+import json
+import duckdb
+
+# Time only the Parquet -> DataFrame read.
+start = timeit.default_timer()
+hits = pd.read_parquet("hits.parquet")
+end = timeit.default_timer()
+load_time = end - start
+
+# Taken before the type fix-ups below; memory_usage() is called without
+# deep=True, so object columns are presumably counted shallowly -- verify.
+dataframe_size = hits.memory_usage().sum()
+
+# fix some types
+hits["EventTime"] = pd.to_datetime(hits["EventTime"], unit="s")
+hits["EventDate"] = pd.to_datetime(hits["EventDate"], unit="D")
+
+# fix all object columns to string
+for col in hits.columns:
+    if hits[col].dtype == "O":
+        hits[col] = hits[col].astype(str)
+
+# One statement per line. Blank lines are dropped so that a trailing or stray
+# empty line in queries.sql is never sent to the engine as a query.
+with open("queries.sql") as f:
+    queries = [line.strip() for line in f if line.strip()]
+
+# The statements select FROM hits although no table is created here;
+# presumably DuckDB resolves that name to the DataFrame in this scope, so keep
+# `hits` and this loop at module level -- TODO confirm before refactoring.
+conn = duckdb.connect()
+queries_times = []
+for q in queries:
+    times = []
+    for _ in range(3):
+        start = timeit.default_timer()
+        conn.execute(q).fetchall()  # rows are fetched inside the timed region
+        end = timeit.default_timer()
+        times.append(end - start)
+    queries_times.append(times)
+
+result_json = {
+    "system": "DuckDB (DataFrame)",
+    "date": datetime.date.today().strftime("%Y-%m-%d"),
+    "machine": "c6a.metal, 500gb gp2",
+    "cluster_size": 1,
+    "comment": "",
+    "tags": [
+        "C++",
+        "column-oriented",
+        "embedded",
+        "stateless",
+        "serverless",
+        "dataframe",
+    ],
+    # NOTE(review): load_time is measured above but 0 is reported -- confirm
+    # that this is intended.
+    "load_time": 0,
+    "data_size": int(dataframe_size),
+    "result": queries_times,
+}
+
+# Choose the machine label and output file from the CPU model. The file is
+# read inside a with-block so the handle is closed instead of leaked.
+with open("/proc/cpuinfo") as f:
+    cpuinfo = f.read()
+
+if "AMD EPYC 9654" in cpuinfo:
+    result_json["machine"] = "EPYC 9654, 384G"
+    result_path = "results/epyc-9654.json"
+else:
+    result_path = "results/c6a.metal.json"
+
+with open(result_path, "w") as f:
+    json.dump(result_json, f, indent=4)
diff --git a/duckdb-dataframe/results/c6a.metal.json b/duckdb-dataframe/results/c6a.metal.json
new file mode 100644
index 00000000..ca1b6e05
--- /dev/null
+++ b/duckdb-dataframe/results/c6a.metal.json
@@ -0,0 +1,234 @@
+{
+    "system": "DuckDB (DataFrame)",
+    "date": "2024-09-09",
+    "machine": "c6a.metal, 500gb gp2",
+    "cluster_size": 1,
+    "comment": "",
+    "tags": [
+        "C++",
+        "column-oriented",
+        "embedded",
+        "stateless",
+        "serverless",
+        "dataframe"
+    ],
+    "load_time": 0,
+    "data_size": 46998823722,
+    "result": [
+        [
+            0.05120044299997062,
+            0.01451553300012165,
+            0.016491983000150867
+        ],
+        [
+            0.02931129100011276,
+            0.01839072099983241,
+            0.018927282000049672
+        ],
+        [
+            0.02511615699995673,
+            0.024862691000180348,
+            0.02501659499989728
+        ],
+        [
+            0.03695813600006659,
+            0.02565254800015282,
+            0.02570289799996317
+        ],
+        [
+            0.27191674599998805,
+            0.27759674200010522,
+            0.2588871349999863
+        ],
+        [
+            0.3774085860000923,
+            0.32757853100006287,
+            0.33302718000013556
+        ],
+        [
+            0.038237650999917605,
+            0.024178557999903205,
+            0.02490608199991584
+        ],
+        [
+            0.030545426000117004,
+            0.029484825000054116,
+            0.03014319799990517
+        ],
+        [
+            0.35205814599989935,
+            0.27471974500017495,
+            0.26925484400007917
+        ],
+        [
+            0.39639628200002335,
+            0.28721119700012423,
+            0.28630923800014898
+        ],
+        [
+            0.06255191200011723,
+            0.06675710700005766,
+            0.06318350399988049
+        ],
+        [
+            0.09806364399994891,
+            0.07049120199985737,
+            0.07045088099994246
+        ],
+        [
+            0.25685957099994994,
+            0.2923465579999629,
+            0.24080298700005187
+        ],
+        [
+            0.9493479239999397,
+            0.8976369989999512,
+
0.9234879899999214 + ], + [ + 0.3275632019999921, + 0.2994161829999848, + 0.2994098889998895 + ], + [ + 0.2028798320000078, + 0.19643380100002105, + 0.1967687379999552 + ], + [ + 0.7205587729998115, + 0.70485905700006697, + 0.69305262400014726 + ], + [ + 0.62671780199998466, + 0.46055906399997184, + 0.54356512099998326 + ], + [ + 1.7618106520000765, + 1.4338731269999244, + 1.4774597579998499 + ], + [ + 0.11568013700002666, + 0.11198447299989311, + 0.1095309229999657 + ], + [ + 0.8348995979999927, + 0.7820795490000819, + 0.7994894509999085 + ], + [ + 0.4610988360000192, + 0.41369037799995567, + 0.39819625199995244 + ], + [ + 1.174443158000031, + 1.1575830060000044, + 1.1208328130000154 + ], + [ + 2.0461591100000533, + 1.9994193829999858, + 1.9205524290000085 + ], + [ + 0.08894231599992963, + 0.07940710399998352, + 0.07883891300002688 + ], + [ + 0.2340729590000592, + 0.20119833499993547, + 0.20270610500015209 + ], + [ + 0.10939303000009204, + 0.09853198099995097, + 0.10583861800000705 + ], + [ + 0.8641946679999819, + 0.7357956940001259, + 0.7316882210001495 + ], + [ + 4.0055917649999174, + 3.9187831119999828, + 3.955649695000102 + ], + [ + 0.7100810240000101, + 0.69931781800000863, + 0.6358406850000392 + ], + [ + 0.30175355000005766, + 0.28016329600000063, + 0.2704750220000642 + ], + [ + 0.3884521920000225, + 0.2658412359999147, + 0.3451120859999719 + ], + [ + 1.3527775069998825, + 1.1267246480000722, + 1.1895273829998587 + ], + [ + 1.9144373749999431, + 1.9059109310001531, + 1.8425559129998874 + ], + [ + 1.7254979159998584, + 1.7914850100000422, + 1.7172489999999016 + ], + [ + 0.2766488639999807, + 0.3223322660001031, + 0.33668833700003233 + ], + [ + 0.45268178699984674, + 0.43472940599995127, + 0.4311858629998824 + ], + [ + 1.162701579000077, + 0.9709281650000321, + 0.9300721709998925 + ], + [ + 0.3190891100000499, + 0.3109210760001133, + 0.30955156799996075 + ], + [ + 0.5417726650000532, + 0.5106009759999779, + 0.4813571659999525 + ], + [ + 0.02814393799985737, + 
0.0257830399998511,
+            0.025415173000055802
+        ],
+        [
+            0.035018445000114298,
+            0.026704358999950273,
+            0.03011910799978068
+        ],
+        [
+            0.03502820700009579,
+            0.03178924200005895,
+            0.03449460599995291
+        ]
+    ]
+}
\ No newline at end of file
diff --git a/duckdb-dataframe/run.sh b/duckdb-dataframe/run.sh
new file mode 100755
index 00000000..bced9948
--- /dev/null
+++ b/duckdb-dataframe/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+./query.py
diff --git a/pandas/benchmark.sh b/pandas/benchmark.sh
new file mode 100755
index 00000000..814b3efe
--- /dev/null
+++ b/pandas/benchmark.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# Install
+
+sudo apt-get update
+sudo apt-get install -y python3-pip
+pip install pandas pyarrow
+
+# Download the data
+wget --no-verbose --continue https://datasets.clickhouse.com/hits_compatible/athena/hits.parquet
+
+# Run the queries
+
+./run.sh 2>&1 | tee log.txt
diff --git a/pandas/query.py b/pandas/query.py
new file mode 100755
index 00000000..ee44e35a
--- /dev/null
+++ b/pandas/query.py
@@ -0,0 +1,482 @@
+#!/usr/bin/env python3
+
+import pandas as pd
+import timeit
+import datetime
+import json
+
+# Time only the parquet read; the elapsed seconds are reported as "load_time".
+start = timeit.default_timer()
+hits = pd.read_parquet("hits.parquet")
+end = timeit.default_timer()
+load_time = end - start
+
+# Shallow size of the frame as loaded (memory_usage() is called without deep=True).
+dataframe_size = hits.memory_usage().sum()
+
+# print("Dataframe(numpy) size:", dataframe_size, "bytes")
+
+# fix some types
+hits["EventTime"] = pd.to_datetime(hits["EventTime"], unit="s")
+hits["EventDate"] = pd.to_datetime(hits["EventDate"], unit="D")
+
+# fix all object columns to string
+# NOTE(review): if the parquet engine yields bytes objects here, astype(str) stores
+# their "b'...'" repr and the `!= ""` filters below never exclude rows -- confirm.
+for col in hits.columns:
+    if hits[col].dtype == "O":
+        hits[col] = hits[col].astype(str)
+
+
+def resolution_width_sums(x):
+    """Q29: return SUM(ResolutionWidth + i) for every i in 0..89 as a tuple."""
+    # Widen once so that adding the offset cannot wrap a narrow integer dtype.
+    width = x["ResolutionWidth"].astype("int64")
+    return tuple((width + offset).sum() for offset in range(90))
+
+
+# 0: No., 1: SQL, 2: Pandas
+queries = [
+    ("Q0", "SELECT COUNT(*) FROM hits;", lambda x: len(x)),
+    (
+        "Q1",
+        "SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;",
+        lambda x: int((x["AdvEngineID"] != 0).sum()),
+    ),
+    (
+        "Q2",
+        "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;",
+        lambda x: (x["AdvEngineID"].sum(), x.shape[0], x["ResolutionWidth"].mean()),
+    ),
+    (
+        "Q3",
+        "SELECT AVG(UserID) FROM hits;",
+        lambda x: x["UserID"].mean(),
+    ),
+    (
+        "Q4",
+        "SELECT COUNT(DISTINCT UserID) FROM hits;",
+        lambda x: x["UserID"].nunique(),
+    ),
+    (
+        "Q5",
+        "SELECT COUNT(DISTINCT SearchPhrase) FROM hits;",
+        lambda x: x["SearchPhrase"].nunique(),
+    ),
+    (
+        "Q6",
+        "SELECT MIN(EventDate), MAX(EventDate) FROM hits;",
+        lambda x: (x["EventDate"].min(), x["EventDate"].max()),
+    ),
+    (
+        "Q7",
+        "SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;",
+        lambda x: x[x["AdvEngineID"] != 0]
+        .groupby("AdvEngineID")
+        .size()
+        .sort_values(ascending=False),
+    ),
+    (
+        "Q8",
+        "SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;",
+        lambda x: x.groupby("RegionID")["UserID"].nunique().nlargest(10),
+    ),
+    (
+        "Q9",
+        "SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;",
+        # COUNT(*) is materialised as column "c" so that ORDER BY c DESC can use it.
+        lambda x: x.groupby("RegionID")
+        .agg(
+            sum_adv_engine_id=("AdvEngineID", "sum"),
+            c=("AdvEngineID", "size"),
+            avg_resolution_width=("ResolutionWidth", "mean"),
+            u=("UserID", "nunique"),
+        )
+        .nlargest(10, "c"),
+    ),
+    (
+        "Q10",
+        "SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;",
+        lambda x: x[x["MobilePhoneModel"] != ""]
+        .groupby("MobilePhoneModel")["UserID"]
+        .nunique()
+        .nlargest(10),
+    ),
+    (
+        "Q11",
+        "SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;",
+        lambda x: x[x["MobilePhoneModel"] != ""]
+        .groupby(["MobilePhone", "MobilePhoneModel"])["UserID"]
+        .nunique()
+        .nlargest(10),
+    ),
+    (
+        "Q12",
+        "SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;",
+        lambda x: x[x["SearchPhrase"] != ""]
+        .groupby("SearchPhrase")
+        .size()
+        .nlargest(10),
+    ),
+    (
+        "Q13",
+        "SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;",
+        lambda x: x[x["SearchPhrase"] != ""]
+        .groupby("SearchPhrase")["UserID"]
+        .nunique()
+        .nlargest(10),
+    ),
+    (
+        "Q14",
+        "SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;",
+        lambda x: x[x["SearchPhrase"] != ""]
+        .groupby(["SearchEngineID", "SearchPhrase"])
+        .size()
+        .nlargest(10),
+    ),
+    (
+        "Q15",
+        "SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;",
+        lambda x: x.groupby("UserID").size().nlargest(10),
+    ),
+    (
+        "Q16",
+        "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;",
+        lambda x: x.groupby(["UserID", "SearchPhrase"]).size().nlargest(10),
+    ),
+    (
+        "Q17",
+        "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;",
+        lambda x: x.groupby(["UserID", "SearchPhrase"]).size().head(10),
+    ),
+    (
+        "Q18",
+        "SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;",
+        lambda x: x.groupby([x["UserID"], x["EventTime"].dt.minute, "SearchPhrase"])
+        .size()
+        .nlargest(10),
+    ),
+    (
+        "Q19",
+        "SELECT UserID FROM hits WHERE UserID = 435090932899640449;",
+        lambda x: x.loc[x["UserID"] == 435090932899640449, "UserID"],
+    ),
+    (
+        "Q20",
+        "SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';",
+        lambda x: int(x["URL"].str.contains("google", regex=False).sum()),
+    ),
+    (
+        "Q21",
+        "SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;",
+        lambda x: x[
+            x["URL"].str.contains("google", regex=False) & (x["SearchPhrase"] != "")
+        ]
+        .groupby("SearchPhrase")
+        .agg(min_url=("URL", "min"), c=("URL", "size"))
+        .nlargest(10, "c"),
+    ),
+    (
+        "Q22",
+        "SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;",
+        # LIKE is literal text; regex=False stops "." from matching any character.
+        lambda x: x[
+            x["Title"].str.contains("Google", regex=False)
+            & ~x["URL"].str.contains(".google.", regex=False)
+            & (x["SearchPhrase"] != "")
+        ]
+        .groupby("SearchPhrase")
+        .agg(
+            min_url=("URL", "min"),
+            min_title=("Title", "min"),
+            c=("URL", "size"),
+            u=("UserID", "nunique"),
+        )
+        .nlargest(10, "c"),
+    ),
+    (
+        "Q23",
+        "SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;",
+        lambda x: x[x["URL"].str.contains("google", regex=False)]
+        .sort_values(by="EventTime")
+        .head(10),
+    ),
+    (
+        "Q24",
+        "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;",
+        lambda x: x[x["SearchPhrase"] != ""]
+        .sort_values(by="EventTime")[["SearchPhrase"]]
+        .head(10),
+    ),
+    (
+        "Q25",
+        "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;",
+        lambda x: x[x["SearchPhrase"] != ""]
+        .sort_values(by="SearchPhrase")[["SearchPhrase"]]
+        .head(10),
+    ),
+    (
+        "Q26",
+        "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;",
+        lambda x: x[x["SearchPhrase"] != ""]
+        .sort_values(by=["EventTime", "SearchPhrase"])[["SearchPhrase"]]
+        .head(10),
+    ),
+    (
+        "Q27",
+        "SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;",
+        # Aggregate per CounterID first; HAVING then filters the aggregated groups.
+        lambda x: x.loc[x["URL"] != "", ["CounterID", "URL"]]
+        .assign(url_len=lambda d: d["URL"].str.len())
+        .groupby("CounterID")
+        .agg(l=("url_len", "mean"), c=("url_len", "size"))
+        .query("c > 100000")
+        .nlargest(25, "l"),
+    ),
+    (
+        "Q28",
+        "SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;",
+        # str.replace mirrors REGEXP_REPLACE: non-matching referers are kept unchanged.
+        lambda x: x.loc[x["Referer"] != "", ["Referer"]]
+        .assign(
+            k=lambda d: d["Referer"].str.replace(
+                r"^https?://(?:www\.)?([^/]+)/.*$", r"\1", regex=True
+            ),
+            referer_len=lambda d: d["Referer"].str.len(),
+        )
+        .groupby("k")
+        .agg(
+            l=("referer_len", "mean"),
+            c=("referer_len", "size"),
+            min_referer=("Referer", "min"),
+        )
+        .query("c > 100000")
+        .nlargest(25, "l"),
+    ),
+    (
+        "Q29",
+        "SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;",
+        resolution_width_sums,
+    ),
+    (
+        "Q30",
+        "SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;",
+        lambda x: x[x["SearchPhrase"] != ""]
+        .groupby(["SearchEngineID", "ClientIP"])
+        .agg(
+            c=("SearchEngineID", "size"),
+            IsRefreshSum=("IsRefresh", "sum"),
+            AvgResolutionWidth=("ResolutionWidth", "mean"),
+        )
+        .nlargest(10, "c"),
+    ),
+    (
+        "Q31",
+        "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;",
+        lambda x: x[x["SearchPhrase"] != ""]
+        .groupby(["WatchID", "ClientIP"])
+        .agg(
+            c=("WatchID", "size"),
+            IsRefreshSum=("IsRefresh", "sum"),
+            AvgResolutionWidth=("ResolutionWidth", "mean"),
+        )
+        .nlargest(10, "c"),
+    ),
+    (
+        "Q32",
+        "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;",
+        lambda x: x.groupby(["WatchID", "ClientIP"])
+        .agg(
+            c=("WatchID", "size"),
+            IsRefreshSum=("IsRefresh", "sum"),
+            AvgResolutionWidth=("ResolutionWidth", "mean"),
+        )
+        .nlargest(10, "c"),
+    ),
+    (
+        "Q33",
+        "SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;",
+        lambda x: x.groupby("URL").size().nlargest(10).reset_index(name="c"),
+    ),
+    (
+        "Q34",
+        "SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;",
+        lambda x: x.groupby(["URL"]).size().nlargest(10).reset_index(name="c"),
+    ),
+    (
+        "Q35",
+        "SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;",
+        lambda x: x.assign(
+            **{f"ClientIP_minus_{i}": x["ClientIP"] - i for i in range(1, 4)}
+        )
+        .groupby(
+            ["ClientIP", "ClientIP_minus_1", "ClientIP_minus_2", "ClientIP_minus_3"]
+        )
+        .size()
+        .nlargest(10)
+        .reset_index(name="c"),
+    ),
+    (
+        "Q36",
+        "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;",
+        lambda x: x[
+            (x["CounterID"] == 62)
+            & (x["EventDate"] >= "2013-07-01")
+            & (x["EventDate"] <= "2013-07-31")
+            & (x["DontCountHits"] == 0)
+            & (x["IsRefresh"] == 0)
+            & (x["URL"] != "")
+        ]
+        .groupby("URL")
+        .size()
+        .nlargest(10),
+    ),
+    (
+        "Q37",
+        "SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;",
+        lambda x: x[
+            (x["CounterID"] == 62)
+            & (x["EventDate"] >= "2013-07-01")
+            & (x["EventDate"] <= "2013-07-31")
+            & (x["DontCountHits"] == 0)
+            & (x["IsRefresh"] == 0)
+            & (x["Title"] != "")
+        ]
+        .groupby("Title")
+        .size()
+        .nlargest(10),
+    ),
+    (
+        "Q38",
+        "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;",
+        # Rank OFFSET + LIMIT rows first; slicing a 10-row top-N at 1000 is always empty.
+        lambda x: x[
+            (x["CounterID"] == 62)
+            & (x["EventDate"] >= "2013-07-01")
+            & (x["EventDate"] <= "2013-07-31")
+            & (x["IsRefresh"] == 0)
+            & (x["IsLink"] != 0)
+            & (x["IsDownload"] == 0)
+        ]
+        .groupby("URL")
+        .size()
+        .nlargest(1010)
+        .iloc[1000:1010]
+        .reset_index(name="PageViews"),
+    ),
+    (
+        "Q39",
+        "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;",
+        lambda x: x[
+            (x["CounterID"] == 62)
+            & (x["EventDate"] >= "2013-07-01")
+            & (x["EventDate"] <= "2013-07-31")
+            & (x["IsRefresh"] == 0)
+        ]
+        .assign(
+            # CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END
+            Src=lambda d: d["Referer"].where(
+                (d["SearchEngineID"] == 0) & (d["AdvEngineID"] == 0), ""
+            ),
+            Dst=lambda d: d["URL"],
+        )
+        .groupby(["TraficSourceID", "SearchEngineID", "AdvEngineID", "Src", "Dst"])
+        .size()
+        .nlargest(1010)
+        .iloc[1000:1010]
+        .reset_index(name="PageViews"),
+    ),
+    (
+        "Q40",
+        "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;",
+        lambda x: x[
+            (x["CounterID"] == 62)
+            & (x["EventDate"] >= "2013-07-01")
+            & (x["EventDate"] <= "2013-07-31")
+            & (x["IsRefresh"] == 0)
+            & (x["TraficSourceID"].isin([-1, 6]))
+            & (x["RefererHash"] == 3594120000172545465)
+        ]
+        .groupby(["URLHash", "EventDate"])
+        .size()
+        .nlargest(110)
+        .iloc[100:110]
+        .reset_index(name="PageViews"),
+    ),
+    (
+        "Q41",
+        "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;",
+        lambda x: x[
+            (x["CounterID"] == 62)
+            & (x["EventDate"] >= "2013-07-01")
+            & (x["EventDate"] <= "2013-07-31")
+            & (x["IsRefresh"] == 0)
+            & (x["DontCountHits"] == 0)
+            & (x["URLHash"] == 2868770270353813622)
+        ]
+        .groupby(["WindowClientWidth", "WindowClientHeight"])
+        .size()
+        .nlargest(10010)
+        .iloc[10000:10010]
+        .reset_index(name="PageViews"),
+    ),
+    (
+        "Q42",
+        "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;",
+        # Group on the floored timestamp so that only minutes with rows form groups.
+        lambda x: x.loc[
+            (x["CounterID"] == 62)
+            & (x["EventDate"] >= "2013-07-14")
+            & (x["EventDate"] <= "2013-07-15")
+            & (x["IsRefresh"] == 0)
+            & (x["DontCountHits"] == 0),
+            "EventTime",
+        ]
+        .dt.floor("min")
+        .rename("M")
+        .to_frame()
+        .groupby("M")
+        .size()
+        .iloc[1000:1010]
+        .reset_index(name="PageViews"),
+    ),
+]
+
+queries_times = []
+for q in queries:
+    times = []
+    for _ in range(3):
+        start = timeit.default_timer()
+        try:
+            result = q[2](hits)
+        except Exception as exc:
+            # Record a failed run as null rather than aborting the remaining queries.
+            print(f"{q[0]} failed: {exc!r}")
+            times.append(None)
+            continue
+        end = timeit.default_timer()
+        times.append(end - start)
+    queries_times.append(times)
+
+result_json = {
+    "system": "Pandas (DataFrame)",
+    "date": datetime.date.today().strftime("%Y-%m-%d"),
+    "machine": "c6a.metal, 500gb gp2",
+    "cluster_size": 1,
+    "comment": "",
+    "tags": [
+        "C++",
+        "column-oriented",
+        "embedded",
+        "stateless",
+        "serverless",
+        "dataframe",
+    ],
+    "load_time": float(load_time),
+    "data_size": int(dataframe_size),
+    "result": queries_times,
+}
+
+# write result into results/c6a.metal.json
+with open("results/c6a.metal.json", "w") as f:
+    f.write(json.dumps(result_json, indent=4))
diff --git a/pandas/results/c6a.metal.json b/pandas/results/c6a.metal.json
new file mode 100644
index 00000000..89c31d75
--- /dev/null
+++ b/pandas/results/c6a.metal.json
@@ -0,0 +1,234 @@
+{
+    "system": "Pandas (DataFrame)",
+    "date": "2024-09-09",
+    "machine": "c6a.metal, 500gb gp2",
+    "cluster_size": 1,
+    "comment": "",
+    "tags": [
+        "C++",
+        "column-oriented",
+        "embedded",
+        "stateless",
+        "serverless",
+        "dataframe"
+    ],
+    "load_time": 0,
+    "data_size": 46998823718,
+
"result": [ + [ + 94.85904947930048, + 94.34601983195181, + 94.37713289545036 + ], + [ + 1.431384237598831, + 1.3912876943999437, + 1.3844853995993616 + ], + [ + 0.08555814599894802, + 0.08529179399993154, + 0.08538529799989192 + ], + [ + 0.07446354079984303, + 0.07420929889885884, + 0.07405653089972475 + ], + [ + 4.045293917999516, + 4.054952681998839, + 4.051108713600843 + ], + [ + 9.397236594001151, + 8.553888651601301, + 8.551957798800868 + ], + [ + 0.20142598079983145, + 0.20082141240054624, + 0.20077682039845968 + ], + [ + 0.6888897023993195, + 0.7089972360001411, + 0.7090266588005761 + ], + [ + 9.793415095200907, + 9.746425586400437, + 9.77463934079933 + ], + [ + 11.31212157124992, + 11.317409126249913, + 11.365981332498905 + ], + [ + 9.561819545998878, + 9.526289238000026, + 9.549710942401726 + ], + [ + 9.638392472399573, + 9.68899648799852, + 9.516723232799268 + ], + [ + 36.02233861439891, + 35.351831825997944, + 35.030546258400866 + ], + [ + 40.99436554069816, + 41.21673876189979, + 41.246581903798866 + ], + [ + 99.42101739960053, + 98.99104403760284, + 99.61387698720223 + ], + [ + 11.37903030359885, + 11.301075006001338, + 11.354333800797758 + ], + [ + 328.1500654404023, + 327.7806476567988, + 329.1575996675994 + ], + [ + 40.46806235400145, + 40.32875272679958, + 40.21332760319783 + ], + [ + 710.873256987604, + 709.9021454064001, + 709.1302060236034 + ], + [ + 0.12158637359971182, + 0.0502676939999219, + 0.05029934999911348 + ], + [ + 25.416243308401317, + 23.607750895197385, + 23.531343283200113 + ], + [ + 28.19667632640194, + 28.307106355203724, + 28.03666319760232 + ], + [ + 109.4524946891979, + 107.89275825840158, + 108.05671904040064 + ], + [ + 23.69155465800141, + 23.749371099603015, + 23.711980612398608 + ], + [ + 27.952041746398027, + 27.95151388080121, + 27.995164492801995 + ], + [ + 59.47395566519991, + 59.619707318401196, + 58.98795632160036 + ], + [ + 53.49264745560212, + 53.33148341399937, + 53.44261157879809 + ], + [ + 149.89341028799827, + 
150.04405806719732, + 150.43764930480103 + ], + [ + 351.62001316439855, + 359.9997044964009, + 351.64450961039693 + ], + [ + 33.07338013560075, + 33.072663941998326, + 33.077342296800635 + ], + [ + 18.24362592239923, + 18.262576292401356, + 18.19058843999956 + ], + [ + 27.786943907999376, + 27.594876044396368, + 27.847514002797835 + ], + [ + 115.49374400880042, + 111.91312766519778, + 111.8312668188024 + ], + [ + 104.13023711280111, + 97.22372503439982, + 97.7520352571999 + ], + [ + 97.18665714719973, + 97.11860653920158, + 97.45631518800073 + ], + [ + 299.7628146300005, + 297.58259715119927, + 297.4566010572016 + ], + [ + 7.761557435998111, + 7.722232234799594, + 7.732103157199095 + ], + [ + 6.957877979998012, + 6.616552991600551, + 6.931742102401041 + ], + [ + 0.6273289979973924, + 0.6247513715992681, + 0.6277366368012735 + ], + [ + 6.127344027599611, + 6.063661066796341, + 6.084284174400091 + ], + [ + 1.3988516340003116, + 1.3979855832018074, + 1.4039395200001308 + ], + [ + 0.7085309304005932, + 0.7092239279983914, + 0.7087875852012076 + ], + [ + 2.2844865708000724, + 2.2784988779996637, + 2.2800013651987888 + ] + ] +} \ No newline at end of file diff --git a/pandas/run.sh b/pandas/run.sh new file mode 100755 index 00000000..bced9948 --- /dev/null +++ b/pandas/run.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +./query.py diff --git a/polars/benchmark.sh b/polars/benchmark.sh new file mode 100755 index 00000000..fd7536c7 --- /dev/null +++ b/polars/benchmark.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# Install + +sudo apt-get update +sudo apt-get install -y python3-pip +pip install pandas polars + +# Download the data +wget --no-verbose --continue https://datasets.clickhouse.com/hits_compatible/athena/hits.parquet + +# Run the queries + +./run.sh 2>&1 | tee log.txt diff --git a/polars/query.py b/polars/query.py new file mode 100755 index 00000000..96df85dd --- /dev/null +++ b/polars/query.py @@ -0,0 +1,820 @@ +#!/usr/bin/env python3 + +import pandas as pd +import polars as pl 
+import timeit +import datetime +import json + +hits = pd.read_parquet("hits.parquet") + +dataframe_size = hits.memory_usage().sum() + +# print("Dataframe(numpy) size:", dataframe_size, "bytes") + +# fix some types +hits["EventTime"] = pd.to_datetime(hits["EventTime"], unit="s") +hits["EventDate"] = pd.to_datetime(hits["EventDate"], unit="D") + +# fix all object columns to string +for col in hits.columns: + if hits[col].dtype == "O": + hits[col] = hits[col].astype(str) + +start = timeit.default_timer() +pl_df = pl.DataFrame(hits) +stop = timeit.default_timer() +load_time = stop - start + +# 0: No., 1: SQL, 2: Pandas, 3: Polars +queries = queries = [ + ("Q0", "SELECT COUNT(*) FROM hits;", lambda x: x.count(), lambda x: x.height), + ( + "Q1", + "SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;", + lambda x: x[x["AdvEngineID"] != 0].count(), + lambda x: x.filter(pl.col("AdvEngineID") != 0).height, + ), + ( + "Q2", + "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;", + lambda x: (x["AdvEngineID"].sum(), x.shape[0], x["ResolutionWidth"].mean()), + lambda x: (x["AdvEngineID"].sum(), x.height, x["ResolutionWidth"].mean()), + ), + ( + "Q3", + "SELECT AVG(UserID) FROM hits;", + lambda x: x["UserID"].mean(), + lambda x: x["UserID"].mean(), + ), + ( + "Q4", + "SELECT COUNT(DISTINCT UserID) FROM hits;", + lambda x: x["UserID"].nunique(), + lambda x: x["UserID"].n_unique(), + ), + ( + "Q5", + "SELECT COUNT(DISTINCT SearchPhrase) FROM hits;", + lambda x: x["SearchPhrase"].nunique(), + lambda x: x["SearchPhrase"].n_unique(), + ), + ( + "Q6", + "SELECT MIN(EventDate), MAX(EventDate) FROM hits;", + lambda x: (x["EventDate"].min(), x["EventDate"].max()), + lambda x: (x["EventDate"].min(), x["EventDate"].max()), + ), + ( + "Q7", + "SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;", + lambda x: x[x["AdvEngineID"] != 0] + .groupby("AdvEngineID") + .size() + .sort_values(ascending=False), + lambda x: 
x.filter(pl.col("AdvEngineID") != 0) + .group_by("AdvEngineID") + .agg(pl.len().alias("count")) + .sort("count", descending=True), + ), + ( + "Q8", + "SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;", + lambda x: x.groupby("RegionID")["UserID"].nunique().nlargest(10), + lambda x: x.group_by("RegionID") + .agg(pl.col("UserID").n_unique()) + .sort("UserID", descending=True) + .head(10), + ), + ( + "Q9", + "SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;", + lambda x: x.groupby("RegionID") + .agg({"AdvEngineID": "sum", "ResolutionWidth": "mean", "UserID": "nunique"}) + .nlargest(10, "AdvEngineID"), + lambda x: x.group_by("RegionID") + .agg( + [ + pl.sum("AdvEngineID").alias("AdvEngineID_sum"), + pl.mean("ResolutionWidth").alias("ResolutionWidth_mean"), + pl.col("UserID").n_unique().alias("UserID_nunique"), + ] + ) + .sort("AdvEngineID_sum", descending=True) + .head(10), + ), + ( + "Q10", + "SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;", + lambda x: x[x["MobilePhoneModel"] != ""] + .groupby("MobilePhoneModel")["UserID"] + .nunique() + .nlargest(10), + lambda x: x.filter(pl.col("MobilePhoneModel") != "") + .group_by("MobilePhoneModel") + .agg(pl.col("UserID").n_unique()) + .sort("UserID", descending=True) + .head(10), + ), + ( + "Q11", + "SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;", + lambda x: x[x["MobilePhoneModel"] != ""] + .groupby(["MobilePhone", "MobilePhoneModel"])["UserID"] + .nunique() + .nlargest(10), + lambda x: x.filter(pl.col("MobilePhoneModel") != "") + .group_by(["MobilePhone", "MobilePhoneModel"]) + .agg(pl.col("UserID").n_unique()) + .sort("UserID", descending=True) + .head(10), + 
), + ( + "Q12", + "SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + lambda x: x[x["SearchPhrase"] != ""] + .groupby("SearchPhrase") + .size() + .nlargest(10), + lambda x: x.filter(pl.col("SearchPhrase") != "") + .group_by("SearchPhrase") + .agg(pl.len().alias("count")) + .sort("count", descending=True) + .head(10), + ), + ( + "Q13", + "SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;", + lambda x: x[x["SearchPhrase"] != ""] + .groupby("SearchPhrase")["UserID"] + .nunique() + .nlargest(10), + lambda x: x.filter(pl.col("SearchPhrase") != "") + .group_by("SearchPhrase") + .agg(pl.col("UserID").n_unique()) + .sort("UserID", descending=True) + .head(10), + ), + ( + "Q14", + "SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;", + lambda x: x[x["SearchPhrase"] != ""] + .groupby(["SearchEngineID", "SearchPhrase"]) + .size() + .nlargest(10), + lambda x: x.filter(pl.col("SearchPhrase") != "") + .group_by(["SearchEngineID", "SearchPhrase"]) + .agg(pl.len().alias("count")) + .sort("count", descending=True) + .head(10), + ), + ( + "Q15", + "SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;", + lambda x: x.groupby("UserID").size().nlargest(10), + lambda x: x.group_by("UserID") + .agg(pl.len().alias("count")) + .sort("count", descending=True) + .head(10), + ), + ( + "Q16", + "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", + lambda x: x.groupby(["UserID", "SearchPhrase"]).size().nlargest(10), + lambda x: x.group_by(["UserID", "SearchPhrase"]) + .agg(pl.len().alias("count")) + .sort("count", descending=True) + .head(10), + ), + ( + "Q17", + "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;", + 
lambda x: x.groupby(["UserID", "SearchPhrase"]).size().head(10), + lambda x: x.group_by(["UserID", "SearchPhrase"]).agg(pl.len()).head(10), + ), + ( + "Q18", + "SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", + lambda x: x.groupby([x["UserID"], x["EventTime"].dt.minute, "SearchPhrase"]) + .size() + .nlargest(10), + lambda x: x.group_by( + [pl.col("UserID"), x["EventTime"].dt.minute(), "SearchPhrase"] + ) + .agg(pl.len().alias("count")) + .sort("count", descending=True) + .head(10), + ), + ( + "Q19", + "SELECT UserID FROM hits WHERE UserID = 435090932899640449;", + lambda x: x[x["UserID"] == 435090932899640449], + lambda x: x.filter(pl.col("UserID") == 435090932899640449), + ), + ( + "Q20", + "SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';", + lambda x: x[x["URL"].str.contains("google")].shape[0], + lambda x: x.filter(pl.col("URL").str.contains("google")).height, + ), + ( + "Q21", + "SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + lambda x: x[(x["URL"].str.contains("google")) & (x["SearchPhrase"] != "")] + .groupby("SearchPhrase") + .agg({"URL": "min", "SearchPhrase": "size"}) + .nlargest(10, "SearchPhrase"), + lambda x: x.filter( + (pl.col("URL").str.contains("google")) & (pl.col("SearchPhrase") != "") + ) + .group_by("SearchPhrase") + .agg([pl.col("URL").min(), pl.len().alias("count")]) + .sort("count", descending=True) + .head(10), + ), + ( + "Q22", + "SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + lambda x: x[ + (x["Title"].str.contains("Google")) + & (~x["URL"].str.contains(".google.")) + & (x["SearchPhrase"] != "") + ] + .groupby("SearchPhrase") + .agg( + {"URL": "min", 
"Title": "min", "SearchPhrase": "size", "UserID": "nunique"} + ) + .nlargest(10, "SearchPhrase"), + lambda x: x.filter( + (pl.col("Title").str.contains("Google")) + & (~pl.col("URL").str.contains(".google.")) + & (pl.col("SearchPhrase") != "") + ) + .group_by("SearchPhrase") + .agg( + [ + pl.col("URL").min(), + pl.col("Title").min(), + pl.len().alias("count"), + pl.col("UserID").n_unique(), + ] + ) + .sort("count", descending=True) + .head(10), + ), + ( + "Q23", + "SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;", + lambda x: x[x["URL"].str.contains("google")] + .sort_values(by="EventTime") + .head(10), + lambda x: x.filter(pl.col("URL").str.contains("google")) + .sort("EventTime") + .head(10), + ), + ( + "Q24", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;", + lambda x: x[x["SearchPhrase"] != ""] + .sort_values(by="EventTime")[["SearchPhrase"]] + .head(10), + lambda x: x.filter(pl.col("SearchPhrase") != "") + .sort("EventTime") + .select("SearchPhrase") + .head(10), + ), + ( + "Q25", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;", + lambda x: x[x["SearchPhrase"] != ""] + .sort_values(by="SearchPhrase")[["SearchPhrase"]] + .head(10), + lambda x: x.filter(pl.col("SearchPhrase") != "") + .sort("SearchPhrase") + .select("SearchPhrase") + .head(10), + ), + ( + "Q26", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;", + lambda x: x[x["SearchPhrase"] != ""] + .sort_values(by=["EventTime", "SearchPhrase"])[["SearchPhrase"]] + .head(10), + lambda x: x.filter(pl.col("SearchPhrase") != "") + .sort(["EventTime", "SearchPhrase"]) + .select("SearchPhrase") + .head(10), + ), + ( + "Q27", + "SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", + lambda x: x[x["URL"] != ""] + .groupby("CounterID") + .filter(lambda g: 
g["URL"].count() > 100000) + .agg({"URL": lambda url: url.str.len().mean(), "CounterID": "size"}) + .sort_values() + .head(25), + lambda x: x.filter(pl.col("URL") != "") # WHERE URL <> '' + .group_by("CounterID") # GROUP BY CounterID + .agg( + [ + pl.col("URL") + .map_elements(lambda y: len(y), return_dtype=pl.Int64) + .alias("l"), # AVG(STRLEN(URL)) + pl.len().alias("c"), # COUNT(*) + ] + ) + .filter(pl.col("c") > 100000) # HAVING COUNT(*) > 100000 + .sort("l", descending=True) # ORDER BY l DESC + .limit(25), # LIMIT 25, + ), + ( + "Q28", + "SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", + lambda x: ( + x[x["Referer"] != ""] + .assign(k=x["Referer"].str.extract(r"^https?://(?:www\.)?([^/]+)/.*$")[0]) + .groupby("k") + .filter(lambda g: g["Referer"].count() > 100000) + .agg( + min_referer=("Referer", "min"), + average_length=("Referer", lambda r: r.str.len().mean()), + ) + .head(25) + ), + lambda x: ( + x.filter(pl.col("Referer") != "") + .with_columns( + pl.col("Referer") + .str.extract(r"^https?://(?:www\\.)?([^/]+)/.*$") + .alias("k") + ) + .group_by("k") + .agg( + [ + pl.col("Referer").map_elements( + lambda y: len(y), return_dtype=pl.Int64 + ) + # .mean() # skip mean for now + .alias("l"), # AVG(STRLEN(Referer)) + pl.col("Referer").min().alias("min_referer"), # MIN(Referer) + pl.len().alias("c"), # COUNT(*) + ] + ) + .filter(pl.col("c") > 100000) # HAVING COUNT(*) > 100000 + .sort("l", descending=True) # ORDER BY l DESC + .limit(25) # LIMIT 25 + ), + ), + ( + "Q29", + "SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), 
SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), 
SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;", + lambda x: x["ResolutionWidth"].sum() + + x["ResolutionWidth"].shift(1).sum() + + x["ResolutionWidth"].shift(2).sum() + + x["ResolutionWidth"].shift(3).sum() + + x["ResolutionWidth"].shift(4).sum() + + x["ResolutionWidth"].shift(5).sum() + + x["ResolutionWidth"].shift(6).sum() + + x["ResolutionWidth"].shift(7).sum() + + x["ResolutionWidth"].shift(8).sum() + + x["ResolutionWidth"].shift(9).sum() + + x["ResolutionWidth"].shift(10).sum() + + x["ResolutionWidth"].shift(11).sum() + + x["ResolutionWidth"].shift(12).sum() + + x["ResolutionWidth"].shift(13).sum() + + x["ResolutionWidth"].shift(14).sum() + + x["ResolutionWidth"].shift(15).sum() + + x["ResolutionWidth"].shift(16).sum() + + x["ResolutionWidth"].shift(17).sum() + + x["ResolutionWidth"].shift(18).sum() + + x["ResolutionWidth"].shift(19).sum() + + x["ResolutionWidth"].shift(20).sum() + + x["ResolutionWidth"].shift(21).sum() + + x["ResolutionWidth"].shift(22).sum() + + x["ResolutionWidth"].shift(23).sum() + + x["ResolutionWidth"].shift(24).sum() + + x["ResolutionWidth"].shift(25).sum() + + x["ResolutionWidth"].shift(26).sum() + + x["ResolutionWidth"].shift(27).sum() + + x["ResolutionWidth"].shift(28).sum() + + x["ResolutionWidth"].shift(29).sum() + + x["ResolutionWidth"].shift(30).sum() + + x["ResolutionWidth"].shift(31).sum() + + x["ResolutionWidth"].shift(32).sum() + + x["ResolutionWidth"].shift(33).sum() + + x["ResolutionWidth"].shift(34).sum() + + x["ResolutionWidth"].shift(35).sum() + + x["ResolutionWidth"].shift(36).sum() + + x["ResolutionWidth"].shift(37).sum() + + x["ResolutionWidth"].shift(38).sum() + + x["ResolutionWidth"].shift(39).sum() + + x["ResolutionWidth"].shift(40).sum() + + x["ResolutionWidth"].shift(41).sum() + + x["ResolutionWidth"].shift(42).sum() + + x["ResolutionWidth"].shift(43).sum() + + x["ResolutionWidth"].shift(44).sum() + + x["ResolutionWidth"].shift(45).sum() + + 
x["ResolutionWidth"].shift(46).sum() + + x["ResolutionWidth"].shift(47).sum() + + x["ResolutionWidth"].shift(48).sum() + + x["ResolutionWidth"].shift(49).sum() + + x["ResolutionWidth"].shift(50).sum() + + x["ResolutionWidth"].shift(51).sum() + + x["ResolutionWidth"].shift(52).sum() + + x["ResolutionWidth"].shift(53).sum() + + x["ResolutionWidth"].shift(54).sum() + + x["ResolutionWidth"].shift(55).sum() + + x["ResolutionWidth"].shift(56).sum() + + x["ResolutionWidth"].shift(57).sum() + + x["ResolutionWidth"].shift(58).sum() + + x["ResolutionWidth"].shift(59).sum() + + x["ResolutionWidth"].shift(60).sum() + + x["ResolutionWidth"].shift(61).sum() + + x["ResolutionWidth"].shift(62).sum() + + x["ResolutionWidth"].shift(63).sum() + + x["ResolutionWidth"].shift(64).sum() + + x["ResolutionWidth"].shift(65).sum() + + x["ResolutionWidth"].shift(66).sum() + + x["ResolutionWidth"].shift(67).sum() + + x["ResolutionWidth"].shift(68).sum() + + x["ResolutionWidth"].shift(69).sum() + + x["ResolutionWidth"].shift(70).sum() + + x["ResolutionWidth"].shift(71).sum() + + x["ResolutionWidth"].shift(72).sum() + + x["ResolutionWidth"].shift(73).sum() + + x["ResolutionWidth"].shift(74).sum() + + x["ResolutionWidth"].shift(75).sum() + + x["ResolutionWidth"].shift(76).sum() + + x["ResolutionWidth"].shift(77).sum() + + x["ResolutionWidth"].shift(78).sum() + + x["ResolutionWidth"].shift(79).sum() + + x["ResolutionWidth"].shift(80).sum() + + x["ResolutionWidth"].shift(81).sum() + + x["ResolutionWidth"].shift(82).sum() + + x["ResolutionWidth"].shift(83).sum() + + x["ResolutionWidth"].shift(84).sum() + + x["ResolutionWidth"].shift(85).sum() + + x["ResolutionWidth"].shift(86).sum() + + x["ResolutionWidth"].shift(87).sum() + + x["ResolutionWidth"].shift(88).sum() + + x["ResolutionWidth"].shift(89).sum(), + lambda x: sum(x["ResolutionWidth"][:90] + pl.Series(range(90))), + ), + ( + "Q30", + "SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE 
SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;", + lambda x: x[x["SearchPhrase"] != ""] + .groupby(["SearchEngineID", "ClientIP"]) + .agg( + c=("SearchEngineID", "size"), + IsRefreshSum=("IsRefresh", "sum"), + AvgResolutionWidth=("ResolutionWidth", "mean"), + ) + .nlargest(10, "c"), + lambda x: x.filter(pl.col("SearchPhrase") != "") + .group_by(["SearchEngineID", "ClientIP"]) + .agg( + [ + pl.len().alias("c"), + pl.sum("IsRefresh").alias("IsRefreshSum"), + pl.mean("ResolutionWidth").alias("AvgResolutionWidth"), + ] + ) + .sort("c", descending=True) + .head(10), + ), + ( + "Q31", + "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", + lambda x: x[x["SearchPhrase"] != ""] + .groupby(["WatchID", "ClientIP"]) + .agg( + c=("WatchID", "size"), + IsRefreshSum=("IsRefresh", "sum"), + AvgResolutionWidth=("ResolutionWidth", "mean"), + ) + .nlargest(10, "c"), + lambda x: x.filter(pl.col("SearchPhrase") != "") + .group_by(["WatchID", "ClientIP"]) + .agg( + [ + pl.len().alias("c"), + pl.sum("IsRefresh").alias("IsRefreshSum"), + pl.mean("ResolutionWidth").alias("AvgResolutionWidth"), + ] + ) + .sort("c", descending=True) + .head(10), + ), + ( + "Q32", + "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", + lambda x: x.groupby(["WatchID", "ClientIP"]) + .agg( + c=("WatchID", "size"), + IsRefreshSum=("IsRefresh", "sum"), + AvgResolutionWidth=("ResolutionWidth", "mean"), + ) + .nlargest(10, "c"), + lambda x: x.group_by(["WatchID", "ClientIP"]) + .agg( + [ + pl.len().alias("c"), + pl.sum("IsRefresh").alias("IsRefreshSum"), + pl.mean("ResolutionWidth").alias("AvgResolutionWidth"), + ] + ) + .sort("c", descending=True) + .head(10), + ), + ( + "Q33", + "SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;", + lambda x: 
x.groupby("URL").size().nlargest(10).reset_index(name="c"), + lambda x: x.group_by("URL") + .agg(pl.len().alias("c")) + .sort("c", descending=True) + .head(10), + ), + ( + "Q34", + "SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;", + lambda x: x.groupby(["URL"]).size().nlargest(10).reset_index(name="c"), + lambda x: x.group_by("URL") + .agg(pl.len().alias("c")) + .sort("c", descending=True) + .head(10), + ), + ( + "Q35", + "SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;", + lambda x: x.assign( + **{f"ClientIP_minus_{i}": x["ClientIP"] - i for i in range(1, 4)} + ) + .groupby( + ["ClientIP", "ClientIP_minus_1", "ClientIP_minus_2", "ClientIP_minus_3"] + ) + .size() + .nlargest(10) + .reset_index(name="c"), + lambda x: x.with_columns([pl.col("ClientIP")]) + .group_by(["ClientIP"]) + .agg(pl.len().alias("c")) + .sort("c", descending=True) + .head(10), + ), + ( + "Q36", + "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;", + lambda x: x[ + (x["CounterID"] == 62) + & (x["EventDate"] >= "2013-07-01") + & (x["EventDate"] <= "2013-07-31") + & (x["DontCountHits"] == 0) + & (x["IsRefresh"] == 0) + & (x["URL"] != "") + ] + .groupby("URL") + .size() + .nlargest(10), + lambda x: x.filter( + (pl.col("CounterID") == 62) + & (pl.col("EventDate") >= pl.datetime(2013, 7, 1)) + & (pl.col("EventDate") <= pl.datetime(2013, 7, 31)) + & (pl.col("DontCountHits") == 0) + & (pl.col("IsRefresh") == 0) + & (pl.col("URL") != "") + ) + .group_by("URL") + .agg(pl.len().alias("PageViews")) + .sort("PageViews", descending=True) + .head(10), + ), + ( + "Q37", + "SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= 
'2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;", + lambda x: x[ + (x["CounterID"] == 62) + & (x["EventDate"] >= "2013-07-01") + & (x["EventDate"] <= "2013-07-31") + & (x["DontCountHits"] == 0) + & (x["IsRefresh"] == 0) + & (x["Title"] != "") + ] + .groupby("Title") + .size() + .nlargest(10), + lambda x: x.filter( + (pl.col("CounterID") == 62) + & (pl.col("EventDate") >= pl.datetime(2013, 7, 1)) + & (pl.col("EventDate") <= pl.datetime(2013, 7, 31)) + & (pl.col("DontCountHits") == 0) + & (pl.col("IsRefresh") == 0) + & (pl.col("Title") != "") + ) + .group_by("Title") + .agg(pl.len().alias("PageViews")) + .sort("PageViews", descending=True) + .head(10), + ), + ( + "Q38", + "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", + lambda x: x[ + (x["CounterID"] == 62) + & (x["EventDate"] >= "2013-07-01") + & (x["EventDate"] <= "2013-07-31") + & (x["IsRefresh"] == 0) + & (x["IsLink"] != 0) + & (x["IsDownload"] == 0) + ] + .groupby("URL") + .size() + .nlargest(10) + .reset_index(name="PageViews") + .iloc[1000:1010], + lambda x: x.filter( + (pl.col("CounterID") == 62) + & (pl.col("EventDate") >= pl.datetime(2013, 7, 1)) + & (pl.col("EventDate") <= pl.datetime(2013, 7, 31)) + & (pl.col("IsRefresh") == 0) + & (pl.col("IsLink") != 0) + & (pl.col("IsDownload") == 0) + ) + .group_by("URL") + .agg(pl.len().alias("PageViews")) + .sort("PageViews", descending=True) + .slice(1000, 10), + ), + ( + "Q39", + "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, 
AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", + lambda x: x[ + (x["CounterID"] == 62) + & (x["EventDate"] >= "2013-07-01") + & (x["EventDate"] <= "2013-07-31") + & (x["IsRefresh"] == 0) + ] + .groupby(["TraficSourceID", "SearchEngineID", "AdvEngineID", "Referer", "URL"]) + .size() + .nlargest(10) + .reset_index(name="PageViews") + .iloc[1000:1010], + lambda x: None, + # Crash with: + # thread '' panicked at crates/polars-time/src/windows/duration.rs:215:21: + # expected leading integer in the duration string, found m + # note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace + # lambda x: x.filter( + # (pl.col("CounterID") == 62) + # & (pl.col("EventDate") >= pl.datetime(2013, 7, 1)) + # & (pl.col("EventDate") <= pl.datetime(2013, 7, 31)) + # & (pl.col("IsRefresh") == 0) + # ) + # .group_by( + # [ + # "TraficSourceID", + # "SearchEngineID", + # "AdvEngineID", + # # pl.when(pl.col("SearchEngineID").eq(0) & pl.col("AdvEngineID").eq(0)) + # # .then(pl.col("Referer")) + # # .otherwise("") + # # .alias("Src"), + # "URL", + # ] + # ) + # .agg(pl.len().alias("PageViews")) + # .sort("PageViews", descending=True) + # .slice(1000, 10), + ), + ( + "Q40", + "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;", + lambda x: x[ + (x["CounterID"] == 62) + & (x["EventDate"] >= "2013-07-01") + & (x["EventDate"] <= "2013-07-31") + & (x["IsRefresh"] == 0) + & (x["TraficSourceID"].isin([-1, 6])) + & (x["RefererHash"] == 3594120000172545465) + ] + .groupby(["URLHash", "EventDate"]) + .size() + .nlargest(10) + .reset_index(name="PageViews") + .iloc[100:110], + lambda x: x.filter( + (pl.col("CounterID") == 62) + & (pl.col("EventDate") >= pl.datetime(2013, 7, 1)) + & (pl.col("EventDate") <= 
pl.datetime(2013, 7, 31)) + & (pl.col("IsRefresh") == 0) + & (pl.col("TraficSourceID").is_in([-1, 6])) + & (pl.col("RefererHash") == 3594120000172545465) + ) + .group_by(["URLHash", "EventDate"]) + .agg(pl.len().alias("PageViews")) + .sort("PageViews", descending=True) + .slice(100, 10), + ), + ( + "Q41", + "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;", + lambda x: x[ + (x["CounterID"] == 62) + & (x["EventDate"] >= "2013-07-01") + & (x["EventDate"] <= "2013-07-31") + & (x["IsRefresh"] == 0) + & (x["DontCountHits"] == 0) + & (x["URLHash"] == 2868770270353813622) + ] + .groupby(["WindowClientWidth", "WindowClientHeight"]) + .size() + .nlargest(10) + .reset_index(name="PageViews") + .iloc[10000:10010], + lambda x: x.filter( + (pl.col("CounterID") == 62) + & (pl.col("EventDate") >= pl.datetime(2013, 7, 1)) + & (pl.col("EventDate") <= pl.datetime(2013, 7, 31)) + & (pl.col("IsRefresh") == 0) + & (pl.col("DontCountHits") == 0) + & (pl.col("URLHash") == 2868770270353813622) + ) + .group_by(["WindowClientWidth", "WindowClientHeight"]) + .agg(pl.len().alias("PageViews")) + .sort("PageViews", descending=True) + .slice(10000, 10), + ), + ( + "Q42", + "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;", + lambda x: x[ + (x["CounterID"] == 62) + & (x["EventDate"] >= "2013-07-14") + & (x["EventDate"] <= "2013-07-15") + & (x["IsRefresh"] == 0) + & (x["DontCountHits"] == 0) + ] + .groupby(pd.Grouper(key="EventTime", freq="T")) + .size() + 
.reset_index(name="PageViews") + .iloc[1000:1010], + lambda x: None, + # Crash with: + # thread '' panicked at crates/polars-time/src/windows/duration.rs:215:21: + # expected leading integer in the duration string, found m + # lambda x: x.filter( + # (pl.col("CounterID") == 62) + # & (pl.col("EventDate") >= pl.datetime(2013, 7, 14)) + # & (pl.col("EventDate") <= pl.datetime(2013, 7, 15)) + # & (pl.col("IsRefresh") == 0) + # & (pl.col("DontCountHits") == 0) + # ) + # .group_by(pl.col("EventTime").dt.truncate("minute")) + # .agg(pl.len().alias("PageViews")) + # .slice(1000, 10), + ), +] + +queries_times = [] +for q in queries: + times = [] + for _ in range(3): + start = timeit.default_timer() + result = q[3](pl_df) + end = timeit.default_timer() + if result is None: + times.append(None) + else: + times.append(end - start) + queries_times.append(times) + +result_json = { + "system": "Polars (DataFrame)", + "date": datetime.date.today().strftime("%Y-%m-%d"), + "machine": "c6a.metal, 500gb gp2", + "cluster_size": 1, + "comment": "", + "tags": [ + "C++", + "column-oriented", + "embedded", + "stateless", + "serverless", + "dataframe", + ], + "load_time": float(load_time), + "data_size": int(dataframe_size), + "result": queries_times, +} + +# if cpuinfo contains "AMD EPYC 9654" update machine and write result into results/epyc-9654.json +if "AMD EPYC 9654" in open("/proc/cpuinfo").read(): + result_json["machine"] = "EPYC 9654, 384G" + with open("results/epyc-9654.json", "w") as f: + f.write(json.dumps(result_json, indent=4)) +else: + # write result into results/c6a.metal.json + with open("results/c6a.metal.json", "w") as f: + f.write(json.dumps(result_json, indent=4)) diff --git a/polars/results/c6a.metal.json b/polars/results/c6a.metal.json new file mode 100644 index 00000000..97e71122 --- /dev/null +++ b/polars/results/c6a.metal.json @@ -0,0 +1,234 @@ +{ + "system": "Polars (DataFrame)", + "date": "2024-09-09", + "machine": "c6a.metal, 500gb gp2", + "cluster_size": 1, + 
"comment": "", + "tags": [ + "C++", + "column-oriented", + "embedded", + "stateless", + "serverless", + "dataframe" + ], + "load_time": 274.5956620259999, + "data_size": 46998823722, + "result": [ + [ + 2.7551000130188186e-05, + 2.0500001483014785e-06, + 3.8999996831989847e-07 + ], + [ + 0.3393856870000036, + 0.08128961900001741, + 0.0792162879999978 + ], + [ + 0.46830895299990516, + 0.4213311490000251, + 0.41986769899995124 + ], + [ + 0.27111638099995616, + 0.0691876239998237, + 0.06879308600014156 + ], + [ + 0.9961474940000699, + 1.1656686840001385, + 1.341034622000052 + ], + [ + 3.293661254999961, + 3.3244774100000996, + 3.31177472100012 + ], + [ + 0.05548804800014295, + 0.05496264599992173, + 0.05497070699993856 + ], + [ + 0.30676389199993537, + 0.11210982800002967, + 0.10042662099999689 + ], + [ + 5.758702494999852, + 1.412199709000106, + 0.9961499670000649 + ], + [ + 1.269696352999972, + 1.0784102180000446, + 1.089092165000011 + ], + [ + 1.5691383170001245, + 0.7789256089999981, + 0.7921542070000669 + ], + [ + 1.0894663530000344, + 0.9373611309999887, + 0.8721730039999329 + ], + [ + 2.494931741999835, + 1.7917862040001182, + 1.7991658439998446 + ], + [ + 95.30520302599984, + 96.30335870499971, + 95.42217894499981 + ], + [ + 1.9459286860001157, + 1.9641287069998725, + 1.9325827739999113 + ], + [ + 1.4602280210001481, + 1.2033372910000253, + 1.3688985580001827 + ], + [ + 3.3947668380001232, + 2.9921093740003926, + 2.996665767000195 + ], + [ + 2.16839200000004, + 2.5575646139996024, + 2.477013052000075 + ], + [ + 5.922572512999977, + 6.192915480000011, + 7.058452599999782 + ], + [ + 0.21617201800017938, + 0.051977289999740606, + 0.05087722700000086 + ], + [ + 6.605031917999895, + 5.489592963999712, + 3.316997033000007 + ], + [ + 2.9719723640000666, + 1.9411770279998564, + 1.9698260360000859 + ], + [ + 5.622727766000025, + 4.052559812999789, + 4.165921265999714 + ], + [ + 2.1939042029998745, + 2.0240198039996358, + 2.0558673149998867 + ], + [ + 
3.2231196239999917, + 2.574916228999882, + 2.6234278580000137 + ], + [ + 3.1981313249998493, + 3.2646530639999582, + 3.29449439300015 + ], + [ + 2.607552430999931, + 2.5564862259998336, + 2.6211176460001298 + ], + [ + 8.782033818999935, + 9.022877838000113, + 8.618930766000176 + ], + [ + 65.78594618099987, + 65.80185137000035, + 65.21734767399994 + ], + [ + 0.04148712999995041, + 0.00017722400025377283, + 2.6159999833907932e-05 + ], + [ + 5.197321627000292, + 1.8464567339997302, + 1.8262284829997952 + ], + [ + 2.020474259999901, + 2.1032231610001872, + 2.150142378000055 + ], + [ + 6.235942041999806, + 8.480420082000364, + 7.408725919000062 + ], + [ + 3.7926399410002887, + 3.562234329000148, + 3.638766590999694 + ], + [ + 3.3556073149998156, + 3.7019931729996642, + 3.4804760279998845 + ], + [ + 0.5585857320002106, + 0.6201126340001792, + 0.6397678640000777 + ], + [ + 20.811287902000004, + 20.985524244000317, + 20.879136653000387 + ], + [ + 18.704310231999898, + 19.610116691000258, + 18.715476107000086 + ], + [ + 7.068463735000023, + 7.329527066000082, + 7.057720248999885 + ], + [ + null, + null, + null + ], + [ + 2.366720836999775, + 2.364191364999897, + 2.4704000149999956 + ], + [ + 0.06867151899996315, + 0.06851112599997577, + 0.09032966999984637 + ], + [ + null, + null, + null + ] + ] +} \ No newline at end of file diff --git a/polars/run.sh b/polars/run.sh new file mode 100755 index 00000000..bced9948 --- /dev/null +++ b/polars/run.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +./query.py