From 3b054ad62d96bcb8f541763fd88c8ec499137d01 Mon Sep 17 00:00:00 2001
From: Tyler Thomas
Date: Sun, 5 Nov 2023 22:35:13 -0800
Subject: [PATCH 1/5] bump version

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 3f6cf5c..df3f37a 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "RAPIDS"
 uuid = "2764e59e-7dd7-4b2d-a28d-ce06411bac13"
 authors = ["tylerjthomas9 "]
-version = "0.4.0"
+version = "0.5.0"

 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

From aaf33a476633b7306c8f8b4d0ae8b2098a43790a Mon Sep 17 00:00:00 2001
From: Tyler Thomas
Date: Sun, 5 Nov 2023 22:50:20 -0800
Subject: [PATCH 2/5] import pandas

---
 src/RAPIDS.jl | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/RAPIDS.jl b/src/RAPIDS.jl
index 9d954a5..13bd84a 100644
--- a/src/RAPIDS.jl
+++ b/src/RAPIDS.jl
@@ -30,6 +30,7 @@ if !CUDA.has_cuda_gpu()
     const dask_cuda = nothing
     const dask_cudf = nothing
     const numpy = nothing
+    const pandas = nothing
     const pickle = nothing
     abstract type Py end
     macro py(x...) end
@@ -65,6 +66,7 @@ else
         PythonCall.pycopy!(dask, pyimport("dask"))
         PythonCall.pycopy!(dask_cuda, pyimport("dask_cuda"))
         PythonCall.pycopy!(dask_cudf, pyimport("dask_cudf"))
         PythonCall.pycopy!(numpy, pyimport("numpy"))
+        PythonCall.pycopy!(pandas, pyimport("pandas"))
         return PythonCall.pycopy!(pickle, pyimport("pickle"))
     end
 end
@@ -83,7 +85,8 @@ export VERSION,
     dask,
     dask_cuda,
     dask_cudf,
-    numpy
+    numpy,
+    pandas

 include("CuDF/CuDF.jl")
 include("CuML/CuML.jl")

From 9ee5bf576be98df20936cf16d5c20ee0a8c8689a Mon Sep 17 00:00:00 2001
From: Tyler Thomas
Date: Mon, 6 Nov 2023 07:03:02 +0000
Subject: [PATCH 3/5] add initial cudf I/O

---
 src/CuDF/CuDF.jl | 34 ++++++++++++++++++++++++++++++++++
 src/RAPIDS.jl    |  1 +
 2 files changed, 35 insertions(+)

diff --git a/src/CuDF/CuDF.jl b/src/CuDF/CuDF.jl
index a9eccab..282725d 100644
--- a/src/CuDF/CuDF.jl
+++ b/src/CuDF/CuDF.jl
@@ -5,4 +5,38 @@ using RAPIDS: cudf
 using PythonCall
 using Tables

+# I/O
+read_csv(args...; kwargs...) = cudf.read_csv(args...; kwargs...)
+to_csv(df::Py, args...; kwargs...) = df.to_csv(args...; kwargs...)
+read_text(args...; kwargs...) = cudf.read_text(args...; kwargs...)
+read_json(args...; kwargs...) = cudf.read_json(args...; kwargs...)
+to_json(df::Py, args...; kwargs...) = df.to_json(args...; kwargs...)
+read_parquet(args...; kwargs...) = cudf.read_parquet(args...; kwargs...)
+to_parquet(df::Py, args...; kwargs...) = df.to_parquet(args...; kwargs...)
+read_orc(args...; kwargs...) = cudf.read_orc(args...; kwargs...)
+to_orc(df::Py, args...; kwargs...) = df.to_orc(args...; kwargs...)
+read_hdf(args...; kwargs...) = cudf.read_hdf(args...; kwargs...)
+to_hdf(df::Py, args...; kwargs...) = df.to_hdf(args...; kwargs...)
+read_feather(args...; kwargs...) = cudf.read_feather(args...; kwargs...)
+to_feather(df::Py, args...; kwargs...) = df.to_feather(args...; kwargs...)
+read_avro(args...; kwargs...) = cudf.read_avro(args...; kwargs...)
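+
+# Example usage — an illustrative sketch only (assumes a CUDA-capable GPU;
+# the file names below are hypothetical):
+#
+#   df = read_csv("data.csv")
+#   to_parquet(df, "data.parquet")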
+
+
+export
+    # I/O
+    read_csv,
+    to_csv,
+    read_text,
+    read_json,
+    to_json,
+    read_parquet,
+    to_parquet,
+    read_orc,
+    to_orc,
+    read_hdf,
+    to_hdf,
+    read_feather,
+    to_feather,
+    read_avro
+
 end

diff --git a/src/RAPIDS.jl b/src/RAPIDS.jl
index 13bd84a..2038762 100644
--- a/src/RAPIDS.jl
+++ b/src/RAPIDS.jl
@@ -54,6 +54,7 @@ else
     const dask_cuda = PythonCall.pynew()
     const dask_cudf = PythonCall.pynew()
     const numpy = PythonCall.pynew()
+    const pandas = PythonCall.pynew()
     const pickle = PythonCall.pynew()
     function __init__()
         PythonCall.pycopy!(cucim, pyimport("cucim"))

From db03b13a3443167e959ddc21ff575403e1e20f4c Mon Sep 17 00:00:00 2001
From: Tyler Thomas
Date: Fri, 24 Nov 2023 17:56:09 -0700
Subject: [PATCH 4/5] [WIP] Pandas.jl port

---
 CondaPkg.toml            |   2 +-
 Project.toml             |  19 +-
 README.md                |  83 +-----
 format/Manifest.toml     | 104 ++++----
 src/CuDF/CuDF.jl         | 549 ++++++++++++++++++++++++++++++++++++---
 src/CuDF/README.md       |   3 +
 src/CuDF/exports.jl      | 209 +++++++++++++++
 src/CuDF/operators.jl    |  10 +
 src/CuDF/tables.jl       |  14 +
 src/CuDF/tabletraits.jl  |  58 +++++
 src/CuDF/util.jl         |  11 +
 src/CuML/README.md       |  75 ++++++
 src/RAPIDS.jl            |   2 ++
 test.jl                  |  96 +++++++
 test/cudf.jl             |  85 ++++++
 test/cudf_tables.jl      |  29 +++
 test/cudf_tabletraits.jl |  59 +++++
 test/runtests.jl         |   3 +-
 test/test.csv            |   3 +
 19 files changed, 1243 insertions(+), 174 deletions(-)
 create mode 100644 src/CuDF/README.md
 create mode 100644 src/CuDF/exports.jl
 create mode 100644 src/CuDF/operators.jl
 create mode 100644 src/CuDF/tables.jl
 create mode 100644 src/CuDF/tabletraits.jl
 create mode 100644 src/CuDF/util.jl
 create mode 100644 src/CuML/README.md
 create mode 100644 test.jl
 create mode 100644 test/cudf_tables.jl
 create mode 100644 test/cudf_tabletraits.jl
 create mode 100644 test/test.csv

diff --git a/CondaPkg.toml b/CondaPkg.toml
index dacdda1..bcc4cfe 100644
--- a/CondaPkg.toml
+++ b/CondaPkg.toml
@@ -7,7 +7,7 @@ cucim = "=23.10"
 cuspatial = "=23.10"
 cugraph = "=23.10"
 cuml = "=23.10"
-python = ">=3.9,<=3.10"
+python = ">=3.9,<3.11"

 [deps.cuda-version]
 channel = "conda-forge"

diff --git a/Project.toml b/Project.toml
index df3f37a..fe24996 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,16 +5,29 @@ version = "0.5.0"

 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
 CondaPkg = "992eb4ea-22a4-4c89-a5bb-47a3300528ab"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+DataValues = "e7dc6d0d-1eca-5fa6-8ad6-5aecde8b7ea5"
+Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
+IteratorInterfaceExtensions = "82899510-4779-5014-852e-03e436cf321d"
+Lazy = "50d2b5c4-7a5e-59d5-8109-a42b560f39c0"
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
 MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
+MLJTestInterface = "72560011-54dd-4dc2-94f3-c5de45b75ecd"
+OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
+Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+TableTraits = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c"
+TableTraitsUtils = "382cd787-c1b6-5bf2-a167-d5b971a19bda"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

 [compat]
-Aqua = "0.7"
+Aqua = "0.8"
 CUDA = "3, 4, 5"
 CondaPkg = "0.2"
+DataFrames = "1.6"
 MLJBase = "1"
 MLJModelInterface = "1"
 PythonCall = "0.9"
@@ -23,9 +36,12 @@ julia = "1.8"

 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
 MLJTestInterface = "72560011-54dd-4dc2-94f3-c5de45b75ecd"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

 [targets]
-test = ["Test", "Aqua", "MLJBase", "MLJTestInterface"]
+test = ["Test", "Aqua", "CSV", "DataFrames", "MLJBase", "MLJTestInterface"]

diff --git a/README.md b/README.md
index 15cfc5e..711a2b5 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,6 @@
 [![Lifecycle:Maturing](https://img.shields.io/badge/Lifecycle-Maturing-007EC6)](https://github.com/bcgov/repomountie/blob/master/doc/lifecycle-badges.md)
 [![Code Style: YASGuide](https://img.shields.io/badge/code%20style-yas-violet.svg)](https://github.com/jrevels/YASGuide)

-:warning: RAPIDS.jl is only supported on Julia 1.8.5+. For previous Julia versions, you have to manually upgrade to libraries from GCC 12.
-
 # RAPIDS.jl

 Unofficial Julia wrapper for the [RAPIDS.ai](https://rapids.ai/index.html) ecosystem.
@@ -42,9 +39,14 @@ julia> ]add https://github.com/tylerjthomas9/RAPIDS.jl
 julia> using Pkg; Pkg.add(url="https://github.com/tylerjthomas9/RAPIDS.jl")
 ```

+## Julia Interfaces
+
+- `CuDF`
+- `CuML`
+
 ## Python API

-You can access the following python libraries with their standard syntax:
+You can access the following Python libraries with their standard Python syntax:
 - `cupy`
 - `cudf`
 - `cuml`
 - `cugraph`
 - `cuspatial`
 - `cuxfilter`
 - `dask`
 - `dask_cuda`
 - `dask_cudf`
 - `numpy`
+- `pandas` (accelerated via `cudf.pandas`)
 - `pickle`

-Here is an example of using `LogisticRegression`, `make_classification` via the Python API.
-
-```julia
-using RAPIDS
-const make_classification = cuml.datasets.classification.make_classification
-
-X_py, y_py = make_classification(n_samples=200, n_features=4,
-                                 n_informative=2, n_classes=2)
-lr = cuml.LogisticRegression(max_iter=100)
-lr.fit(X_py, y_py)
-preds = lr.predict(X_py)
-
-print(lr.coef_)
-```
-
-## MLJ Interface
-
-A MLJ interface is also available for supported models. The model hyperparameters are the same as described in the [cuML docs](https://docs.rapids.ai/api/cuml/stable/api.html). The only difference is that the models will always input/output numpy arrays, which will be converted back to Julia arrays (`output_type="input"`).
-
-```julia
-using MLJBase
-using RAPIDS.CuML
-const make_classification = cuml.datasets.classification.make_classification
-
-X_py, y_py = make_classification(n_samples=200, n_features=4,
-                                 n_informative=2, n_classes=2)
-X = RAPIDS.pyconvert(Matrix{Float32}, X_py.get())
-y = RAPIDS.pyconvert(Vector{Float32}, y_py.get().flatten())
-
-lr = LogisticRegression(max_iter=100)
-mach = machine(lr, X, y)
-fit!(mach)
-preds = predict(mach, X)
-
-print(mach.fitresult.coef_)
-```
-MLJ Support:
-- Clustering
-  - `KMeans`
-  - `DBSCAN`
-  - `AgglomerativeClustering`
-  - `HDBSCAN`
-- Classification
-  - `LogisticRegression`
-  - `MBSGDClassifier`
-  - `RandomForestClassifier`
-  - `SVC`
-  - `LinearSVC`
-  - `KNeighborsClassifier`
-- Regression
-  - `LinearRegression`
-  - `Ridge`
-  - `Lasso`
-  - `ElasticNet`
-  - `MBSGDRegressor`
-  - `RandomForestRegressor`
-  - `CD`
-  - `SVR`
-  - `LinearSVR`
-  - `KNeighborsRegressor`
-- Dimensionality Reduction
-  - `PCA`
-  - `IncrementalPCA`
-  - `TruncatedSVD`
-  - `UMAP`
-  - `TSNE`
-  - `GaussianRandomProjection`
-- Time Series
-  - `ExponentialSmoothing`
-  - `ARIMA`
+## Known Issues
+- RAPIDS.jl is only supported on Julia 1.8.5+. For earlier Julia versions, you must manually replace Julia's bundled libstdc++ with the version from GCC 12 or newer.
diff --git a/format/Manifest.toml b/format/Manifest.toml
index b04a19c..c4aceb7 100644
--- a/format/Manifest.toml
+++ b/format/Manifest.toml
@@ -1,6 +1,6 @@
 # This file is machine-generated - editing it directly is not advised

-julia_version = "1.8.5"
+julia_version = "1.9.4"
 manifest_format = "2.0"
 project_hash = "30b405be1c677184b7703a9bfb3d2100029ccad0"

@@ -21,21 +21,23 @@ uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f"
 version = "3.3.6"

 [[deps.CommonMark]]
-deps = ["Crayons", "JSON", "SnoopPrecompile", "URIs"]
-git-tree-sha1 = "e2f4627b0d3f2c1876360e0b242a7c23923b469d"
+deps = ["Crayons", "JSON", "PrecompileTools", "URIs"]
+git-tree-sha1 = "532c4185d3c9037c0237546d817858b23cf9e071"
 uuid = "a80b9123-70ca-4bc0-993e-6e3bcb318db6"
-version = "0.8.10"
+version = "0.8.12"

 [[deps.Compat]]
-deps = ["Dates", "LinearAlgebra", "UUIDs"]
-git-tree-sha1 = "7a60c856b9fa189eb34f5f8a6f6b5529b7942957"
+deps = ["UUIDs"]
+git-tree-sha1 = "8a62af3e248a8c4bad6b32cbbe663ae02275e32c"
 uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
-version = "4.6.1"
+version = "4.10.0"

-[[deps.CompilerSupportLibraries_jll]]
-deps = ["Artifacts", "Libdl"]
-uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
-version = "1.0.1+0"
+    [deps.Compat.extensions]
+    CompatLinearAlgebraExt = "LinearAlgebra"
+
+    [deps.Compat.weakdeps]
+    Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
+    LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

 [[deps.Crayons]]
 git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
@@ -44,9 +46,9 @@ version = "4.1.1"

 [[deps.DataStructures]]
 deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
-git-tree-sha1 = "d1fff3a548102f48987a52a2e0d114fa97d730f0"
+git-tree-sha1 = "3dbd312d370723b6bb43ba9d02fc36abade4518d"
 uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
-version = "0.18.13"
+version = "0.18.15"

 [[deps.Dates]]
 deps = ["Printf"]
@@ -76,20 +78,20 @@ uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 version = "0.21.4"

 [[deps.JuliaFormatter]]
-deps = ["CSTParser", "CommonMark", "DataStructures", "Glob", "Pkg", "SnoopPrecompile", "Tokenize"]
-git-tree-sha1 = "0f6545dd63fec03d0cfe0c1d28f851e2d804e942"
+deps = ["CSTParser", "CommonMark", "DataStructures", "Glob", "Pkg", "PrecompileTools", "Tokenize"]
+git-tree-sha1 = "3d5b5b539e4606dcca0e6a467b98a64c8da4850b"
 uuid = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
-version = "1.0.25"
+version = "1.0.42"

 [[deps.LibCURL]]
 deps = ["LibCURL_jll", "MozillaCACerts_jll"]
 uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
-version = "0.6.3"
+version = "0.6.4"

 [[deps.LibCURL_jll]]
 deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
 uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
-version = "7.84.0+0"
+version = "8.4.0+0"

 [[deps.LibGit2]]
 deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
@@ -98,15 +100,11 @@ uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
 [[deps.LibSSH2_jll]]
 deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
 uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
-version = "1.10.2+0"
+version = "1.11.0+1"

 [[deps.Libdl]]
 uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"

-[[deps.LinearAlgebra]]
-deps = ["Libdl", "libblastrampoline_jll"]
-uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-
 [[deps.Logging]]
 uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"

@@ -117,45 +115,46 @@ uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
 [[deps.MbedTLS_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
-version = "2.28.0+0"
+version = "2.28.2+0"

 [[deps.Mmap]]
 uuid = "a63ad114-7e13-5084-954f-fe012c677804"

 [[deps.MozillaCACerts_jll]]
 uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
-version = "2022.2.1"
+version = "2022.10.11"

 [[deps.NetworkOptions]]
 uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
 version = "1.2.0"

-[[deps.OpenBLAS_jll]]
-deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
-uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
-version = "0.3.20+0"
-
 [[deps.OrderedCollections]]
-git-tree-sha1 = "d321bf2de576bf25ec4d3e4360faca399afca282"
+git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3"
 uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
-version = "1.6.0"
+version = "1.6.2"

 [[deps.Parsers]]
-deps = ["Dates", "SnoopPrecompile"]
-git-tree-sha1 = "478ac6c952fddd4399e71d4779797c538d0ff2bf"
+deps = ["Dates", "PrecompileTools", "UUIDs"]
+git-tree-sha1 = "a935806434c9d4c506ba941871b327b96d41f2bf"
 uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
-version = "2.5.8"
+version = "2.8.0"

 [[deps.Pkg]]
-deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
+deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
 uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
-version = "1.8.0"
+version = "1.9.2"
+
+[[deps.PrecompileTools]]
+deps = ["Preferences"]
+git-tree-sha1 = "03b4c25b43cb84cee5c90aa9b5ea0a78fd848d2f"
+uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
+version = "1.2.0"

 [[deps.Preferences]]
 deps = ["TOML"]
-git-tree-sha1 = "47e5f437cc0e7ef2ce8406ce1e7e24d44915f88d"
+git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e"
 uuid = "21216c6a-2e73-6563-6e65-726566657250"
-version = "1.3.0"
+version = "1.4.1"

 [[deps.Printf]]
 deps = ["Unicode"]
@@ -176,34 +175,28 @@ version = "0.7.0"
 [[deps.Serialization]]
 uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"

-[[deps.SnoopPrecompile]]
-deps = ["Preferences"]
-git-tree-sha1 = "e760a70afdcd461cf01a575947738d359234665c"
-uuid = "66db9d55-30c0-4569-8b51-7e840670fc0c"
-version = "1.0.3"
-
 [[deps.Sockets]]
 uuid = "6462fe0b-24de-5631-8697-dd941f90decc"

 [[deps.TOML]]
 deps = ["Dates"]
 uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
-version = "1.0.0"
+version = "1.0.3"

 [[deps.Tar]]
 deps = ["ArgTools", "SHA"]
 uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
-version = "1.10.1"
+version = "1.10.0"

 [[deps.Tokenize]]
-git-tree-sha1 = "90538bf898832b6ebd900fa40f223e695970e3a5"
+git-tree-sha1 = "0454d9a9bad2400c7ccad19ca832a2ef5a8bc3a1"
 uuid = "0796e94c-ce3b-5d07-9a54-7f471281c624"
-version = "0.5.25"
+version = "0.5.26"

 [[deps.URIs]]
-git-tree-sha1 = "074f993b0ca030848b897beff716d93aca60f06a"
+git-tree-sha1 = "67db6cc7b3821e19ebe75791a9dd19c9b1188f2b"
 uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
-version = "1.4.2"
+version = "1.5.1"

 [[deps.UUIDs]]
 deps = ["Random", "SHA"]
@@ -215,17 +208,12 @@ uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 [[deps.Zlib_jll]]
 deps = ["Libdl"]
 uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
-version = "1.2.12+3"
-
-[[deps.libblastrampoline_jll]]
-deps = ["Artifacts", "Libdl", "OpenBLAS_jll"]
-uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
-version = "5.1.1+0"
+version = "1.2.13+0"

 [[deps.nghttp2_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
-version = "1.48.0+0"
+version = "1.52.0+1"

 [[deps.p7zip_jll]]
 deps = ["Artifacts", "Libdl"]

diff --git a/src/CuDF/CuDF.jl b/src/CuDF/CuDF.jl
index 282725d..e0ec616 100644
--- a/src/CuDF/CuDF.jl
+++ b/src/CuDF/CuDF.jl
@@ -1,42 +1,519 @@
 module CuDF

-using RAPIDS: cudf
+using Dates
 using PythonCall
-using Tables
+using Lazy
+using Compat
+using TableTraits
+using Statistics
+using OrderedCollections: OrderedDict

-# I/O
-read_csv(args...; kwargs...) = cudf.read_csv(args...; kwargs...)
-to_csv(df::Py, args...; kwargs...) = df.to_csv(args...; kwargs...)
-read_text(args...; kwargs...) = cudf.read_text(args...; kwargs...)
-read_json(args...; kwargs...) = cudf.read_json(args...; kwargs...)
-to_json(df::Py, args...; kwargs...) = df.to_json(args...; kwargs...)
-read_parquet(args...; kwargs...) = cudf.read_parquet(args...; kwargs...)
-to_parquet(df::Py, args...; kwargs...) = df.to_parquet(args...; kwargs...)
-read_orc(args...; kwargs...) = cudf.read_orc(args...; kwargs...)
-to_orc(df::Py, args...; kwargs...) = df.to_orc(args...; kwargs...)
-read_hdf(args...; kwargs...) = cudf.read_hdf(args...; kwargs...)
-to_hdf(df::Py, args...; kwargs...) = df.to_hdf(args...; kwargs...)
-read_feather(args...; kwargs...) = cudf.read_feather(args...; kwargs...)
-to_feather(df::Py, args...; kwargs...) = df.to_feather(args...; kwargs...)
-read_avro(args...; kwargs...) = cudf.read_avro(args...; kwargs...)
-
-
-export
-    # I/O
-    read_csv,
-    to_csv,
-    read_text,
-    read_json,
-    to_json,
-    read_parquet,
-    to_parquet,
-    read_orc,
-    to_orc,
-    read_hdf,
-    to_hdf,
-    read_feather,
-    to_feather,
-    read_avro
-
+using RAPIDS: pandas, numpy
+const np = numpy  # short alias, used for dtype lookups below
+
+import Base: getindex, setindex!, length, size, show, merge, convert,
+             join, replace, lastindex, sum, abs, any, count,
+             cumprod, cumsum, diff, filter, first, last,
+             min, sort, truncate, +, -, *, /, !,
+             ==, >, <, >=, <=, !=, &, |,
+             keys, close, get
+import Statistics: mean, std, var, cov, median, quantile
+
+include("exports.jl")
+
+"""
+    version()
+
+Returns the version of the underlying Python pandas library as a `VersionNumber`.
+"""
+version() = VersionNumber(pyconvert(String, pandas.__version__))
+
+const pre_type_map = []
+
+abstract type PandasWrapped end
+
+PythonCall.Py(x::PandasWrapped) = x.pyo
+
+macro pytype(name, class)
+    quote
+        struct $(name) <: PandasWrapped
+            pyo::Py
+            $(esc(name))(pyo::Py) = new(pyo)
+            function $(esc(name))(args...; kwargs...)
+                pandas_method = ($class)()
+                return new(pandas_method(args...; kwargs...))
+            end
+
+            function $(esc(name))(dict::AbstractDict, args...; kwargs...)
+                pandas_method = ($class)()
+                # Stringify the keys and preserve their insertion order (important
+                # for OrderedDict sources) when building the Python dict.
+                py_dict = pydict()
+                for (k, v) in dict
+                    py_dict[string(k)] = v isa AbstractVector ? pylist(v) : v
+                end
+                return new(pandas_method(py_dict, args...; kwargs...))
+            end
+        end
+
+        function Base.iterate(x::$name, state...)
+            res = Base.iterate(x.pyo, state...)
+            if res === nothing
+                return nothing
+            else
+                value, state = res
+                return pandas_wrap(value), state
+            end
+        end
+
+        push!(pre_type_map, ($class, $name))
+    end
+end
+
+quot(x) = Expr(:quote, x)
+
+function convert_datetime_series_to_julia_vector(series)
+    N = length(series)
+    out = Array{Dates.DateTime}(undef, N)
+    for i in 1:N
+        # `getindex` on the wrapped series goes through `pandas_wrap`, whose
+        # fallback `pyconvert(Any, ...)` converts Python datetimes to
+        # `Dates.DateTime`.
+        out[i] = series[i]
+    end
+    return out
+end
+
+function Base.Array(x::PandasWrapped)
+    if typeof(x) <: Series && pyeq(Bool, x.pyo.dtype, np.dtype("<M8[ns]"))
+        return convert_datetime_series_to_julia_vector(x)
+    end
+    return pyconvert(Array, x.pyo.values)
+end
+
+function pandas_wrap(pyo::Py)
+    # Map the Python class of `pyo` to the matching Julia wrapper type.
+    type_map = Dict{Py,Any}()
+    type_map[pandas.core.frame."DataFrame"] = DataFrame
+    type_map[pandas.core.indexing."_iLocIndexer"] = Iloc
+    type_map[pandas.core.series."Series"] = Series
+    type_map[pandas.core.indexes.multi."MultiIndex"] = MultiIndex
+    type_map[pandas.core.indexes.multi."Index"] = Index
+    type_map[pandas.core.indexing."_LocIndexer"] = Loc
+    type_map[pandas.core.groupby."DataFrameGroupBy"] = GroupBy
+    type_map[pandas.core.groupby."SeriesGroupBy"] = SeriesGroupBy
+    type_map[pandas.core.window."Rolling"] = Rolling
+    type_map[pandas.io.pytables.HDFStore] = HDFStore
+    for (pyt, pyv) in type_map
+        pyt === nothing && continue
+        if pyisinstance(pyo, pyt)
+            return pyv(pyo)
+        end
+    end
+    return pyconvert(Any, pyo)
+end
+
+pandas_wrap(x::Union{AbstractArray,Tuple}) = [pandas_wrap(_) for _ in x]
+
+pandas_wrap(pyo) = pyo
+
+fix_arg(x::StepRange) = pybuiltins.slice(x.start, x.start + length(x) * x.step, x.step)
+fix_arg(x::UnitRange) = fix_arg(StepRange(x.start, 1, x.stop))
+fix_arg(x::Colon) = pybuiltins.slice(nothing, nothing, nothing)
+fix_arg(x) = x
+
+function fix_arg(x, offset)
+    if offset
+        fix_arg(x .- 1)
+    else
+        fix_arg(x)
+    end
+end
+
+fix_arg(x::Colon, offset) = pybuiltins.slice(nothing, nothing, nothing)
+
+pyattr(class, method) = pyattr(class, method, method)
+
+function pyattr(class, jl_method, py_method)
+    quote
+        function $(esc(jl_method))(pyt::$class, args...; kwargs...)
+            new_args = fix_arg.(args)
+            method = pyt.pyo.$(string(py_method))
+            pyo = method(new_args...; kwargs...)
+            return pandas_wrap(pyo)
+        end
+    end
+end
+
+macro pyattr(class, method)
+    return pyattr(class, method)
+end
+
+macro pyattr(class, method, orig_method)
+    return pyattr(class, method, orig_method)
+end
+
+"""
+    pyattr_set(types, methods...)
+
+For each Julia type `T<:PandasWrapped` in `types` and each method `m` in `methods`,
+define a new function `m(t::T, args...)` that delegates to the underlying
+`Py` wrapped by `t`.
+"""
+function pyattr_set(classes, methods...)
+    for class in classes
+        for method in methods
+            @eval @pyattr($class, $method)
+        end
+    end
+end
+
+macro pyasvec(class)
+    index_expr = quote
+        function $(esc(:getindex))(pyt::$class, args...)
+            offset = should_offset(pyt, args...)
+            new_args = tuple([fix_arg(arg, offset) for arg in args]...)
+            new_args = (length(new_args) == 1 ? new_args[1] : new_args)
+            pyo = pyt.pyo.__getitem__(new_args)
+            return pandas_wrap(pyo)
+        end
+
+        function $(esc(:setindex!))(pyt::$class, value, idxs...)
+            offset = should_offset(pyt, idxs...)
+            new_idx = [fix_arg(idx, offset) for idx in idxs]
+            if length(new_idx) > 1
+                pandas_wrap(pyt.pyo.__setitem__(tuple(new_idx...), value))
+            else
+                pandas_wrap(pyt.pyo.__setitem__(new_idx[1], value))
+            end
+        end
+    end
+    if class in [:Iloc, :Loc, :Ix]
+        length_expr = quote
+            function $(esc(:length))(x::$class)
+                return pyconvert(Int, x.pyo.obj.__len__()) + 1
+            end
+        end
+    else
+        length_expr = quote
+            function $(esc(:length))(x::$class)
+                return pyconvert(Int, x.pyo.__len__())
+            end
+        end
+    end
+
+    quote
+        $index_expr
+        $length_expr
+        function $(esc(:lastindex))(x::$class)
+            return length(x)
+        end
+    end
+end
+
+@pytype DataFrame () -> pandas.core.frame."DataFrame"
+@pytype Iloc () -> pandas.core.indexing."_iLocIndexer"
+@pytype Series () -> pandas.core.series."Series"
+@pytype Ix () -> version() < VersionNumber(1) ? pandas.core.indexing."_IXIndexer" :
+                 nothing
+@pytype MultiIndex () -> version() < VersionNumber(1) ? pandas.core.index."MultiIndex" :
+                         pandas.core.indexes.multi."MultiIndex"
+@pytype Index () -> version() < VersionNumber(1) ? pandas.core.index."Index" :
+                    pandas.core.indexes.multi."Index"
+@pytype Loc () -> pandas.core.indexing."_LocIndexer"
+@pytype GroupBy () -> pandas.core.groupby."DataFrameGroupBy"
+@pytype SeriesGroupBy () -> pandas.core.groupby."SeriesGroupBy"
+@pytype Rolling () -> pandas.core.window."Rolling"
+@pytype HDFStore () -> pandas.io.pytables.HDFStore
+
+@pyattr GroupBy app apply
+@pyattr Rolling app apply
+
+pyattr_set([GroupBy, SeriesGroupBy], :mean, :std, :agg, :aggregate, :median,
+           :var, :ohlc, :transform, :groups, :indices, :get_group, :hist, :plot, :count)
+
+pyattr_set([Rolling], :agg, :aggregate, :apply, :corr, :count, :cov, :kurt, :max, :mean,
+           :median, :min, :ndim, :quantile, :skew, :std, :sum, :validate, :var)
+
+@pyattr GroupBy siz size
+
+pyattr_set([DataFrame, Series], :T, :abs, :align, :any, :argsort, :asfreq, :asof,
+           :boxplot, :clip, :clip_lower, :clip_upper, :corr, :corrwith, :count, :cov,
+           :cummax, :cummin, :cumprod, :cumsum, :delevel, :describe, :diff, :drop,
+           :drop_duplicates, :dropna, :duplicated, :fillna, :filter, :first,
+           :first_valid_index,
+           :head, :hist, :idxmax, :idxmin, :iloc, :isin, :join, :last, :last_valid_index,
+           :loc, :mean, :median, :min, :mode, :order, :pct_change, :pivot, :plot, :quantile,
+           :rank, :reindex, :reindex_axis, :reindex_like, :rename, :reorder_levels,
+           :replace, :resample, :reset_index, :sample, :select, :set_index, :shift, :skew,
+           :sort, :sort_index, :sortlevel, :stack, :std, :sum, :swaplevel, :tail, :take,
+           :to_clipboard, :to_csv, :to_dense, :to_dict, :to_excel, :to_gbq, :to_hdf,
+           :to_html,
+           :to_json, :to_latex, :to_msgpack, :to_panel, :to_pickle, :to_records, :to_sparse,
+           :to_sql, :to_string, :truncate, :tz_convert, :tz_localize, :unstack, :var,
+           :weekday,
+           :xs, :merge, :equals, :to_parquet)
+pyattr_set([DataFrame], :groupby)
+pyattr_set([Series, DataFrame], :rolling)
+pyattr_set([HDFStore], :put, :append, :get, :select, :info, :keys, :groups, :walk, :close)
+
+Base.size(x::Union{Loc,Iloc,Ix}) = pyconvert(Tuple, x.pyo.obj.shape)
+Base.size(df::PandasWrapped, i::Integer) = size(df)[i]
+Base.size(df::PandasWrapped) = pyconvert(Tuple, df.pyo.shape)
+
+Base.isempty(df::PandasWrapped) = pyconvert(Bool, df.pyo.empty)
+Base.empty!(df::PandasWrapped) = df.pyo.drop(df.pyo.index; inplace=true)
+
+should_offset(::Any, args...) = false
+should_offset(::Union{Iloc,Index}, args...) = true
+
+function should_offset(s::Series, arg)
+    if eltype(arg) == Int64
+        if eltype(index(s)) ≠ Int64
+            return true
+        end
+    end
+    return false
+end
+
+for attr in [:index, :columns]
+    @eval function $attr(x::PandasWrapped)
+        return pyconvert(Array, x.pyo.$(string(attr)).values)
+    end
+end
+
+@pyasvec Series
+@pyasvec Loc
+@pyasvec Ix
+@pyasvec Iloc
+@pyasvec DataFrame
+@pyasvec Index
+@pyasvec GroupBy
+@pyasvec Rolling
+@pyasvec HDFStore
+
+Base.ndims(df::Union{DataFrame,Series}) = length(size(df))
+
+for m in
+    [:read_pickle, :read_csv, :read_gbq, :read_html, :read_json, :read_excel, :read_table,
+     :save, :stats, :melt, :ewma, :concat, :pivot_table, :crosstab, :cut,
+     :qcut, :get_dummies, :resample, :date_range, :to_datetime, :to_timedelta,
+     :bdate_range, :period_range, :ewmstd, :ewmvar, :ewmcorr, :ewmcov, :rolling_count,
+     :expanding_count, :rolling_sum, :expanding_sum, :rolling_mean, :expanding_mean,
+     :rolling_median, :expanding_median, :rolling_var, :expanding_var, :rolling_std,
+     :expanding_std, :rolling_min, :expanding_min, :rolling_max, :expanding_max,
+     :rolling_corr, :expanding_corr, :rolling_corr_pairwise, :expanding_corr_pairwise,
+     :rolling_cov, :expanding_cov, :rolling_skew, :expanding_skew, :rolling_kurt,
+     :expanding_kurt, :rolling_apply, :expanding_apply, :rolling_quantile,
+     :expanding_quantile, :rolling_window, :to_numeric, :read_sql, :read_sql_table,
+     :read_sql_query, :read_hdf, :read_parquet]
+    @eval begin
+        function $m(args...; kwargs...)
+            method = pandas.$(string(m))
+            result = method(args...; kwargs...)
+            return pandas_wrap(result)
+        end
+    end
+end
+
+function show(io::IO, df::PandasWrapped)
+    s = pyconvert(String, df.pyo.__str__())
+    return println(io, s)
+end
+
+function show(io::IO, ::MIME"text/html", df::PandasWrapped)
+    obj = df.pyo
+    try
+        return println(io, pyconvert(String, obj.to_html()))
+    catch
+        return show(io, df)
+    end
+end
+
+function query(df::DataFrame, s::AbstractString)
+    return pandas_wrap(@pyexec (df=df.pyo, s="$s") => `res = df.query(s)` => res)
+end
+
+function query(df::DataFrame, e::Expr) # This whole method is a terrible hack
+    s = string(e)
+    for (target, repl) in [("&&", "&"), ("||", "|"), ("∈", "=="), (r"!(?!=)", "~")]
+        s = replace(s, target => repl)
+    end
+    return query(df, s)
+end
+
+macro query(df, e)
+    quote
+        query($(esc(df)), $(QuoteNode(e)))
+    end
+end
+
+for m in [:from_arrays, :from_tuples]
+    @eval function $m(args...; kwargs...)
+        f = pygetattr(pandas."MultiIndex", string($(quot(m))))
+        res = f(args...; kwargs...)
+        return pandas_wrap(res)
+    end
+end
+
+for (jl_op, py_op, py_opᵒ) in [(:+, :__add__, :__add__), (:*, :__mul__, :__mul__),
+                               (:/, :__truediv__, :__rtruediv__), (:-, :__sub__, :__rsub__),
+                               (:>, :__gt__, :__lt__), (:<, :__lt__, :__gt__),
+                               (:>=, :__ge__, :__le__), (:<=, :__le__, :__ge__),
+                               (:&, :__and__, :__and__), (:|, :__or__, :__or__)]
+    @eval begin
+        function $(jl_op)(x::PandasWrapped, y)
+            res = x.pyo.$(string(py_op))(y)
+            return pandas_wrap(res)
+        end
+
+        function $(jl_op)(x::PandasWrapped, y::PandasWrapped)
+            return invoke($(jl_op), Tuple{PandasWrapped,Any}, x, y)
+        end
+
+        function $(jl_op)(y, x::PandasWrapped)
+            res = x.pyo.$(string(py_opᵒ))(y)
+            return pandas_wrap(res)
+        end
+    end
+end
+
+# Special-case the handling of equality-testing to always consider PandasWrapped
+# objects as unequal to non-wrapped objects.
+(==)(x::PandasWrapped, y) = false
+(==)(x, y::PandasWrapped) = false
+(!=)(x::PandasWrapped, y) = true
+(!=)(x, y::PandasWrapped) = true
+function (==)(x::PandasWrapped, y::PandasWrapped)
+    return pandas_wrap(x.pyo.__eq__(y))
+end
+function (!=)(x::PandasWrapped, y::PandasWrapped)
+    return pandas_wrap(x.pyo.__ne__(y))
+end
+
+for op in [(:-, :__neg__)]
+    @eval begin
+        $(op[1])(x::PandasWrapped) = pandas_wrap(x.pyo.$(quot(op[2]))())
+    end
+end
+
+function setcolumns!(df::PandasWrapped, new_columns)
+    return df.pyo.__setattr__("columns", new_columns)
+end
+
+function deletecolumn!(df::DataFrame, column)
+    return df.pyo.__delitem__(column)
+end
+
+name(s::Series) = s.pyo.name
+name!(s::Series, name) = s.pyo.name = name
+
+include("operators.jl")
+
+function DataFrame(pairs::Pair...)
+    # Use an OrderedDict so that the column order of the resulting Python
+    # DataFrame matches the order in which the pairs were given.
+    return DataFrame(OrderedDict(pairs...))
+end
+
+function index!(df::PandasWrapped, new_index)
+    df.pyo.index = new_index
+    return df
+end
+
+function Base.eltype(s::Series)
+    dtype_map = Dict(np.dtype("int64") => Int64,
+                     np.dtype("float64") => Float64,
+                     np.dtype("object") => String)
+    return get(dtype_map, s.pyo.dtype, Any)
+end
+
+function Base.eltype(df::DataFrame)
+    types = []
+    for column in columns(df)
+        push!(types, eltype(df[column]))
+    end
+    return Tuple{types...}
+end
+
+function Base.map(f::Function, s::Series)
+    if eltype(s) ∈ (Int64, Float64)
+        Series([f(_) for _ in values(s)])
+    else
+        Series([f(_) for _ in s])
+    end
+end
+
+function Base.map(x, s::Series; na_action=nothing)
+    return pandas_wrap(s.pyo.map(x, na_action))
+end
+
+function Base.get(df::PandasWrapped, key, default)
+    return pandas_wrap(df.pyo.get(key; default=default))
+end
+
+function Base.getindex(s::Series, c::CartesianIndex{1})
+    return s[c[1]]
+end
+
+function Base.copy(df::PandasWrapped)
+    return pandas_wrap(df.pyo.copy())
+end
+
+# `values` is used by the Tables/TableTraits glue below and by the tests.
+Base.values(x::PandasWrapped) = Array(x)
+
+function !(df::PandasWrapped)
+    # Logical negation maps to Python's `~` (invert), which is what pandas
+    # uses for elementwise boolean negation.
+    return pandas_wrap(df.pyo.__invert__())
+end
+
+include("tabletraits.jl")
+include("tables.jl")
+
+function DataFrame(obj)
+    y = _construct_pandas_from_iterabletable(obj)
+    if y === nothing
+        y = _construct_pandas_from_tables(obj)
+        if y === nothing
+            return invoke(DataFrame, Tuple{Vararg{Any}}, obj)
+        else
+            return y
+        end
+    else
+        return y
+    end
+end
+
+function has_named_attr(x::Index, s)
+    return pyconvert(Bool, x.pyo.__contains__(string(s)))
+end
+
+named_index(x::DataFrame) = columns(x)
+named_index(x::Series) = index(x)
+
+has_named_attr(x::Py, s::Symbol) = pyhasattr(x, string(s))
+
+function Base.getproperty(x::Union{DataFrame,Series}, s::Symbol)
+    if s == :pyo
+        return getfield(x, s)
+    end
+    if pyhasattr(x.pyo, string(s))
+        return pandas_wrap(pygetattr(x.pyo, string(s)))
+    else
+        return getfield(x, s)
+    end
+end

 end

diff --git a/src/CuDF/README.md b/src/CuDF/README.md
new file mode 100644
index 0000000..02da027
--- /dev/null
+++ b/src/CuDF/README.md
@@ -0,0 +1,3 @@
+# CuDF
+
+A Julia interface to GPU DataFrames, ported from [Pandas.jl](https://github.com/JuliaPy/Pandas.jl) and accelerated via `cudf.pandas`.

diff --git a/src/CuDF/exports.jl b/src/CuDF/exports.jl
new file mode 100644
index 0000000..5fa0510
--- /dev/null
+++ b/src/CuDF/exports.jl
@@ -0,0 +1,209 @@
+export
+    abs,
+    align,
+    any,
+    argsort,
+    clip,
+    clip_lower,
+    clip_upper,
+    corr,
+    corrwith,
+    count,
+    cov,
+    cummax,
+    cummin,
+    cumprod,
+    cumsum,
+    drop_duplicates,
+    duplicated,
+    filter,
+    first,
+    head,
+    hist,
+    idxmax,
+    idxmin,
+    iloc,
+    index,
+    join,
+    last,
+    loc,
+    plot,
+    reindex,
+    reindex_axis,
+    reindex_like,
+    rename,
+    resample,
+    reset_index,
+    rolling_count,
+    select,
+    set_index,
+    sort_index,
+    sum,
+    tail,
+    to_csv,
+    to_latex,
+    truncate,
+    xs,
+    GroupBy,
+    Rolling,
+    Iloc,
+    Index,
+    Loc,
+    MultiIndex,
+    Series,
+    agg,
+    aggregate,
+    get_group,
+    groups,
+    indices,
+    mean,
+    median,
+    ohlc,
+    pivot_table,
+    read_csv,
+    std,
+    transform,
+    var,
+    melt,
+    @>,
+    @query,
+    app,
+    columns,
+    concat,
+    crosstab,
+    cut,
+    deletecolumn!,
+    ewma,
+    expanding_apply,
+    expanding_corr,
+    expanding_corr_pairwise,
+    expanding_count,
+    expanding_cov,
+    expanding_kurt,
+    expanding_max,
+    expanding_mean,
+    expanding_median,
+    expanding_min,
+    expanding_quantile,
+    expanding_skew,
+    expanding_std,
+    expanding_sum,
+    expanding_var,
+    from_arrays,
+    from_tuples,
+    get_dummies,
+    groupby,
+    rolling,
+    index!,
+    isin,
+    name,
+    name!,
+    qcut,
+    read_excel,
+    read_gbq,
+    read_html,
+    read_json,
+    read_pickle,
+    read_table,
+    rolling_apply,
+    rolling_corr,
+    rolling_corr_pairwise,
+    rolling_cov,
+    rolling_kurt,
+    rolling_max,
+    rolling_mean,
+    rolling_median,
+    rolling_min,
+    rolling_quantile,
+    rolling_skew,
+    rolling_std,
+    rolling_sum,
+    rolling_var,
+    rolling_window,
+    sample,
+    save,
+    setcolumns!,
+    siz,
+    stats,
+    to_clipboard,
+    to_numeric,
+    values,
+    DataFrame,
+    T,
+    asfreq,
+    asof,
+    bdate_range,
+    date_range,
+    delevel,
+    describe,
+    diff,
+    dropna,
+    ewmcorr,
+    ewmcov,
+    ewmstd,
+    ewmvar,
+    fillna,
+    first_valid_index,
+    last_valid_index,
+    min,
+    mode,
+    order,
+    pct_change,
+    period_range,
+    pivot,
+    quantile,
+    query,
+    rank,
+    reorder_levels,
+    replace,
+    shift,
+    skew,
+    sort,
+    sortlevel,
+    stack,
+    swaplevel,
+    to_datetime,
+    to_dense,
+    to_dict,
+    to_excel,
+    to_gbq,
+    to_hdf,
+    to_html,
+    to_json,
+    to_msgpack,
+    to_panel,
+    to_pickle,
+    to_records,
+    to_sparse,
+    to_sql,
+    to_string,
+    to_timedelta,
+    tz_convert,
+    tz_localize,
+    unstack,
+    weekday,
+    read_sql,
+    read_hdf,
+    HDFStore,
+    info,
+    put,
+    walk,
+    equals

+if !isdefined(Base, :drop)
+    export drop
+end
+
+if !isdefined(Base, :take)
+    export take
+end

diff --git a/src/CuDF/operators.jl b/src/CuDF/operators.jl
new file mode 100644
index 0000000..c04e745
--- /dev/null
+++ b/src/CuDF/operators.jl
@@ -0,0 +1,10 @@
+import Base: ==, >, <, >=, <=, !=
+
+for (op, pyop) in
+    [(:(==), :__eq__), (:>, :__gt__), (:<, :__lt__), (:>=, :__ge__), (:<=, :__le__),
+     (:!=, :__ne__)]
+    @eval function Base.broadcast(::typeof($op), s::PandasWrapped, x)
+        method = s.pyo.$(QuoteNode(pyop))
+        return pandas_wrap(method(x))
+    end
+end

diff --git a/src/CuDF/tables.jl b/src/CuDF/tables.jl
new file mode 100644
index 0000000..a2574de
--- /dev/null
+++ b/src/CuDF/tables.jl
@@ -0,0 +1,14 @@
+using Tables
+
+function _construct_pandas_from_tables(source)
+    if (!Tables.istable(source))
+        return nothing
+    end
+    source_columns = Tables.columns(source)
+    # Use an OrderedDict so the column order of the source is preserved.
+    source_as_dict = OrderedDict(column => Tables.getcolumn(source_columns, column)
+                                 for column in Tables.columnnames(source_columns))
+    return DataFrame(source_as_dict)
+end
+
+Tables.columnaccess(::DataFrame) = true
+Tables.istable(::DataFrame) = true
+Tables.columns(df::DataFrame) = df
+Tables.columnnames(df::DataFrame) = Symbol.(columns(df))
+Tables.getcolumn(df::DataFrame, nm::Symbol) = values(df[string(nm)])
+Tables.getcolumn(df::DataFrame, i::Int) = Tables.getcolumn(df, Tables.columnnames(df)[i])

diff --git a/src/CuDF/tabletraits.jl b/src/CuDF/tabletraits.jl
new file mode 100644
index 0000000..ef376d3
--- /dev/null
+++ b/src/CuDF/tabletraits.jl
@@ -0,0 +1,58 @@
+using IteratorInterfaceExtensions
+using TableTraitsUtils
+using OrderedCollections: OrderedDict
+using DataValues: DataValues
+
+IteratorInterfaceExtensions.isiterable(x::DataFrame) = true
+TableTraits.isiterabletable(x::DataFrame) = true
+
+function TableTraits.getiterator(df::DataFrame)
+    col_names_raw = [i for i in columns(df)]
+    col_names = Symbol.(col_names_raw)
+
+    column_data = [eltype(df[i]) == String ? [df[i][j] for j in 1:length(df)] :
+                   values(df[i]) for i in col_names_raw]
+
+    return create_tableiterator(column_data, col_names)
+end
+
+TableTraits.supports_get_columns_copy_using_missing(df::DataFrame) = true
+
+function TableTraits.get_columns_copy_using_missing(df::DataFrame)
+    # return a named tuple of columns here
+    col_names_raw = [i for i in columns(df)]
+    col_names = Symbol.(col_names_raw)
+    cols = (Array(eltype(df[i]) == String ? [df[i][j] for j in 1:length(df)] : df[i]) for i in
+            col_names_raw)
+    return NamedTuple{tuple(col_names...)}(tuple(cols...))
+end
+
+function _construct_pandas_from_iterabletable(source)
+    y = create_columns_from_iterabletable(source; errorhandling=:returnvalue)
+    y === nothing && return nothing
+    columns, column_names = y[1], y[2]
+    cols = OrderedDict{Symbol,Any}(i[1] => i[2] for i in zip(column_names, columns))
+
+    for (k, v) in pairs(cols)
+        if eltype(v) <: DataValues.DataValue
+            T = eltype(eltype(v))
+            if T <: AbstractFloat
+
+                # Issue 71
+                # If the column is all 'missing' values, we have to decide what type
+                # to give it in pandas. We arbitrarily default to Float64.
+                if T == Union{}
+                    T = Float64
+                end
+
+                cols[k] = T[get(i, NaN) for i in v]
+            elseif T <: Integer
+                cols[k] = Float64[DataValues.isna(i) ? NaN : Float64(get(i)) for i in v]
+            else
+                throw(ArgumentError("Can't create a CuDF.DataFrame from a source that has missing data."))
+            end
+        end
+    end
+
+    return DataFrame(cols)
+end

diff --git a/src/CuDF/util.jl b/src/CuDF/util.jl
new file mode 100644
index 0000000..3757407
--- /dev/null
+++ b/src/CuDF/util.jl
@@ -0,0 +1,11 @@
+function gen_window_names()
+    s_set = String[]
+    for m in ["count", "sum", "mean", "median", "var", "std", "min", "max", "corr",
+              "corr_pairwise", "cov", "skew", "kurt", "apply", "quantile"]
+        for f in ["rolling", "expanding"]
+            s = string(":", f, "_", m)
+            push!(s_set, s)
+        end
+    end
+    return join(s_set, ", ")
+end

diff --git a/src/CuML/README.md b/src/CuML/README.md
new file mode 100644
index 0000000..ab437b8
--- /dev/null
+++ b/src/CuML/README.md
@@ -0,0 +1,75 @@
+# CuML
+
+## Python API
+
+Here is an example of using `LogisticRegression` and `make_classification` via the Python API.
+
+```julia
+using RAPIDS
+const make_classification = cuml.datasets.classification.make_classification
+
+X_py, y_py = make_classification(n_samples=200, n_features=4,
+                                 n_informative=2, n_classes=2)
+lr = cuml.LogisticRegression(max_iter=100)
+lr.fit(X_py, y_py)
+preds = lr.predict(X_py)
+
+print(lr.coef_)
+```
+
+## MLJ Interface
+
+An MLJ interface is also available for supported models. The model hyperparameters are the same as described in the [cuML docs](https://docs.rapids.ai/api/cuml/stable/api.html). The only difference is that the models always take and return numpy arrays, which are converted back to Julia arrays (`output_type="input"`).
+
+```julia
+using MLJBase
+using RAPIDS.CuML
+const make_classification = cuml.datasets.classification.make_classification
+
+X_py, y_py = make_classification(n_samples=200, n_features=4,
+                                 n_informative=2, n_classes=2)
+X = RAPIDS.pyconvert(Matrix{Float32}, X_py.get())
+y = RAPIDS.pyconvert(Vector{Float32}, y_py.get().flatten())
+
+lr = LogisticRegression(max_iter=100)
+mach = machine(lr, X, y)
+fit!(mach)
+preds = predict(mach, X)
+
+print(mach.fitresult.coef_)
+```
+
+MLJ Support:
+- Clustering
+    - `KMeans`
+    - `DBSCAN`
+    - `AgglomerativeClustering`
+    - `HDBSCAN`
+- Classification
+    - `LogisticRegression`
+    - `MBSGDClassifier`
+    - `RandomForestClassifier`
+    - `SVC`
+    - `LinearSVC`
+    - `KNeighborsClassifier`
+- Regression
+    - `LinearRegression`
+    - `Ridge`
+    - `Lasso`
+    - `ElasticNet`
+    - `MBSGDRegressor`
+    - `RandomForestRegressor`
+    - `CD`
+    - `SVR`
+    - `LinearSVR`
+    - `KNeighborsRegressor`
+- Dimensionality Reduction
+    - `PCA`
+    - `IncrementalPCA`
+    - `TruncatedSVD`
+    - `UMAP`
+    - `TSNE`
+    - `GaussianRandomProjection`
+- Time Series
+    - `ExponentialSmoothing`
+    - `ARIMA`

diff --git a/src/RAPIDS.jl b/src/RAPIDS.jl
index 2038762..07c845c 100644
--- a/src/RAPIDS.jl
+++ b/src/RAPIDS.jl
@@ -59,6 +59,8 @@ else
     function __init__()
         PythonCall.pycopy!(cucim, pyimport("cucim"))
        PythonCall.pycopy!(cudf, pyimport("cudf"))
+        cudf_pandas = pyimport("cudf.pandas")
+        cudf_pandas.install()
         # PythonCall.pycopy!(cugraph, pyimport("cugraph")) https://github.com/tylerjthomas9/RAPIDS.jl/issues/37
         PythonCall.pycopy!(cuml, pyimport("cuml"))
         PythonCall.pycopy!(cuspatial, pyimport("cuspatial"))

diff --git a/test.jl b/test.jl
new file mode 100644
index 0000000..cc450cf
--- /dev/null
+++ b/test.jl
@@ -0,0 +1,96 @@
+using Revise
+using MLJBase
+using MLJTestInterface
+using RAPIDS
+using RAPIDS.CuML
+using RAPIDS.CuDF
+using Tables
+using Test
+
+import DataFrames
+using PythonCall
+using Dates
+
+df = DataFrame(Dict(:name => ["a", "b"], :age => [27, 30]))
+age = values(df.age)
+df.age[1] = 100
+@test loc(df)[1, "age"] == 100
+
+query(df, :(age != 27)) # Issue #26
+
+text = repr(MIME("text/html"), df)
+@test text isa String
+@test occursin("<table", text)
+
+py_df = DataFrame(Dict(:x => [1, 2], :y => [NaN, NaN]))[["x", "y"]]
+expected_df = DataFrame(:x => [1, 2], :y => [NaN, NaN])[["x", "y"]]
+@test equals(py_df, expected_df)
+
+# Issue #68
+py_globals = pydict()
+pyexec("""
+import pandas as pd
+
+def get_df():
+    df = pd.DataFrame({
+        "a": pd.to_datetime(["2021.01.15", "2021.01.15", "2020.04.06"])
+    })
+    return df
+""", py_globals)
+
+py_df = DataFrame(py_globals["get_df"]())
+julia_df = DataFrames.DataFrame(py_df)
+
+@test julia_df.a == [DateTime(2021, 1, 15), DateTime(2021, 1, 15), DateTime(2020, 4, 6)]
+
+# Issue #72
+julia_df = DataFrames.DataFrame(C=1:4, A=5:8, B=9:12)
+py_df = DataFrame(julia_df)
+@test all(columns(py_df) .== ["C", "A", "B"])
+
+df1 = Series(1:2)
+df2 = Series(1:2)
+df3 = Series(3:4)
+
+@test all(df1 == df1)
+@test all(df1 == df2)
+@test df1 != [1, 2]
+
+# Issue #93
+df = DataFrame(:a => [1, 2], :b => [4, 5], :c => ["a", "b"])
+@test values(df) == [1 4 "a"; 2 5 "b"]
\ No newline at end of file

diff --git a/test/cudf.jl b/test/cudf.jl
index 8b13789..eeece73 100644
--- a/test/cudf.jl
+++ b/test/cudf.jl
@@ -1 +1,86 @@
+using Test
+import DataFrames
+using PythonCall
+using Dates

+df = DataFrame(Dict(:name => ["a", "b"], :age => [27, 30]))
+age = values(df.age)
+df.age[1] = 100
+@test loc(df)[1, "age"] == 100
+
+query(df, :(age != 27)) # Issue #26
+
+text = repr(MIME("text/html"), df)
+@test text isa String
+@test occursin("<table", text)
+
+py_df = DataFrame(Dict(:x => [1, 2], :y => [NaN, NaN]))[["x", "y"]]
+expected_df = DataFrame(:x => [1, 2], :y => [NaN, NaN])[["x", "y"]]
+@test equals(py_df, expected_df)
+
+# Issue #68
+py_globals = pydict()
+pyexec("""
+import pandas as pd
+
+def get_df():
+    df = pd.DataFrame({
+        "a": pd.to_datetime(["2021.01.15", "2021.01.15", "2020.04.06"])
+    })
+    return df
+""", py_globals)
+
+py_df = DataFrame(py_globals["get_df"]())
+julia_df = DataFrames.DataFrame(py_df)
+
+@test julia_df.a == [DateTime(2021, 1, 15), DateTime(2021, 1, 15), DateTime(2020, 4, 6)]
+
+# Issue #72
+julia_df = DataFrames.DataFrame(C=1:4, A=5:8, B=9:12)
+py_df = DataFrame(julia_df)
+@test all(columns(py_df) .== ["C", "A", "B"])
+
+df1 = Series(1:2)
+df2 = Series(1:2)
+df3 = Series(3:4)
+
+@test all(df1 == df1)
+@test all(df1 == df2)
+@test df1 != [1, 2]
+
+# Issue #93
+df = DataFrame(:a => [1, 2], :b => [4, 5], :c => ["a", "b"])
+@test values(df) == [1 4 "a"; 2 5 "b"]
\ No newline at end of file

diff --git a/test/cudf_tables.jl b/test/cudf_tables.jl
new file mode 100644
index 0000000..4d7c41d
--- /dev/null
+++ b/test/cudf_tables.jl
@@ -0,0 +1,29 @@
+using CSV
+using Tables
+using Test
+
+@testset "tables" begin
+    file = IOBuffer("""Temp;Val;Gr
+20;7863;1
+100;7834;1
+200;7803;1""")
+    csv = CSV.File(file; delim=';', types=[Float64, Float64, Int])
+    df = DataFrame(csv)
+    expected_df = DataFrame(:Temp => [20.0, 100.0, 200.0], :Val => [7863.0, 7834.0, 7803.0],
+                            :Gr => [1, 1, 1])
+    @test equals(df, expected_df)
+
+    @test Tables.istable(df)
+    df_cols = Tables.columns(df)
+    @test Tables.getcolumn(df_cols, :Gr) == [1, 1, 1]
+    @test Tables.getcolumn(df_cols, :Val) == [7863.0, 7834.0, 7803.0]
+    @test Tables.getcolumn(df_cols, :Temp) == [20.0, 100.0, 200.0]
+
+    @test Tables.columnaccess(df)
+    @test Tables.istable(df)
+
+    ct = Tables.columntable(df)
+    rt = Tables.rowtable(df)
+    @test Tables.columntable(rt) == ct
+    @test length(ct) == 3
+    @test ct.Val == [7863.0, 7834.0, 7803.0]
+    @test ct.Temp == [20.0, 100.0, 200.0]
+    @test ct.Gr == [1, 1, 1]
+end
\ No newline at end of file

diff --git a/test/cudf_tabletraits.jl b/test/cudf_tabletraits.jl
new file mode 100644
index 0000000..b5436a7
--- /dev/null
+++ b/test/cudf_tabletraits.jl
@@ -0,0 +1,59 @@
+using IteratorInterfaceExtensions
+using TableTraits
+using DataValues
+using Test
+
+@testset "TableTraits" begin
+
+table_array = [(a=1, b="John", c=3.2), (a=2, b="Sally", c=5.8)]
+
+df = DataFrame(table_array)
+
+@test collect(columns(df)) == ["a", "b", "c"]
+@test values(df[:a]) == [1, 2]
+@test values(df[:c]) == [3.2, 5.8]
+
+# TODO(malmaud): Understand why this line makes the Windows CI fail
+if !Sys.iswindows()
+    @test [df[:b][i] for i in 1:2] == ["John", "Sally"]
+end
+
+@test TableTraits.isiterabletable(df) == true
+
+it = IteratorInterfaceExtensions.getiterator(df)
+
+@test eltype(it) == NamedTuple{(:a, :b, :c),Tuple{Int,String,Float64}}
+
+it_collected = collect(it)
+
+@test eltype(it_collected) == NamedTuple{(:a, :b, :c),Tuple{Int,String,Float64}}
+@test length(it_collected) == 2
+@test it_collected[1] == (a=1, b="John", c=3.2)
+@test it_collected[2] == (a=2, b="Sally", c=5.8)
+
+@test TableTraits.supports_get_columns_copy_using_missing(df) == true
+cols = TableTraits.get_columns_copy_using_missing(df)
+@test cols == (a=[1, 2], b=["John", "Sally"], c=[3.2, 5.8])
+
+table_array2 = [(a=1, b=DataValue("John"), c=3.2), (a=2, b=DataValue("Sally"), c=5.8)]
+
+@test_throws ArgumentError DataFrame(table_array2)
+
+table_array3 = [(a=DataValue{Int}(), b="John", c=DataValue(3.2)),
+                (a=DataValue(2), b="Sally", c=DataValue{Float64}())]
+
+df3 = DataFrame(table_array3)
+
+it3_collected = collect(IteratorInterfaceExtensions.getiterator(df3))
+
+@test length(it3_collected) == 2
+@test isnan(it3_collected[1].a)
+@test it3_collected[1].b == "John"
+@test it3_collected[1].c == 3.2
+@test it3_collected[2].a == 2
+@test it3_collected[2].b == "Sally"
+@test isnan(it3_collected[2].c)
+
+cols3 = TableTraits.get_columns_copy_using_missing(df3)
+@test isequal(cols3, (a=[NaN, 2.0], b=["John", "Sally"], c=[3.2, NaN]))
+
+end
\ No newline at end of file

diff --git a/test/runtests.jl b/test/runtests.jl
index a51fba0..63fe343 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -6,10 +6,11 @@ if CUDA.functional()
     using MLJTestInterface
     using RAPIDS
     using RAPIDS.CuML
+    using RAPIDS.CuDF
     using Tables
     using Test

-    include("cudf.jl")
+    # include("cudf.jl")
     include("cuml.jl")
     include("cuml_integration.jl")

diff --git a/test/test.csv b/test/test.csv
new file mode 100644
index 0000000..d68cd10
--- /dev/null
+++ b/test/test.csv
@@ -0,0 +1,3 @@
+x,y,z
+1,2,test
+3,4,string
\ No newline at end of file

From ad4e5a3d6f5f73374d1bb8ca64cdcf2e8d5f2181 Mon Sep 17 00:00:00 2001
From: bcicc
Date: Sun, 21 Apr 2024 12:15:18 -0700
Subject: [PATCH 5/5] bump python, cuml, cuda bounds

---
 CondaPkg.toml    | 16 ++++++++--------
 src/CuDF/CuDF.jl |  4 ++--
 src/utils.jl     |  2 +-
 test/cuml.jl     | 14 +++++++-------
 4 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/CondaPkg.toml b/CondaPkg.toml
index bcc4cfe..f62a68a 100644
--- a/CondaPkg.toml
+++ b/CondaPkg.toml
@@ -1,14 +1,14 @@
 channels = ["rapidsai", "nvidia", "conda-forge"]

 [deps]
-cudf = "=23.10"
-cuxfilter = "=23.10"
-cucim = "=23.10"
-cuspatial = "=23.10"
-cugraph = "=23.10"
-cuml = "=23.10"
-python = ">=3.9,<3.11"
+cudf = "=24.04"
+cuxfilter = "=24.04"
+cucim = "=24.04"
+cuspatial = "=24.04"
+cugraph = "=24.04"
+cuml = "=24.04"
+python = ">=3.9,<3.12"

 [deps.cuda-version]
 channel = "conda-forge"
-    version = "=12.0"
+    version = "=12.2"

diff --git a/src/CuDF/CuDF.jl b/src/CuDF/CuDF.jl
index e0ec616..642076d 100644
--- a/src/CuDF/CuDF.jl
+++ b/src/CuDF/CuDF.jl
@@ -1,10 +1,10 @@
 module CuDF

-using Dates
+using Dates: Dates, DateTime
 using PythonCall
 using Lazy
-using Compat
+using Compat: Compat
 using TableTraits
 using Statistics

diff --git a/src/utils.jl b/src/utils.jl
index 1f94a37..0e0db6d 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -1,6 +1,6 @@
 # List of CUDA versions supported by RAPIDSAI
-const supported_versions = ["11.2", "11.8", "12.0"]
+const supported_versions = ["11.2", "11.8", "12.0", "12.2"]

 function find_closest_supported_version(major, supported_versions)
     major_versions = filter(v -> startswith(v, major), supported_versions)

diff --git a/test/cuml.jl b/test/cuml.jl
index 56e4e33..3bed40d 100644
--- a/test/cuml.jl
+++ b/test/cuml.jl
@@ -170,7 +170,7 @@ end
     model = PCA()
     mach = machine(model, X)
     fit!(mach)
-    X_trans = transform(mach, X)
+    X_trans = MLJBase.transform(mach, X)
     inverse_transform(mach, X)
 end

@@ -178,14 +178,14 @@
     model = IncrementalPCA()
     mach = machine(model, X)
     fit!(mach)
-    X_trans = transform(mach, X)
+    X_trans = MLJBase.transform(mach, X)
 end

 @testset "TruncatedSVD" begin
     model = TruncatedSVD(; n_components=2)
     mach = machine(model, X)
     fit!(mach)
-    X_trans = transform(mach, X)
+    X_trans = MLJBase.transform(mach, X)
     inverse_transform(mach, X)
 end

@@ -193,28 +193,28 @@ end
     model = UMAP(; n_components=2)
     mach = machine(model, X)
     fit!(mach)
-    X_trans = transform(mach, X)
+    X_trans = MLJBase.transform(mach, X)
 end

 @testset "GaussianRandomProjection" begin
     model = GaussianRandomProjection(; n_components=2)
     mach = machine(model, X)
     fit!(mach)
-    X_trans = transform(mach, X)
+    X_trans = MLJBase.transform(mach, X)
 end

 @testset "SparseRandomProjection" begin
     model = SparseRandomProjection(; n_components=2)
     mach = machine(model, X)
     fit!(mach)
-    X_trans = transform(mach, X)
+    X_trans = MLJBase.transform(mach, X)
 end

 @testset "TSNE" begin
     model = TSNE(; n_components=2)
     mach = machine(model, X)
     fit!(mach)
-    X_trans = transform(mach, X)
+    X_trans = MLJBase.transform(mach, X)
 end
 end