diff --git a/.Rbuildignore b/.Rbuildignore index 4e090adda..6dffd9d4b 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -32,3 +32,7 @@ ^\.editorconfig$ ^rustfmt\.toml$ ^\.lintr\.R$ +^\.mega-linter\.yml$ +^\.lycheeignore$ +^\.yamllint\.yml$ +^\.ecrc$ diff --git a/.ecrc b/.ecrc new file mode 100644 index 000000000..42c2952d5 --- /dev/null +++ b/.ecrc @@ -0,0 +1,21 @@ +{ + "Verbose": false, + "Debug": false, + "IgnoreDefaults": false, + "SpacesAftertabs": false, + "NoColor": false, + "Exclude": [ + "LICENSE\\.note$", + "tests/testthat/_snaps/" + ], + "AllowedContentTypes": [], + "PassedFiles": [], + "Disable": { + "EndOfLine": false, + "Indentation": false, + "IndentSize": false, + "InsertFinalNewline": false, + "TrimTrailingWhitespace": false, + "MaxLineLength": false + } +} diff --git a/.editorconfig b/.editorconfig index 98f40b5ae..882a0d3ca 100644 --- a/.editorconfig +++ b/.editorconfig @@ -3,10 +3,9 @@ root = true [*] insert_final_newline = true end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true [*.md] indent_style = space indent_size = 2 - -[NEWS.md] -indent_size = 4 diff --git a/.github/actions/setup/action.yaml b/.github/actions/setup/action.yaml index 76dcff064..fa2dbae99 100644 --- a/.github/actions/setup/action.yaml +++ b/.github/actions/setup/action.yaml @@ -5,7 +5,7 @@ inputs: description: Set up Rust nightly toolchain? required: false target: - description: Rust target triple to use + description: Rust target triple to use. If empty, the default target is used. required: true token: description: GitHub token diff --git a/.github/workflows/check.yaml b/.github/workflows/check.yaml index 9d680d7f1..4e456c254 100644 --- a/.github/workflows/check.yaml +++ b/.github/workflows/check.yaml @@ -41,6 +41,8 @@ defaults: run: shell: bash +permissions: read-all + jobs: R-CMD-check: runs-on: ${{ matrix.config.os }} @@ -74,16 +76,16 @@ jobs: id: rust-target run: | if [ "${{ runner.os }}" == "Windows" ]; then - echo "TARGET=x86_64-pc-windows-gnu" >>$GITHUB_OUTPUT + echo "TARGET=x86_64-pc-windows-gnu" >>"$GITHUB_OUTPUT" else - echo "TARGET=$(rustc -vV | grep host | cut -d' ' -f2)" >>$GITHUB_OUTPUT + echo "TARGET=$(rustc -vV | grep host | cut -d' ' -f2)" >>"$GITHUB_OUTPUT" fi - name: Set env vars for build option if: matrix.config.full-features run: | - echo "LIBR_POLARS_FEATURES=full_features" >>$GITHUB_ENV - echo "LIBR_POLARS_PROFILE=release" >>$GITHUB_ENV + echo "LIBR_POLARS_FEATURES=full_features" >>"$GITHUB_ENV" + echo "LIBR_POLARS_PROFILE=release" >>"$GITHUB_ENV" - uses: ./.github/actions/setup with: @@ -111,7 +113,7 @@ jobs: NOT_CRAN: "true" run: | Rscript -e 'pkgbuild::compile_dll()' - echo "LIBR_POLARS_PATH=$(pwd)/src/rust/target/${{ steps.rust-target.outputs.TARGET }}/release/libr_polars.a" >>$GITHUB_ENV + echo "LIBR_POLARS_PATH=$(pwd)/src/rust/target/${{ steps.rust-target.outputs.TARGET }}/release/libr_polars.a" >>"$GITHUB_ENV" - uses: r-lib/actions/check-r-package@v2 env: diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index bd2ef5966..43c956876 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -44,6 +44,8 @@ env: LIBR_POLARS_BUILD: "true" LIBR_POLARS_PROFILE: release +permissions: read-all + jobs: documentation: runs-on: ubuntu-latest @@ -63,9 +65,9 @@ jobs: id: rust-target run: | if [ "${{ runner.os }}" == "Windows" ]; then - echo "TARGET=x86_64-pc-windows-gnu" >>$GITHUB_OUTPUT + echo "TARGET=x86_64-pc-windows-gnu" >>"$GITHUB_OUTPUT" else - echo "TARGET=$(rustc -vV | grep host | cut -d' ' -f2)" >>$GITHUB_OUTPUT + 
echo "TARGET=$(rustc -vV | grep host | cut -d' ' -f2)" >>"$GITHUB_OUTPUT" fi - uses: ./.github/actions/setup diff --git a/.github/workflows/mega-linter.yaml b/.github/workflows/mega-linter.yaml new file mode 100644 index 000000000..634646370 --- /dev/null +++ b/.github/workflows/mega-linter.yaml @@ -0,0 +1,84 @@ +--- +# MegaLinter GitHub Action configuration file +# More info at https://megalinter.io +name: MegaLinter + +on: + pull_request: + branches: + - main + push: + branches: + - main + workflow_dispatch: + +concurrency: + group: ${{ github.ref }}-${{ github.workflow }} + cancel-in-progress: true + +permissions: read-all + +jobs: + megalinter: + name: MegaLinter + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout Code + uses: actions/checkout@v4 + with: + token: ${{ secrets.GITHUB_TOKEN }} + fetch-depth: 0 + + - name: MegaLinter + id: ml + # You can override MegaLinter flavor used to have faster performances + # More info at https://megalinter.io/flavors/ + uses: oxsecurity/megalinter/flavors/cupcake@v7.10.0 + env: + # All available variables are described in documentation + # https://megalinter.io/configuration/ + VALIDATE_ALL_CODEBASE: ${{ github.event_name != 'pull_request' }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # ADD YOUR CUSTOM ENV VARIABLES HERE OR DEFINE THEM IN A FILE .mega-linter.yml AT THE ROOT OF YOUR REPOSITORY + # DISABLE: COPYPASTE,SPELL # Uncomment to disable copy-paste and spell checks + + # Upload MegaLinter artifacts + - name: Archive production artifacts + if: success() || failure() + uses: actions/upload-artifact@v4 + with: + name: MegaLinter reports + path: | + megalinter-reports + mega-linter.log + + # Create pull request if applicable (for now works only on PR from same repository, not from forks) + - name: Create Pull Request with applied fixes + id: cpr + if: steps.ml.outputs.has_updated_sources == 1 && (env.APPLY_FIXES_EVENT == 'all' || env.APPLY_FIXES_EVENT == github.event_name) && env.APPLY_FIXES_MODE == 'pull_request' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && !contains(github.event.head_commit.message, 'skip fix') + uses: peter-evans/create-pull-request@v6 + with: + token: ${{ secrets.PAT || secrets.GITHUB_TOKEN }} + commit-message: "[MegaLinter] Apply linters automatic fixes" + title: "[MegaLinter] Apply linters automatic fixes" + labels: bot + - name: Create PR output + if: steps.ml.outputs.has_updated_sources == 1 && (env.APPLY_FIXES_EVENT == 'all' || env.APPLY_FIXES_EVENT == github.event_name) && env.APPLY_FIXES_MODE == 'pull_request' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && !contains(github.event.head_commit.message, 'skip fix') + run: | + echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}" + echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}" + + # Push new commit if applicable (for now works only on PR from same repository, not from forks) + - name: Prepare commit + if: steps.ml.outputs.has_updated_sources == 1 && (env.APPLY_FIXES_EVENT == 'all' || env.APPLY_FIXES_EVENT == github.event_name) && env.APPLY_FIXES_MODE == 'commit' && github.ref != 'refs/heads/main' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && !contains(github.event.head_commit.message, 'skip fix') + run: sudo chown -Rc $UID .git/ + - name: Commit and push applied linter fixes + if: steps.ml.outputs.has_updated_sources == 
1 && (env.APPLY_FIXES_EVENT == 'all' || env.APPLY_FIXES_EVENT == github.event_name) && env.APPLY_FIXES_MODE == 'commit' && github.ref != 'refs/heads/main' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && !contains(github.event.head_commit.message, 'skip fix') + uses: stefanzweifel/git-auto-commit-action@v4 + with: + branch: ${{ github.event.pull_request.head.ref || github.head_ref || github.ref }} + commit_message: "[MegaLinter] Apply linters fixes" + commit_user_name: megalinter-bot + commit_user_email: nicolas.vuillamy@ox.security diff --git a/.github/workflows/release-lib.yaml b/.github/workflows/release-lib.yaml index afb9b972f..77827fdde 100644 --- a/.github/workflows/release-lib.yaml +++ b/.github/workflows/release-lib.yaml @@ -28,6 +28,8 @@ env: LIBR_POLARS_FEATURES: "full_features" LIBR_POLARS_PROFILE: release-optimized +permissions: read-all + jobs: build: runs-on: ${{ matrix.os }} @@ -208,11 +210,11 @@ jobs: - name: create checksums working-directory: libs run: | - sha256sum * >"../sha256sums.txt" - md5sum * >"../md5sums.txt" + sha256sum -- * >"../sha256sums.txt" + md5sum -- * >"../md5sums.txt" - name: create release - uses: softprops/action-gh-release@v1 + uses: softprops/action-gh-release@v2 with: prerelease: true files: | diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 3be43d89d..306adefaf 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -28,6 +28,8 @@ env: LIBR_POLARS_PROFILE: release-optimized NOT_CRAN: "true" +permissions: read-all + jobs: build: runs-on: ${{ matrix.config.os }} @@ -52,6 +54,7 @@ jobs: - uses: ./.github/actions/setup with: rust-nightly: true + target: '' token: "${{ secrets.GITHUB_TOKEN }}" - uses: r-lib/actions/setup-pandoc@v2 @@ -68,7 +71,6 @@ jobs: needs: dev - name: Build lib - if: matrix.config.target == '' run: | if [ "${{ runner.os }}" == "Windows" ]; then TARGET=x86_64-pc-windows-gnu @@ -76,10 +78,9 @@ jobs: TARGET="$(rustc -vV | grep host | cut -d' ' -f2)" fi Rscript -e 'pkgbuild::compile_dll()' - echo "LIBR_POLARS_PATH=$(pwd)/src/rust/target/${TARGET}/${{ env.LIBR_POLARS_PROFILE }}/libr_polars.a" >>$GITHUB_ENV + echo "LIBR_POLARS_PATH=$(pwd)/src/rust/target/${TARGET}/${{ env.LIBR_POLARS_PROFILE }}/libr_polars.a" >>"$GITHUB_ENV" - name: make binary R package + test on host arch - if: matrix.config.target == '' run: | devtools::install(quick = TRUE) devtools::test(stop_on_failure = TRUE) @@ -105,7 +106,7 @@ jobs: polars.zip - name: Upload produced R packages (source or binary) to release - uses: softprops/action-gh-release@v1 + uses: softprops/action-gh-release@v2 if: startsWith(github.ref, 'refs/tags/') && github.event_name != 'pull_request' with: files: | diff --git a/.lycheeignore b/.lycheeignore new file mode 100644 index 000000000..046616f76 --- /dev/null +++ b/.lycheeignore @@ -0,0 +1,5 @@ +https://megalinter.io/configuration/ +https://r-lib.github.io/p/pak/stable/%s/%s/%s +https://megalinter.io/flavors/ +https://rpolars.github.io/vignettes +https://rpolars.github.io/man diff --git a/.mega-linter.yml b/.mega-linter.yml new file mode 100644 index 000000000..90f392478 --- /dev/null +++ b/.mega-linter.yml @@ -0,0 +1,11 @@ +APPLY_FIXES: all +DISABLE: + - RUST + - R +DISABLE_LINTERS: + - C_CPPLINT + - CSS_STYLELINT + - REPOSITORY_KICS + - SPELL_CSPELL +DISABLE_ERRORS_LINTERS: + - COPYPASTE_JSCPD diff --git a/.yamllint.yml b/.yamllint.yml new file mode 100644 index 000000000..68fd5c00b --- /dev/null +++ b/.yamllint.yml @@ 
-0,0 +1,13 @@ +extends: default + +rules: + braces: + max-spaces-inside: 1 + comments-indentation: disable + document-start: disable + empty-lines: + level: warning + line-length: + max: 120 + level: warning + truthy: disable diff --git a/DESCRIPTION b/DESCRIPTION index 08eae06d1..43b8d05b2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: polars Title: Lightning-Fast 'DataFrame' Library -Version: 0.14.1.9000 +Version: 0.15.1.9000 Depends: R (>= 4.2) Imports: utils, codetools, methods Authors@R: @@ -109,12 +109,13 @@ Collate: 'polars_options.R' 'rbackground.R' 'rust_result.R' - 's3_methods.R' + 's3-methods.R' + 's3-methods-operator.R' 'series__series.R' 'sql.R' 'vctrs.R' 'zzz.R' Config/rextendr/version: 0.3.1 VignetteBuilder: knitr -Config/polars/LibVersion: 0.38.0 +Config/polars/LibVersion: 0.38.1 Config/polars/RustToolchainVersion: nightly-2024-02-23 diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index cad227d0e..62afb8faa 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -91,10 +91,10 @@ When updating the Rust Polars crate that the R package depends on, the following steps are required: 1. Since the version of the Polars crate is specified by the Git revision, - update the `rev` of all `polars-*` crates in the `src/rust/Cargo.toml` file. + update the `rev` of all `polars-*` crates in the `src/rust/Cargo.toml` file. 2. Update the `Config/polars/RustToolchainVersion` field in the `DESCRIPTION` - file to the version of the Rust toolchain specified in the `toolchain.channel` - field of the `rust-toolchain.toml` file in the Polars crate Git repository. + file to the version of the Rust toolchain specified in the `toolchain.channel` + field of the `rust-toolchain.toml` file in the Polars crate Git repository. 3. Update the toolchain to the version specified in the `DESCRIPTION` file. 4. Repeat the build, test, and bug fixes of the R package. @@ -127,7 +127,7 @@ Rscript dev/generate-lib-sums.R The R package releases are done on GitHub pull requests. 1. Create a local branch for the release, push it to the remote repository (main - repository), then open a pull request to the `main` branch. + repository), then open a pull request to the `main` branch. 2. Bump the R package version with the `usethis` package. ```r @@ -137,9 +137,9 @@ usethis::use_version() 3. Check the CI status of the pull request. 4. Push a tag named starting with `v` (e.g. `v0.10.0`). It triggers the GitHub - action to build the website and create a GitHub release. + action to build the website and create a GitHub release. 5. Bump the R package version to "dev version" with the `usethis` package - before merging the pull request. + before merging the pull request. 
```r usethis::use_dev_version() diff --git a/NAMESPACE b/NAMESPACE index 49c04156d..e82f275c5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -45,9 +45,11 @@ S3method("$<-",RPolarsDataFrame) S3method("$<-",RPolarsRField) S3method("%%",RPolarsChainedThen) S3method("%%",RPolarsExpr) +S3method("%%",RPolarsSeries) S3method("%%",RPolarsThen) S3method("%/%",RPolarsChainedThen) S3method("%/%",RPolarsExpr) +S3method("%/%",RPolarsSeries) S3method("%/%",RPolarsThen) S3method("&",RPolarsChainedThen) S3method("&",RPolarsExpr) @@ -117,6 +119,7 @@ S3method("[[",RPolarsVecDataFrame) S3method("[[",RPolarsWhen) S3method("^",RPolarsChainedThen) S3method("^",RPolarsExpr) +S3method("^",RPolarsSeries) S3method("^",RPolarsThen) S3method("|",RPolarsChainedThen) S3method("|",RPolarsExpr) @@ -158,6 +161,7 @@ S3method(as_polars_df,RPolarsRollingGroupBy) S3method(as_polars_df,RPolarsSeries) S3method(as_polars_df,data.frame) S3method(as_polars_df,default) +S3method(as_polars_df,nanoarrow_array) S3method(as_polars_df,nanoarrow_array_stream) S3method(as_polars_lf,RPolarsLazyFrame) S3method(as_polars_lf,RPolarsLazyGroupBy) diff --git a/NEWS.0.md b/NEWS.0.md index 930038df9..461420473 100644 --- a/NEWS.0.md +++ b/NEWS.0.md @@ -1,9 +1,189 @@ -# Old (pre-v0.2.0) changelog +# Old NEWS (pre-v0.5.0, the package name was `rpolars`) - - update 24th November: minipolars is getting bigger and is changing name to **rpolars** and is hosted on [github.com/rpolars/rpolars](https://github.com/rpolars/rpolars/). Translation, testing and documenting progress is unfortunately not fast enough to finish in 2022. Goal postponed to March 2023. rlang is dropped as install dependency. No dependencies should make it very easy to install and manage versions long term. +## rpolars v0.4.7 - - update 10th November 2022: Full support for Windows, see installation section. After digging through gnu ld linker documentation and R source code idiosyncrasies, rpolars, can now be build for windows (nighly-gnu). In the end adding this super simple [linker export definition file](https://github.com/sorhawell/rpolars/blob/main/src/rpolars-win.def) prevented the linker from trying to export all +160_000 internal variables into a 16bit symbol table maxing out at 65000 variables. Many thanks for 24-hour support from extendr-team. +### What's changed - - update 4th November 2022: [Latest documentation shows half (125) of all expression functions are now ported](https://sorhawell.github.io/reference/index.html#expr). Automatic binary release for Mac and Linux. Windows still pending. It is now very easy to install rpolars from binary. See install section. +- Revamped docs that includes a new introductory vignette (#81 @grantmcdermott) +- Misc documentation improvements - - update: 5th October 2022 Currently ~20% of features have been translated. To make polars call R multi-threaded was a really hard nut to crack as R has no Global-interpreter-lock feature. My solution is to have a main thread in charge of R calls, and any abitrary polars child threads can request to have R user functions executed. Implemented with flume mpsc channels. No serious obstacles left known to me. Just a a lot of writing. Preliminary performance benchmarking promise rpolars is going to perform just as fast pypolars. +## rpolars v0.4.6 + +Release date: 2023-03-13. Full changelog: [v0.4.5...v0.4.6](https://github.com/pola-rs/r-polars/compare/v0.4.5...v0.4.6) + +### What's new + +- Almost all Expr translated, only missing 'binary'-expr now. 
#52 #53 +- Run polars queries in detached background threads, no need for any parallel libraries or cluster config #56 #59 +- Full support for when-then-otherwise-syntax #65 +- **rpolars** now uses bit64 integer64 vectors as input/output for i64 vectors: #68 #69 +- use `pl$from_arrow` to zero-copy(almost) import `Table`/`Array` from **r-arrow**. #67 +- Support inter process connections with `scan_ipc` +- Implement `scan_ipc` by @Sicheng-Pan in #63 +- 'Backend' improvements + - (prepare support for aarch64-linux) Touch libgcc_eh.a by @yutannihilation in #49 + - Use py-polars rust file structure (to help devs) by @sorhawell in #55 + - Refactor Makefiles by @eitsupi in #58 + - Build **rpolars** from Nix by @Sicheng-Pan in #54 + - `extendr_api` 0.4 by @sorhawell in #6 + - Add r-universe URL by @jeroen in #71 + - chore: install **nanoarrow** from cran by @eitsupi in #72 + - chore: install **nanoarrow** from cran (#72) by @sorhawell in #73 + - Fix pdf latex errors by @sorhawell in #74 + - re-enable devel test, **pak** R-devel issue went away by @sorhawell in #75 + - DO NOT MERGE: tracking hello_r_universe branch by @eitsupi in #38 + - revert to nightly by @sorhawell in #78 + +### New Contributors + +- @Sicheng-Pan made their first contribution in #54 +- @jeroen made their first contribution in #71 + +## rpolars v0.4.5 + +Release date: 2023-02-21. Full Changelog: [v0.4.3...v0.4.5](https://github.com/pola-rs/r-polars/compare/v0.4.3...v0.4.5) + +### What's Changed + +- bump rust polars to latest rust-polars and fix all errors by @sorhawell in #42 +- Customize **extendr** to better support cross Rust-R/R-Rust error handling + - bump extendr_api by @sorhawell in #44 + - Str even more by @sorhawell in #47 +- **rpolars** is now available for install from [rpolars.r-universe.dev](https://rpolars.r-universe.dev/polars#install) @eitsupi + - advertise R-universe by @sorhawell in #39 + - Includes reasonably easy pre-compiled installation for arm64-MacBooks +- All string Expressions available + - Expr str strptime by @sorhawell in #40 + - rust_result tests + fixes by @sorhawell in #41 + - Str continued by @sorhawell in #43 + - Str even more by @sorhawell in #47 +- Starting to roll out new error-handling and type-conversions between R and rust. + + - Precise source of error should be very clear even in a long method-chain e.g. + + ```r + pl$lit("hey-you-there")$str$splitn("-",-3)$alias("struct_of_words")$to_r() + > Error: in str$splitn the arg [n] the value -3 cannot be less than zero + when calling : + pl$lit("hey-you-there")$str$splitn("-", -3) + ``` + +- Misc + - Clippy + tiny optimization by @sorhawell in #45 + - Tidying by @sorhawell in #37 + +## rpolars v0.4.3 + +Release date: 2023-02-01. Full Changelog: [v0.4.2...v0.4.3](https://github.com/pola-rs/r-polars/compare/v0.4.2...v0.4.3) + +### What's Changed + +- All DateTime expresssions implemented + update rust-polars to latest commit. + - Arr str by @sorhawell in #32 + - Datetime continued by @sorhawell in #33 + - Datatime remaining tests + tidy util functions by @sorhawell in #36 + +### Developer changes + +- Refactoring GitHub Actions workflows by @eitsupi in #24 +- Fix cache and check scan by @sorhawell in #30 + +## rpolars v0.4.2 + +Release date: 2023-01-17. 
Full Changelog: [V0.4.1...v0.4.2](https://github.com/pola-rs/r-polars/compare/V0.4.1...v0.4.2) + +### What's Changed + +- fix minor Series syntax issue #8 @sorhawell in #22 +- nanoarrow followup: docs + adjust test by @sorhawell in #21 +- Add R CMD check workflow by @eitsupi in #23 +- `usethis::use_mit_license()` by @yutannihilation in #27 +- Fix check errors by @sorhawell in #26 + +### New Contributors + +- @eitsupi made their first contribution in #23 +- @yutannihilation made their first contribution in #27 + +## rpolars v0.4.1 + +Release date: 2023-01-12. Full Changelog: [v0.4.0...V0.4.1](https://github.com/pola-rs/r-polars/compare/v0.4.0...V0.4.1) + +### What's Changed + +- Export ArrowArrayStream from polars data frame by @paleolimbot in #5 +- Minor arithmetics syntax improvement @sorhawell in #20 + +### Dev env + +- Renv is deactivated as default. Renv.lock still defines package stack on build server @sorhawell in #19 + +### Minor stuff + +- Improve docs by @sorhawell in #16 +- Update rust polars to +26.1 by @sorhawell in #18 + +### New Contributors + +- @paleolimbot made their first contribution in #5 + +## rpolars v0.4.0 + +Release date: 2023-01-11. Full Changelog: [v0.3.1...v0.4.0](https://github.com/pola-rs/r-polars/compare/V0.3.1...v0.4.0) + +### Breaking changes + +- Class label "DataType" is now called "RPolarsDataType". Syntax wise 'DataType' can still be used, e.g. `.pr$DataType$` +- try fix name space collision with arrow by @sorhawell in #15 + +### New features + +- all list Expr$arr$list functions have been translated: +- Expr list 2.0 by @sorhawell in #10 +- Expr list 3.0 by @sorhawell in #12 + +### Dev environment + +- update rextendr by @sorhawell in #13 + +## rpolars v0.3.1 + +Release date: 2023-01-07. Full Changelog: [v0.3.0...v0.3.1](https://github.com/pola-rs/r-polars/compare/v0.3.0...V0.3.1) + +### What's Changed + +- drop github action upload pre-release of PR's by @sorhawell in #7 +- Fix readme typo by @erjanmx in #6 +- Expr arr list functions + rework r_to_series by @sorhawell in #2 + +### New Contributors + +- @erjanmx made their first contribution in #6 + +## rpolars v0.3.0 + +Release date: 2022-12-31. Full Changelog: [v0.2.1...v0.3.0](https://github.com/pola-rs/r-polars/compare/v0.2.1...v0.3.0) + +### What's Changed + +- use jemalloc(linux) else mimallac as py-polars by @sorhawell in #1 +- Bump rust polars 26.1 by @sorhawell in #3 +- Expr_interpolate now has two methods, linear, nearest +- Expr_quantile also takes quantile value as an expression +- map_alias improved error handling + +## rpolars v0.2.1 + +Release date: 2022-12-27 + +- **rpolars** is now hosted at . Happy to be here. + +## pre-v0.2.0 + +- update 24th November: minipolars is getting bigger and is changing name to **rpolars** and is hosted on [github.com/rpolars/rpolars](https://github.com/rpolars/rpolars/). Translation, testing and documenting progress is unfortunately not fast enough to finish in 2022. Goal postponed to March 2023. rlang is dropped as install dependency. No dependencies should make it very easy to install and manage versions long term. + +- update 10th November 2022: Full support for Windows, see installation section. After digging through gnu ld linker documentation and R source code idiosyncrasies, rpolars, can now be build for windows (nighly-gnu). 
In the end adding this super simple [linker export definition file](https://github.com/rpolars/rpolars/blob/main/src/minipolars-win.def) prevented the linker from trying to export all +160_000 internal variables into a 16bit symbol table maxing out at 65000 variables. Many thanks for 24-hour support from extendr-team. + +- update 4th November 2022: Latest documentation shows half (125) of all expression functions are now ported. Automatic binary release for Mac and Linux. Windows still pending. It is now very easy to install rpolars from binary. See install section. + +- update: 5th October 2022 Currently ~20% of features have been translated. To make polars call R multi-threaded was a really hard nut to crack as R has no Global-interpreter-lock feature. My solution is to have a main thread in charge of R calls, and any abitrary polars child threads can request to have R user functions executed. Implemented with flume mpsc channels. No serious obstacles left known to me. Just a a lot of writing. Preliminary performance benchmarking promise rpolars is going to perform just as fast pypolars. diff --git a/NEWS.md b/NEWS.md index 227efa524..1cccef9d9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,262 +2,366 @@ ## Polars R Package (development version) +## Polars R Package 0.15.1 + +### New features + +- rust-polars is updated to 0.38.2 (#907). + - Minimum supported Rust version (MSRV) is now 1.76.0. +- `as_polars_df()` is added (#893). +- It is now possible to create an empty `DataFrame` with a specific schema + with `pl$DataFrame(schema = my_schema)` (#901). +- New arguments `dtype` and `nan_to_null` for `pl$Series()` (#902). +- New method `$partition_by()` (#898). + +### Bug fixes + +- The default value of the `format` of `$str$strptime()` is now correctly set (#892). + +### Other improvements + +- Performance of `as_polars_df()` is improved (#896). + +## Polars R Package 0.15.0 + ### Breaking changes due to Rust-polars update -- rust-polars is updated to 0.38.1 (#865, #872). - - in `$pivot()`, arguments `aggregate_function`, `maintain_order`, - `sort_columns` and `separator` must be named. Values that are passed - by position are ignored. - - in `$describe()`, the name of the first column changed from `"describe"` - to `"statistic"`. +- rust-polars is updated to 0.38.1 (#865, #872). + - in `$pivot()`, arguments `aggregate_function`, `maintain_order`, + `sort_columns` and `separator` must be named. Values that are passed + by position are ignored. + - in `$describe()`, the name of the first column changed from `"describe"` + to `"statistic"`. + - `$mod()` methods and `%%` works correctly to guarantee + `x == (x %% y) + y * (x %/% y)`. ### Other breaking changes -- Removed `as.list()` for class `RPolarsExpr` as it is a simple wrapper around - `list()` (#843). -- Several functions have been rewritten to match the behavior of Python Polars. - - `pl$col(...)` requires at least one argument. (#852) - - `pl$head()`, `pl$tail()`, `pl$count()`, `pl$first()`, `pl$last()`, `pl$max()`, - `pl$min()`, `pl$mean()`, `pl$media()`, `pl$std()`, `pl$sum()`, `pl$var()`, - `pl$n_unique()`, and `pl$approx_n_unique()` are syntactic sugar for - `pl$col(...)$`. The argument `...` now only accepts characters, - that are either column names or regular expressions (#852). - - There is no argument for `pl$len()`. If you want to measure the length of - specific columns, you should use `pl$count(...)` (#852). - - `$str$concat()` method's `delimiter` argument's default value is - changed from `"-"` to `""` (#853). 
- - `$str$concat()` method's `ignore_nulls` argument must be a - named argument (#853). -- `pl$Categorical()` has been improved to allow specifying the `ordering` type - (either lexical or physical). This also means that calling `pl$Categorical` - doesn't create a `DataType` anymore. All calls to `pl$Categorical` must be - replaced by `pl$Categorical()` (#860). +- Removed `as.list()` for class `RPolarsExpr` as it is a simple wrapper around + `list()` (#843). +- Several functions have been rewritten to match the behavior of Python Polars. + - `pl$col(...)` requires at least one argument. (#852) + - `pl$head()`, `pl$tail()`, `pl$count()`, `pl$first()`, `pl$last()`, `pl$max()`, + `pl$min()`, `pl$mean()`, `pl$media()`, `pl$std()`, `pl$sum()`, `pl$var()`, + `pl$n_unique()`, and `pl$approx_n_unique()` are syntactic sugar for + `pl$col(...)$`. The argument `...` now only accepts characters, + that are either column names or regular expressions (#852). + - There is no argument for `pl$len()`. If you want to measure the length of + specific columns, you should use `pl$count(...)` (#852). + - `$str$concat()` method's `delimiter` argument's default value is + changed from `"-"` to `""` (#853). + - `$str$concat()` method's `ignore_nulls` argument must be a + named argument (#853). + - `pl$Datetime()`'s arguments are renamed: `tu` to `time_unit`, + and `tz` to `time_zone` (#887). +- `pl$Categorical()` has been improved to allow specifying the `ordering` type + (either lexical or physical). This also means that calling `pl$Categorical` + doesn't create a `DataType` anymore. All calls to `pl$Categorical` must be + replaced by `pl$Categorical()` (#860). +- `$rem()` is removed. Use `$mod()` instead (#886). +- The conversion strategy between the POSIXct type without time zone attribute + and Polars datetime has been changed (#878). + `POSIXct` class vectors without a time zone attribute have UTC time internally + and is displayed based on the system's time zone. Previous versions of `polars` + only considered the internal value and interpreted it as UTC time, so the + time displayed as `POSIXct` and in Polars was different. + + ```r + # polars 0.14.1 + Sys.setenv(TZ = "Europe/Paris") + datetime = as.POSIXct("1900-01-01") + datetime + #> [1] "1900-01-01 PMT" + + s = polars::as_polars_series(datetime) + s + #> polars Series: shape: (1,) + #> Series: '' [datetime[ms]] + #> [ + #> 1899-12-31 23:50:39 + #> ] + + as.vector(s) + #> [1] "1900-01-01 PMT" + ``` + + Now the internal value is updated to match the displayed value. + + ```r + # polars 0.15.0 + Sys.setenv(TZ = "Europe/Paris") + datetime = as.POSIXct("1900-01-01") + datetime + #> [1] "1900-01-01 PMT" + + s = polars::as_polars_series(datetime) + s + #> polars Series: shape: (1,) + #> Series: '' [datetime[ms]] + #> [ + #> 1900-01-01 00:00:00 + #> ] + + as.vector(s) + #> [1] "1900-01-01 PMT" + ``` + + This update may cause errors when converting from Polars to `POSIXct` for non-existent + or ambiguous times. It is recommended to explicitly add a time zone before converting + from Polars to R. + + ```r + Sys.setenv(TZ = "America/New_York") + ambiguous_time = as.POSIXct("2020-11-01 01:00:00") + ambiguous_time + #> [1] "2020-11-01 01:00:00 EDT" + + pls = polars::as_polars_series(ambiguous_time) + pls + #> polars Series: shape: (1,) + #> Series: '' [datetime[ms]] + #> [ + #> 2020-11-01 01:00:00 + #> ] + + ## This will be error! 
+ # pls |> as.vector() + + pls$dt$replace_time_zone("UTC") |> as.vector() + #> [1] "2020-11-01 01:00:00 UTC" + ``` + +- Removed argument `eager` in `pl$date_range()` and `pl$struct()` for more + consistency of output. It is possible to replace `eager = TRUE` by calling + `$to_series()` (#882). ### New features -- In the when-then-otherwise expressions, the last `$otherwise()` is now optional, - as in Python Polars. If `$otherwise()` is not specified, rows that don't respect - the condition set in `$when()` will be filled with `null` (#836). -- `$head()` and `$tail()` methods now support negative - row numbers (#840). -- `$group_by()` now works with named expressions (#846). -- New methods for the `arr` subnamespace: `$median()`, `$var()`, `$std()`, - `$shift()`, `$to_struct()` (#867). -- `$min()` and `max()` now work on categorical variables (#868). -- New methods for the `list` subnamespace: `$n_unique()`, `$gather_every()` - (#869). -- Converts `clock_time_point` and `clock_zoned_time` objects from - the `{clock}` package to Polars datetime type (#861). -- New methods for the `name` subnamespace: `$prefix_fields()` and - `suffix_fields()` (#873). +- In the when-then-otherwise expressions, the last `$otherwise()` is now optional, + as in Python Polars. If `$otherwise()` is not specified, rows that don't respect + the condition set in `$when()` will be filled with `null` (#836). +- `$head()` and `$tail()` methods now support negative + row numbers (#840). +- `$group_by()` now works with named expressions (#846). +- New methods for the `arr` subnamespace: `$median()`, `$var()`, `$std()`, + `$shift()`, `$to_struct()` (#867). +- `$min()` and `max()` now work on categorical variables (#868). +- New methods for the `list` subnamespace: `$n_unique()`, `$gather_every()` + (#869). +- Converts `clock_time_point` and `clock_zoned_time` objects from + the `{clock}` package to Polars datetime type (#861). +- New methods for the `name` subnamespace: `$prefix_fields()` and + `suffix_fields()` (#873). +- `pl$Datetime()`'s `time_zone` argument now accepts `"*"` to match + any time zone (#887). ### Bug fixes -- R no longer crashes when calling an invalid Polars object that points - to a null pointer (#874). This was occurring, such as when a Polars object - was saved in an RDS file and loaded from another session. +- R no longer crashes when calling an invalid Polars object that points + to a null pointer (#874). This was occurring, such as when a Polars object + was saved in an RDS file and loaded from another session. ## Polars R Package 0.14.1 ### Breaking changes -- Since most of the methods of `Expr` are now available for `Series`, the - experimental `$expr` subnamespace is removed (#831). - Use `$` instead of `$expr$`. +- Since most of the methods of `Expr` are now available for `Series`, the + experimental `$expr` subnamespace is removed (#831). + Use `$` instead of `$expr$`. ### New features -- New active bindings `$flags` for `DataFrame` to show the flags used internally - for each column. The output of `$flags` for `Series` was also improved and now - contains `FAST_EXPLODE` for `Series` of type `list` and `array` (#809). -- Most of `Expr` methods are also available for `Series` (#819, #828, #831). -- `as_polars_df()` for `data.frame` is more memory-efficient and new arguments - `schema` and `schema_overrides` are added (#817). -- Use `polars_code_completion_activate()` to enable code suggestions and - autocompletion after `$` on polars objects. 
This is an experimental feature - that is disabled by default. For now, it is only supported in the native R - terminal and in RStudio (#597). +- New active bindings `$flags` for `DataFrame` to show the flags used internally + for each column. The output of `$flags` for `Series` was also improved and now + contains `FAST_EXPLODE` for `Series` of type `list` and `array` (#809). +- Most of `Expr` methods are also available for `Series` (#819, #828, #831). +- `as_polars_df()` for `data.frame` is more memory-efficient and new arguments + `schema` and `schema_overrides` are added (#817). +- Use `polars_code_completion_activate()` to enable code suggestions and + autocompletion after `$` on polars objects. This is an experimental feature + that is disabled by default. For now, it is only supported in the native R + terminal and in RStudio (#597). ### Bug fixes -- `$list` sub namespace methods returns `Series` class object correctly (#819). +- `$list` sub namespace methods returns `Series` class object correctly (#819). ## Polars R Package 0.14.0 ### Breaking changes due to Rust-polars update -- rust-polars is updated to 0.37.0 (#776). - - Minimum supported Rust version (MSRV) is now 1.74.1. - - `$with_row_count()` for `DataFrame` and `LazyFrame` is deprecated and - will be removed in 0.15.0. It is replaced by `$with_row_index()`. - - `pl$count()` is deprecated and will be removed in 0.15.0. It is replaced - by `pl$len()`. - - `$explode()` for `DataFrame` and `LazyFrame` doesn't work anymore on - string columns. - - `$list$join()` and `pl$concat_str()` gain an argument `ignore_nulls`. - The current behavior is to return a `null` if the row contains any `null`. - Setting `ignore_nulls = TRUE` changes that. - - All `row_count_*` args in reading/scanning functions are renamed - `row_index_*`. - - `$sort()` for `Series` gains an argument `nulls_last`. - - `$str$extract()` and `$str$zfill()` now accept an `Expr` and parse - strings as column names. Use `pl$lit()` to recover the old behavior. - - `$cum_count()` now starts from 1 instead of 0. +- rust-polars is updated to 0.37.0 (#776). + - Minimum supported Rust version (MSRV) is now 1.74.1. + - `$with_row_count()` for `DataFrame` and `LazyFrame` is deprecated and + will be removed in 0.15.0. It is replaced by `$with_row_index()`. + - `pl$count()` is deprecated and will be removed in 0.15.0. It is replaced + by `pl$len()`. + - `$explode()` for `DataFrame` and `LazyFrame` doesn't work anymore on + string columns. + - `$list$join()` and `pl$concat_str()` gain an argument `ignore_nulls`. + The current behavior is to return a `null` if the row contains any `null`. + Setting `ignore_nulls = TRUE` changes that. + - All `row_count_*` args in reading/scanning functions are renamed + `row_index_*`. + - `$sort()` for `Series` gains an argument `nulls_last`. + - `$str$extract()` and `$str$zfill()` now accept an `Expr` and parse + strings as column names. Use `pl$lit()` to recover the old behavior. + - `$cum_count()` now starts from 1 instead of 0. ### Other breaking changes -- The `simd` feature of the Rust library is removed in favor of - the new `nightly` feature (#800). - If you specified `simd` via the `LIBR_POLARS_FEATURES` environment variable - during source installations, please use `nightly` instead; - there is no change if you specified `full_features` because - it now contains `nightly` instead of `simd`. 
-- The following functions were deprecated in 0.13.0 and are now removed (#783): - - `$list$lengths()` -> `$list$len()` - - `pl$from_arrow()` -> `as_polars_df()` or `as_polars_series()` - - `pl$set_options()` and `pl$reset_options()` -> `polars_options()` -- `$is_between()` had several changes (#788): - - arguments `start` and `end` are renamed `lower_bound` and `upper_bound`. - Their behaviour doesn't change. - - `include_bounds` is renamed `closed` and must be one of `"left"`, - `"right"`, `"both"`, or `"none"`. -- `polars_info()` returns a slightly changed list. - - `$threadpool_size`, which means the number of threads used by Polars, - is changed to `$thread_pool_size` (#784) - - `$version`, which indicates the version of this package, - is changed to `$versions$r_package` (#791). - - `$rust_polars`, which indicates the version of the dependent Rust Polars, - is changed to `$versions$rust_crate` (#791). -- New behavior when creating a `DataFrame` with a single list-variable. - `pl$DataFrame(x = list(1:2, 3:4))` used to create a `DataFrame` with two - columns named "new_column" and "new_column_1", which was unexpected. It now - produces a `DataFrame` with a single `list` variable. This also applies to - list-column created in `$with_columns()` and `$select()` (#794). +- The `simd` feature of the Rust library is removed in favor of + the new `nightly` feature (#800). + If you specified `simd` via the `LIBR_POLARS_FEATURES` environment variable + during source installations, please use `nightly` instead; + there is no change if you specified `full_features` because + it now contains `nightly` instead of `simd`. +- The following functions were deprecated in 0.13.0 and are now removed (#783): + - `$list$lengths()` -> `$list$len()` + - `pl$from_arrow()` -> `as_polars_df()` or `as_polars_series()` + - `pl$set_options()` and `pl$reset_options()` -> `polars_options()` +- `$is_between()` had several changes (#788): + - arguments `start` and `end` are renamed `lower_bound` and `upper_bound`. + Their behaviour doesn't change. + - `include_bounds` is renamed `closed` and must be one of `"left"`, + `"right"`, `"both"`, or `"none"`. +- `polars_info()` returns a slightly changed list. + - `$threadpool_size`, which means the number of threads used by Polars, + is changed to `$thread_pool_size` (#784) + - `$version`, which indicates the version of this package, + is changed to `$versions$r_package` (#791). + - `$rust_polars`, which indicates the version of the dependent Rust Polars, + is changed to `$versions$rust_crate` (#791). +- New behavior when creating a `DataFrame` with a single list-variable. + `pl$DataFrame(x = list(1:2, 3:4))` used to create a `DataFrame` with two + columns named "new_column" and "new_column_1", which was unexpected. It now + produces a `DataFrame` with a single `list` variable. This also applies to + list-column created in `$with_columns()` and `$select()` (#794). ### Deprecations -- `pl$threadpool_size()` is deprecated and will be removed in 0.15.0. Use - `pl$thread_pool_size()` instead (#784). +- `pl$threadpool_size()` is deprecated and will be removed in 0.15.0. Use + `pl$thread_pool_size()` instead (#784). ### New features -- Implementation of the subnamespace `$arr` for expressions on `array`-type - columns. An `array` column is similar to a `list` column, but is stricter as - each sub-array must have the same number of elements (#790). +- Implementation of the subnamespace `$arr` for expressions on `array`-type + columns. 
An `array` column is similar to a `list` column, but is stricter as + each sub-array must have the same number of elements (#790). ### Other improvements -- The `sql` feature is included in the default feature (#800). - This means that functionality related to the `RPolarsSQLContext` class - is now always included in the binary package. +- The `sql` feature is included in the default feature (#800). + This means that functionality related to the `RPolarsSQLContext` class + is now always included in the binary package. ## Polars R Package 0.13.1 ### New features -- New method `$write_parquet()` for DataFrame (#758). -- S3 methods of `as.data.frame()` for `RPolarsDataFrame` and `RPolarsLazyFrame` - accepts more arguments of `as_polars_df()` and `$to_data_frame()` (#762). -- S3 methods of `arrow::as_arrow_table()` and `arrow::as_record_batch_reader()` for - `RPolarsDataFrame` no longer need the `{nanoarrow}` package (#754). -- Some S3 methods for the `{nanoarrow}` package are added (#730). - - `as_polars_df()` - - `as_polars_series()` - - `as_polars_series()` +- New method `$write_parquet()` for DataFrame (#758). +- S3 methods of `as.data.frame()` for `RPolarsDataFrame` and `RPolarsLazyFrame` + accepts more arguments of `as_polars_df()` and `$to_data_frame()` (#762). +- S3 methods of `arrow::as_arrow_table()` and `arrow::as_record_batch_reader()` for + `RPolarsDataFrame` no longer need the `{nanoarrow}` package (#754). +- Some S3 methods for the `{nanoarrow}` package are added (#730). + - `as_polars_df()` + - `as_polars_series()` + - `as_polars_series()` ### Bug fixes -- `$sort()` no longer panicks when `descending = NULL` (#748). +- `$sort()` no longer panicks when `descending = NULL` (#748). ### Other enhancements -- `downlit::autolink()` now recognize the reference pages of this package (#739). +- `downlit::autolink()` now recognize the reference pages of this package (#739). ## Polars R Package 0.13.0 ### Breaking changes -- `$where()` is removed. Use `$filter()` instead (#718). -- Deprecated functions from 0.12.x are removed (#714). - - `$apply()` and `$map()`, use `$map_elements()` and - `$map_batches()` instead. - - `pl$polars_info()`, use `polars_info()` instead. -- The environment variables used when building the library have been changed - (#693). This only affects selecting the feature flag and selecting profiles - during source installation. - - `RPOLARS_PROFILE` is renamed to `LIBR_POLARS_PROFILE` - - `RPOLARS_FULL_FEATURES` is removed and `LIBR_POLARS_FEATURES` is added. - To select the `full_features`, set `LIBR_POLARS_FEATURES="full_features"`. - - `RPOLARS_RUST_SOURCE`, which was used for development, has been removed. - If you want to use library binaries located elsewhere, use `LIBR_POLARS_PATH` - instead. -- Remove the `eager` argument of `$execute()`. - Use the `$collect()` method after `$execute()` or `as_polars_df` to get the - result as a `DataFrame`. (#719) -- The argument `name_generator` of `$list$to_struct()` is renamed `fields` - (#724). -- The S3 method `[` for the `$list` subnamespace is removed (#724). -- The option `polars.df_print` has been renamed `polars.df_knitr_print` (#726). +- `$where()` is removed. Use `$filter()` instead (#718). +- Deprecated functions from 0.12.x are removed (#714). + - `$apply()` and `$map()`, use `$map_elements()` and + `$map_batches()` instead. + - `pl$polars_info()`, use `polars_info()` instead. +- The environment variables used when building the library have been changed + (#693). 
This only affects selecting the feature flag and selecting profiles + during source installation. + - `RPOLARS_PROFILE` is renamed to `LIBR_POLARS_PROFILE` + - `RPOLARS_FULL_FEATURES` is removed and `LIBR_POLARS_FEATURES` is added. + To select the `full_features`, set `LIBR_POLARS_FEATURES="full_features"`. + - `RPOLARS_RUST_SOURCE`, which was used for development, has been removed. + If you want to use library binaries located elsewhere, use `LIBR_POLARS_PATH` + instead. +- Remove the `eager` argument of `$execute()`. + Use the `$collect()` method after `$execute()` or `as_polars_df` to get the + result as a `DataFrame`. (#719) +- The argument `name_generator` of `$list$to_struct()` is renamed `fields` + (#724). +- The S3 method `[` for the `$list` subnamespace is removed (#724). +- The option `polars.df_print` has been renamed `polars.df_knitr_print` (#726). ### Deprecations -- `$list$lengths()` is deprecated and will be removed in 0.14.0. Use - `$list$len()` instead (#724). -- `pl$from_arrow()` is deprecated and will be removed in 0.14.0. - Use `as_polars_df()` or `as_polars_series()` instead (#728). -- `pl$set_options()` and `pl$reset_options()` are deprecated and will be - removed in 0.14.0. See `?polars_options` for details (#726). +- `$list$lengths()` is deprecated and will be removed in 0.14.0. Use + `$list$len()` instead (#724). +- `pl$from_arrow()` is deprecated and will be removed in 0.14.0. + Use `as_polars_df()` or `as_polars_series()` instead (#728). +- `pl$set_options()` and `pl$reset_options()` are deprecated and will be + removed in 0.14.0. See `?polars_options` for details (#726). ### New features -- For compatibility with CRAN, the number of threads used by Polars is automatically set to 2 - if the environment variable `POLARS_MAX_THREADS` is not set (#720). - To disable this behavior and have the maximum number of threads used automatically, - one of the following ways can be used: - - Build the Rust library with the `disable_limit_max_threads` feature. - - Set the `polars.limit_max_threads` option to `FALSE` with the `options()` function - before loading the package. -- New method `$rolling()` for `DataFrame` and `LazyFrame`. When this is - applied, it creates an object of class `RPolarsRollingGroupBy` (#682, #694). -- New method `$group_by_dynamic()` for `DataFrame` and `LazyFrame`. When this - is applied, it creates an object of class `RPolarsDynamicGroupBy` (#691). -- New method `$sink_ndjson()` for LazyFrame (#681). -- New function `pl$duration()` to create a duration by components (week, day, - hour, etc.), and use them with date(time) variables (#692). -- New methods `$list$any()` and `$list$all()` (#709). -- New function `pl$from_epoch()` to convert a Unix timestamp to a date(time) - variable (#708). -- New methods for the `list` subnamespace: `$set_union()`, `$set_intersection()`, - `$set_difference()`, `$set_symmetric_difference()` (#712). -- New option `int64_conversion` to specify how Int64 columns (that don't have - equivalent in base R) should be converted. This option can either be set - globally with `pl$set_options()` or on a case-by-case basis, e.g with - `$to_data_frame(int64_conversion =)` (#706). -- Several changes in `$join()` for `DataFrame` and `LazyFrame` (#716): - - `$join()` now errors if `other` is not a `LazyFrame` and - `$join()` errors if `other` is not a `DataFrame`. - - Some arguments have been reordered (e.g `how` now comes before `left_on`). - This can lead to bugs if the user didn't use argument names. 
- - Argument `how` now accepts `"outer_coalesce"` to coalesce the join keys - automatically after joining. - - New argument `validate` to perform some checks on join keys (e.g ensure - that there is a one-to-one matching between join keys). - - New argument `join_nulls` to consider `null` values as a valid key. -- `$describe()` now works with all datatypes. It also gains an - `interpolation` argument that is used for quantiles computation (#717). -- `as_polars_df()` and `as_polars_series()` for the `arrow` package classes have been - rewritten and work better (#727). -- Options handling has been rewritten to match the standard option handling in - R (#726): - - Options are now passed via `options()`. The option names don't change but - they must be prefixed with `"polars."`. For example, we can now pass - `options(polars.strictly_immutable = FALSE)`. - - Options can be accessed with `polars_options()`, which returns a named - list (this is the replacement of `pl$options`). - - Options can be reset with `polars_options_reset()` (this is the - replacement of `pl$reset_options()`). -- New function `polars_envvars()` to print the list of environment variables - related to polars (#735). +- For compatibility with CRAN, the number of threads used by Polars is automatically set to 2 + if the environment variable `POLARS_MAX_THREADS` is not set (#720). + To disable this behavior and have the maximum number of threads used automatically, + one of the following ways can be used: + - Build the Rust library with the `disable_limit_max_threads` feature. + - Set the `polars.limit_max_threads` option to `FALSE` with the `options()` function + before loading the package. +- New method `$rolling()` for `DataFrame` and `LazyFrame`. When this is + applied, it creates an object of class `RPolarsRollingGroupBy` (#682, #694). +- New method `$group_by_dynamic()` for `DataFrame` and `LazyFrame`. When this + is applied, it creates an object of class `RPolarsDynamicGroupBy` (#691). +- New method `$sink_ndjson()` for LazyFrame (#681). +- New function `pl$duration()` to create a duration by components (week, day, + hour, etc.), and use them with date(time) variables (#692). +- New methods `$list$any()` and `$list$all()` (#709). +- New function `pl$from_epoch()` to convert a Unix timestamp to a date(time) + variable (#708). +- New methods for the `list` subnamespace: `$set_union()`, `$set_intersection()`, + `$set_difference()`, `$set_symmetric_difference()` (#712). +- New option `int64_conversion` to specify how Int64 columns (that don't have + equivalent in base R) should be converted. This option can either be set + globally with `pl$set_options()` or on a case-by-case basis, e.g with + `$to_data_frame(int64_conversion =)` (#706). +- Several changes in `$join()` for `DataFrame` and `LazyFrame` (#716): + - `$join()` now errors if `other` is not a `LazyFrame` and + `$join()` errors if `other` is not a `DataFrame`. + - Some arguments have been reordered (e.g `how` now comes before `left_on`). + This can lead to bugs if the user didn't use argument names. + - Argument `how` now accepts `"outer_coalesce"` to coalesce the join keys + automatically after joining. + - New argument `validate` to perform some checks on join keys (e.g ensure + that there is a one-to-one matching between join keys). + - New argument `join_nulls` to consider `null` values as a valid key. +- `$describe()` now works with all datatypes. It also gains an + `interpolation` argument that is used for quantiles computation (#717). 
+- `as_polars_df()` and `as_polars_series()` for the `arrow` package classes have been + rewritten and work better (#727). +- Options handling has been rewritten to match the standard option handling in + R (#726): + - Options are now passed via `options()`. The option names don't change but + they must be prefixed with `"polars."`. For example, we can now pass + `options(polars.strictly_immutable = FALSE)`. + - Options can be accessed with `polars_options()`, which returns a named + list (this is the replacement of `pl$options`). + - Options can be reset with `polars_options_reset()` (this is the + replacement of `pl$reset_options()`). +- New function `polars_envvars()` to print the list of environment variables + related to polars (#735). ## Polars R Package 0.12.2 @@ -270,289 +374,289 @@ a large amount of documentation improvements. ### Deprecations -- `pl$polars_info()` is moved to `polars_info()`. `pl$polars_info()` is deprecated - and will be removed in 0.13.0 (#662). +- `pl$polars_info()` is moved to `polars_info()`. `pl$polars_info()` is deprecated + and will be removed in 0.13.0 (#662). ### Rust-polars update -- rust-polars is updated to 0.36.2 (#659). Most of the changes from 0.35.x to 0.36.2 - were covered in R polars 0.12.0. - The main change is that `pl$Utf8` is replaced by `pl$String`. - `pl$Utf8` is an alias and will keep working, but `pl$String` is now preferred - in the documentation and in new code. +- rust-polars is updated to 0.36.2 (#659). Most of the changes from 0.35.x to 0.36.2 + were covered in R polars 0.12.0. + The main change is that `pl$Utf8` is replaced by `pl$String`. + `pl$Utf8` is an alias and will keep working, but `pl$String` is now preferred + in the documentation and in new code. ### What's changed -- New methods `$str$reverse()`, `$str$contains_any()`, and `$str$replace_many()` - (#641). -- New methods `$rle()` and `$rle_id()` (#648). -- New functions `is_polars_df()`, `is_polars_lf()`, `is_polars_series()` (#658). -- `$gather()` now accepts negative indexing (#659). +- New methods `$str$reverse()`, `$str$contains_any()`, and `$str$replace_many()` + (#641). +- New methods `$rle()` and `$rle_id()` (#648). +- New functions `is_polars_df()`, `is_polars_lf()`, `is_polars_series()` (#658). +- `$gather()` now accepts negative indexing (#659). ### Miscellaneous -- Remove the `Makefile` in favor of `Taskfile.yml`. - Please use `task` instead of `make` as a task runner in the development (#654). +- Remove the `Makefile` in favor of `Taskfile.yml`. + Please use `task` instead of `make` as a task runner in the development (#654). ## Polars R Package 0.12.0 ### BREAKING CHANGES DUE TO RUST-POLARS UPDATE -- rust-polars is updated to 2023-12-25 unreleased version (#601, #622). - This is the same version of Python Polars package 0.20.2, so please check - the [upgrade guide](https://pola-rs.github.io/polars/releases/upgrade/0.20/) for details too. - - `pl$scan_csv()` and `pl$read_csv()`'s `comment_char` argument is renamed `comment_prefix`. - - `$frame_equal()` and `$series_equal()` are renamed - to `$equals()` and `$equals()`. - - `$rolling_*` functions gained an argument `warn_if_unsorted`. - - `$str$json_extract()` is renamed to `$str$json_decode()`. - - Change default join behavior with regard to `null` values. - - Preserve left and right join keys in outer joins. - - `count` now ignores null values. - - `NaN` values are now considered equal. - - `$gather_every()` gained an argument `offset`. +- rust-polars is updated to 2023-12-25 unreleased version (#601, #622). 
+ This is the same version of Python Polars package 0.20.2, so please check + the [upgrade guide](https://pola-rs.github.io/polars/releases/upgrade/0.20/) for details too. + - `pl$scan_csv()` and `pl$read_csv()`'s `comment_char` argument is renamed `comment_prefix`. + - `$frame_equal()` and `$series_equal()` are renamed + to `$equals()` and `$equals()`. + - `$rolling_*` functions gained an argument `warn_if_unsorted`. + - `$str$json_extract()` is renamed to `$str$json_decode()`. + - Change default join behavior with regard to `null` values. + - Preserve left and right join keys in outer joins. + - `count` now ignores null values. + - `NaN` values are now considered equal. + - `$gather_every()` gained an argument `offset`. ### Breaking changes and deprecations -- `$apply()` on an Expr or a Series is renamed `$map_elements()`, and `$map()` - is renamed `$map_batches()`. `$map()` and `$apply()` will be removed in 0.13.0 (#534). -- Removed `$days()`, `$hours()`, `$minutes()`, `$seconds()`, `$milliseconds()`, - `$microseconds()`, `$nanoseconds()`. Those were deprecated in 0.11.0 (#550). -- `pl$concat_list()`: elements being strings are now interpreted as column names. - Use `pl$lit` to concat with a string. -- `$lit_to_s()` is renamed to `$to_series()` (#582). -- `$lit_to_df()` is removed (#582). -- Change class names and function names associated with class names. - - The class name of all objects created by polars (`DataFrame`, `LazyFrame`, - `Expr`, `Series`, etc.) has changed. They now start with `RPolars`, for example - `RPolarsDataFrame`. This will only break your code if you directly use those - class names, such as in S3 methods (#554, #585). - - Private methods have been unified so that they do not have the `RPolars` prefix (#584). +- `$apply()` on an Expr or a Series is renamed `$map_elements()`, and `$map()` + is renamed `$map_batches()`. `$map()` and `$apply()` will be removed in 0.13.0 (#534). +- Removed `$days()`, `$hours()`, `$minutes()`, `$seconds()`, `$milliseconds()`, + `$microseconds()`, `$nanoseconds()`. Those were deprecated in 0.11.0 (#550). +- `pl$concat_list()`: elements being strings are now interpreted as column names. + Use `pl$lit` to concat with a string. +- `$lit_to_s()` is renamed to `$to_series()` (#582). +- `$lit_to_df()` is removed (#582). +- Change class names and function names associated with class names. + - The class name of all objects created by polars (`DataFrame`, `LazyFrame`, + `Expr`, `Series`, etc.) has changed. They now start with `RPolars`, for example + `RPolarsDataFrame`. This will only break your code if you directly use those + class names, such as in S3 methods (#554, #585). + - Private methods have been unified so that they do not have the `RPolars` prefix (#584). ### What's changed -- The Extract function (`[`) for DataFrame can use columns not included in the - result for filtering (#547). -- The Extract function (`[`) for LazyFrame can filter rows with Expressions (#547). -- `as_polars_df()` for `data.frame` has a new argument `rownames` for to convert - the row.names attribute to a column. - This option is inspired by the `tibble::as_tibble()` function (#561). -- `as_polars_df()` for `data.frame` has a new argument `make_names_unique` (#561). -- New methods `$str$to_date()`, `$str$to_time()`, `$str$to_datetime()` as - alternatives to `$str$strptime()` (#558). -- The `dim()` function for DataFrame and LazyFrame correctly returns integer instead of - double (#577). 
-- The conversion of R's `POSIXct` class to Polars datetime now works correctly with millisecond
-  precision (#589).
-- `$filter()`, `$filter()`, and `pl$when()` now allow multiple conditions
-  to be separated by commas, like `lf$filter(pl$col("foo") == 1, pl$col("bar") != 2)` (#598).
-- New method `$replace()` for expressions (#601).
-- Better error messages for trailing argument commas such as `pl$DataFrame()$select("a",)` (#607).
-- New function `pl$threadpool_size()` to get the number of threads used by Polars (#620).
-  Thread pool size is also included in the output of `pl$polars_info()`.
+- The Extract function (`[`) for DataFrame can use columns not included in the
+  result for filtering (#547).
+- The Extract function (`[`) for LazyFrame can filter rows with Expressions (#547).
+- `as_polars_df()` for `data.frame` has a new argument `rownames` to convert
+  the row.names attribute to a column.
+  This option is inspired by the `tibble::as_tibble()` function (#561).
+- `as_polars_df()` for `data.frame` has a new argument `make_names_unique` (#561).
+- New methods `$str$to_date()`, `$str$to_time()`, `$str$to_datetime()` as
+  alternatives to `$str$strptime()` (#558).
+- The `dim()` function for DataFrame and LazyFrame correctly returns integer instead of
+  double (#577).
+- The conversion of R's `POSIXct` class to Polars datetime now works correctly with millisecond
+  precision (#589).
+- `$filter()` (for both DataFrame and LazyFrame) and `pl$when()` now allow multiple conditions
+  to be separated by commas, like `lf$filter(pl$col("foo") == 1, pl$col("bar") != 2)` (#598).
+- New method `$replace()` for expressions (#601).
+- Better error messages for trailing argument commas such as `pl$DataFrame()$select("a",)` (#607).
+- New function `pl$threadpool_size()` to get the number of threads used by Polars (#620).
+  Thread pool size is also included in the output of `pl$polars_info()`.

## Polars R Package 0.11.0

### BREAKING CHANGES DUE TO RUST-POLARS UPDATE

-- rust-polars is updated to 0.35.0 (2023-11-17) (#515)
-  - changes in `$write_csv()` and `sink_csv()`: `has_header` is renamed
-    `include_header` and there's a new argument `include_bom`.
-  - `pl$cov()` gains a `ddof` argument.
-  - `$cumsum()`, `$cumprod()`, `$cummin()`, `$cummax()`, `$cumcount()` are
-    renamed `$cum_sum()`, `$cum_prod()`, `$cum_min()`, `$cum_max()`,
-    `$cum_count()`.
-  - `take()` and `take_every()` are renamed `$gather()` and `gather_every()`.
-  - `$shift()` and `$shift_and_fill()` now accept Expr as input.
-  - when `reverse = TRUE`, `$arg_sort()` now places null values in the first
-    positions.
-  - Removed argument `ambiguous` in `$dt$truncate()` and `$dt$round()`.
-  - `$str$concat()` gains an argument `ignore_nulls`.
+- rust-polars is updated to 0.35.0 (2023-11-17) (#515)
+  - changes in `$write_csv()` and `sink_csv()`: `has_header` is renamed
+    `include_header` and there's a new argument `include_bom`.
+  - `pl$cov()` gains a `ddof` argument.
+  - `$cumsum()`, `$cumprod()`, `$cummin()`, `$cummax()`, `$cumcount()` are
+    renamed `$cum_sum()`, `$cum_prod()`, `$cum_min()`, `$cum_max()`,
+    `$cum_count()`.
+  - `take()` and `take_every()` are renamed `$gather()` and `gather_every()`.
+  - `$shift()` and `$shift_and_fill()` now accept Expr as input.
+  - when `reverse = TRUE`, `$arg_sort()` now places null values in the first
+    positions.
+  - Removed argument `ambiguous` in `$dt$truncate()` and `$dt$round()`.
+  - `$str$concat()` gains an argument `ignore_nulls`.
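The cumulative and gather renames in the 0.35.0 list above are one-for-one name changes. As an illustrative aside (not part of the original changelog or patch), here is a minimal sketch of the new spellings, assuming the `polars` package is attached and using a made-up column `x`:

```r
library(polars)

# Hypothetical toy data; shows the renamed methods from the list above:
# $cumsum() -> $cum_sum(), take() -> $gather().
df = pl$DataFrame(x = c(10, 20, 30, 40))

df$select(pl$col("x")$cum_sum()$alias("running_total"))
df$select(pl$col("x")$gather(c(0, 2))$alias("rows_0_and_2"))
```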
### Breaking changes and deprecations -- The rowwise computation when several columns are passed to `pl$min()`, `pl$max()`, - and `pl$sum()` is deprecated and will be removed in 0.12.0. Passing several - columns to these functions will now compute the min/max/sum in each column - separately. Use `pl$min_horizontal()` `pl$max_horizontal()`, and - `pl$sum_horizontal()` instead for rowwise computation (#508). -- `$is_not()` is deprecated and will be removed in 0.12.0. Use `$not()` instead - (#511, #531). -- `$is_first()` is deprecated and will be removed in 0.12.0. Use `$is_first_distinct()` - instead (#531). -- In `pl$concat()`, the argument `to_supertypes` is removed. Use the suffix - `"_relaxed"` in the `how` argument to cast columns to their shared supertypes - (#523). -- All duration methods (`days()`, `hours()`, `minutes()`, `seconds()`, - `milliseconds()`, `microseconds()`, `nanoseconds()`) are renamed, for example - from `$dt$days()` to `$dt$total_days()`. The old usage is deprecated and will - be removed in 0.12.0 (#530). -- DataFrame methods `$as_data_frame()` is removed in favor of `$to_data_frame()` (#533). -- GroupBy methods `$as_data_frame()` and `$to_data_frame()` which were used to - convert GroupBy objects to R data frames are removed. - Use `$ungroup()` method and the `as.data.frame()` function instead (#533). +- The rowwise computation when several columns are passed to `pl$min()`, `pl$max()`, + and `pl$sum()` is deprecated and will be removed in 0.12.0. Passing several + columns to these functions will now compute the min/max/sum in each column + separately. Use `pl$min_horizontal()` `pl$max_horizontal()`, and + `pl$sum_horizontal()` instead for rowwise computation (#508). +- `$is_not()` is deprecated and will be removed in 0.12.0. Use `$not()` instead + (#511, #531). +- `$is_first()` is deprecated and will be removed in 0.12.0. Use `$is_first_distinct()` + instead (#531). +- In `pl$concat()`, the argument `to_supertypes` is removed. Use the suffix + `"_relaxed"` in the `how` argument to cast columns to their shared supertypes + (#523). +- All duration methods (`days()`, `hours()`, `minutes()`, `seconds()`, + `milliseconds()`, `microseconds()`, `nanoseconds()`) are renamed, for example + from `$dt$days()` to `$dt$total_days()`. The old usage is deprecated and will + be removed in 0.12.0 (#530). +- DataFrame methods `$as_data_frame()` is removed in favor of `$to_data_frame()` (#533). +- GroupBy methods `$as_data_frame()` and `$to_data_frame()` which were used to + convert GroupBy objects to R data frames are removed. + Use `$ungroup()` method and the `as.data.frame()` function instead (#533). ### What's changed -- Fix the installation issue on Ubuntu 20.04 (#528, thanks @brownag). -- New methods `$write_json()` and `$write_ndjson()` for DataFrame (#502). -- Removed argument `name` in `pl$date_range()`, which was deprecated for a while - (#503). -- New private method `.pr$DataFrame$drop_all_in_place(df)` to drop `DataFrame` - in-place, to release memory without invoking gc(). However, if there are other - strong references to any of the underlying Series or arrow arrays, that memory - will specifically not be released. This method is aimed for r-polars extensions, - and will be kept stable as much as possible (#504). -- New functions `pl$min_horizontal()`, `pl$max_horizontal()`, `pl$sum_horizontal()`, - `pl$all_horizontal()`, `pl$any_horizontal()` (#508). -- New generic functions `as_polars_df()` and `as_polars_lf()` to create polars - DataFrames and LazyFrames (#519). 
-- New method `$ungroup()` for `GroupBy` and `LazyGroupBy` (#522).
-- New method `$rolling()` to apply an Expr over a rolling window based on
-  date/datetime/numeric indices (#470).
-- New methods `$name$to_lowercase()` and `$name$to_uppercase()` to transform
-  variable names (#529).
-- New method `$is_last_distinct()` (#531).
-- New methods of the Expressions class, `$floor_div()`, `$mod()`, `$eq_missing()`
-  and `$neq_missing()`. The base R operators `%/%` and `%%` for Expressions are
-  now translated to `$floor_div()` and `$mod()` (#523).
-  - Note that `$mod()` of Polars is different from the R operator `%%`, which is
-    not guaranteed `x == (x %% y) + y * (x %/% y)`.
-    Please check the upstream issue [pola-rs/polars#10570](https://github.com/pola-rs/polars/issues/10570).
-- The extract function (`[`) for polars objects now behave more like for base R objects (#543).
+- Fix the installation issue on Ubuntu 20.04 (#528, thanks @brownag).
+- New methods `$write_json()` and `$write_ndjson()` for DataFrame (#502).
+- Removed argument `name` in `pl$date_range()`, which was deprecated for a while
+  (#503).
+- New private method `.pr$DataFrame$drop_all_in_place(df)` to drop `DataFrame`
+  in-place, to release memory without invoking gc(). However, if there are other
+  strong references to any of the underlying Series or arrow arrays, that memory
+  will specifically not be released. This method is aimed at r-polars extensions,
+  and will be kept stable as much as possible (#504).
+- New functions `pl$min_horizontal()`, `pl$max_horizontal()`, `pl$sum_horizontal()`,
+  `pl$all_horizontal()`, `pl$any_horizontal()` (#508).
+- New generic functions `as_polars_df()` and `as_polars_lf()` to create polars
+  DataFrames and LazyFrames (#519).
+- New method `$ungroup()` for `GroupBy` and `LazyGroupBy` (#522).
+- New method `$rolling()` to apply an Expr over a rolling window based on
+  date/datetime/numeric indices (#470).
+- New methods `$name$to_lowercase()` and `$name$to_uppercase()` to transform
+  variable names (#529).
+- New method `$is_last_distinct()` (#531).
+- New methods of the Expressions class, `$floor_div()`, `$mod()`, `$eq_missing()`
+  and `$neq_missing()`. The base R operators `%/%` and `%%` for Expressions are
+  now translated to `$floor_div()` and `$mod()` (#523).
+  - Note that `$mod()` of Polars is different from the R operator `%%`, so the
+    identity `x == (x %% y) + y * (x %/% y)` is not guaranteed to hold.
+    Please check the upstream issue [pola-rs/polars#10570](https://github.com/pola-rs/polars/issues/10570).
+- The extract function (`[`) for polars objects now behaves more like it does for base R objects (#543).

## Polars R Package 0.10.1

### What's changed

-- The argument `quote_style` in `$write_csv()` and `$sink_csv()` can now take
-  the value `"never"` (#483).
-- `pl$DataFrame()` now errors if the variables specified in `schema` do not exist
-  in the data (#486).
-- S3 methods for base R functions are well documented (#494).
-- A bug that failing `pl$SQLContext()$register()` without load the package was fixed (#496).
+- The argument `quote_style` in `$write_csv()` and `$sink_csv()` can now take
+  the value `"never"` (#483).
+- `pl$DataFrame()` now errors if the variables specified in `schema` do not exist
+  in the data (#486).
+- S3 methods for base R functions are well documented (#494).
+- A bug where `pl$SQLContext()$register()` failed unless the package was loaded has been fixed (#496).
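As an illustrative aside (not part of the original changelog or patch), the `quote_style = "never"` value added in 0.10.1 above could be exercised as below; the toy data frame and temporary file path are invented for the example:

```r
library(polars)

# Made-up data; write it without quoting any fields, per the new "never" quote style.
df = pl$DataFrame(a = c("foo", "bar"), b = c(1, 2))
csv_path = tempfile(fileext = ".csv")

df$write_csv(csv_path, quote_style = "never")
readLines(csv_path)
```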
## Polars R Package 0.10.0 ### BREAKING CHANGES DUE TO RUST-POLARS UPDATE -- rust-polars is updated to 2023-10-25 unreleased version (#442) - - Minimum supported Rust version (MSRV) is now 1.73. - - New subnamespace `"name"` that contains methods `$prefix()`, `$suffix()` - `keep()` (renamed from `keep_name()`) and `map()` (renamed from `map_alias()`). - - `$dt$round()` gains an argument `ambiguous`. - - The following methods now accept an `Expr` as input: `$top_k()`, `$bottom_k()`, - `$list$join()`, `$str$strip_chars()`, `$str$strip_chars_start()`, - `$str$strip_chars_end()`, `$str$split_exact()`. - - The following methods were renamed: - - `$str$n_chars()` -> `$str$len_chars()` - - `$str$lengths()` -> `$str$len_bytes()` - - `$str$ljust()` -> `$str$pad_end()` - - `$str$rjust()` -> `$str$pad_start()` - - `$concat()` with `how = "diagonal"` now accepts an argument `to_supertypes` - to automatically convert concatenated columns to the same type. - - `pl$enable_string_cache()` doesn't take any argument anymore. The string cache - can now be disabled with `pl$disable_string_cache()`. - - `$scan_parquet()` gains an argument `hive_partitioning`. - - `$meta$tree_format()` has a better formatted output. +- rust-polars is updated to 2023-10-25 unreleased version (#442) + - Minimum supported Rust version (MSRV) is now 1.73. + - New subnamespace `"name"` that contains methods `$prefix()`, `$suffix()` + `keep()` (renamed from `keep_name()`) and `map()` (renamed from `map_alias()`). + - `$dt$round()` gains an argument `ambiguous`. + - The following methods now accept an `Expr` as input: `$top_k()`, `$bottom_k()`, + `$list$join()`, `$str$strip_chars()`, `$str$strip_chars_start()`, + `$str$strip_chars_end()`, `$str$split_exact()`. + - The following methods were renamed: + - `$str$n_chars()` -> `$str$len_chars()` + - `$str$lengths()` -> `$str$len_bytes()` + - `$str$ljust()` -> `$str$pad_end()` + - `$str$rjust()` -> `$str$pad_start()` + - `$concat()` with `how = "diagonal"` now accepts an argument `to_supertypes` + to automatically convert concatenated columns to the same type. + - `pl$enable_string_cache()` doesn't take any argument anymore. The string cache + can now be disabled with `pl$disable_string_cache()`. + - `$scan_parquet()` gains an argument `hive_partitioning`. + - `$meta$tree_format()` has a better formatted output. ### Breaking changes -- `$scan_csv()` and `$read_csv()` now match more closely the Python-Polars API (#455): - - `sep` is renamed `separator`, `overwrite_dtypes` is renamed `dtypes`, - `parse_dates` is renamed `try_parse_dates`. - - new arguments `rechunk`, `eol_char`, `raise_if_empty`, `truncate_ragged_lines` - - `path` can now be a vector of characters indicating several paths to CSV files. - This only works if all CSV files have the same schema. +- `$scan_csv()` and `$read_csv()` now match more closely the Python-Polars API (#455): + - `sep` is renamed `separator`, `overwrite_dtypes` is renamed `dtypes`, + `parse_dates` is renamed `try_parse_dates`. + - new arguments `rechunk`, `eol_char`, `raise_if_empty`, `truncate_ragged_lines` + - `path` can now be a vector of characters indicating several paths to CSV files. + This only works if all CSV files have the same schema. ### What's changed -- New class `RPolarsSQLContext` and its methods to perform SQL queries on DataFrame- - like objects. To use this feature, needs to build Rust library with full features - (#457). -- New methods `$peak_min()` and `$peak_max()` to find local minima and maxima in - an Expr (#462). 
-- New methods `$read_ndjson()` and `$scan_ndjson()` (#471).
-- New method `$with_context()` for `LazyFrame` to have access to columns from
-  other Data/LazyFrames during the computation (#475).
+- New class `RPolarsSQLContext` and its methods to perform SQL queries on DataFrame-
+  like objects. To use this feature, the Rust library needs to be built with full features
+  (#457).
+- New methods `$peak_min()` and `$peak_max()` to find local minima and maxima in
+  an Expr (#462).
+- New methods `$read_ndjson()` and `$scan_ndjson()` (#471).
+- New method `$with_context()` for `LazyFrame` to have access to columns from
+  other Data/LazyFrames during the computation (#475).

## Polars R Package 0.9.0

### BREAKING CHANGES DUE TO RUST-POLARS UPDATE

-- rust-polars is updated to 0.33.2 (#417)
-  - In all date-time related methods, the argument `use_earliest` is replaced by `ambiguous`.
-  - In `$sample()` and `$shuffle()`, the argument `fixed_seed` is removed.
-  - In `$value_counts()`, the arguments `multithreaded` and `sort`
-    (sometimes called `sorted`) have been swapped and renamed `sort` and `parallel`.
-  - `$str$count_match()` gains a `literal` argument.
-  - `$arg_min()` doesn't consider `NA` as the minimum anymore (this was already the behavior of `$min()`).
-  - Using `$is_in()` with `NA` on both sides now returns `NA` and not `TRUE` anymore.
-  - Argument `pattern` of `$str$count_matches()` can now use expressions.
-  - Needs Rust toolchain `nightly-2023-08-26` for to build with full features.
-- Rename R functions to match rust-polars
-  - `$str$count_match()` -> `$str$count_matches()` (#417)
-  - `$str$strip()` -> `$str$strip_chars()` (#417)
-  - `$str$lstrip()` -> `$str$strip_chars_start()` (#417)
-  - `$str$rstrip()` -> `$str$strip_chars_end()` (#417)
-  - `$groupby()` is renamed `$group_by()`. (#427)
+- rust-polars is updated to 0.33.2 (#417)
+  - In all date-time related methods, the argument `use_earliest` is replaced by `ambiguous`.
+  - In `$sample()` and `$shuffle()`, the argument `fixed_seed` is removed.
+  - In `$value_counts()`, the arguments `multithreaded` and `sort`
+    (sometimes called `sorted`) have been swapped and renamed `sort` and `parallel`.
+  - `$str$count_match()` gains a `literal` argument.
+  - `$arg_min()` doesn't consider `NA` as the minimum anymore (this was already the behavior of `$min()`).
+  - Using `$is_in()` with `NA` on both sides now returns `NA` and not `TRUE` anymore.
+  - Argument `pattern` of `$str$count_matches()` can now use expressions.
+  - Needs Rust toolchain `nightly-2023-08-26` to build with full features.
+- Rename R functions to match rust-polars
+  - `$str$count_match()` -> `$str$count_matches()` (#417)
+  - `$str$strip()` -> `$str$strip_chars()` (#417)
+  - `$str$lstrip()` -> `$str$strip_chars_start()` (#417)
+  - `$str$rstrip()` -> `$str$strip_chars_end()` (#417)
+  - `$groupby()` is renamed `$group_by()`. (#427)

### Breaking changes

-- Remove some deprecated methods.
-  - Method `$with_column()` has been removed (it was deprecated since 0.8.0).
-    Use `$with_columns()` instead (#402).
-  - Subnamespace `$arr` has been removed (it was deprecated since 0.8.1).
-    Use `$list` instead (#402).
-- Setting and getting polars options is now made with `pl$options`,
-  `pl$set_options()` and `pl$reset_options()` (#384).
+- Remove some deprecated methods.
+  - Method `$with_column()` has been removed (it was deprecated since 0.8.0).
+    Use `$with_columns()` instead (#402).
+  - Subnamespace `$arr` has been removed (it was deprecated since 0.8.1).
+    Use `$list` instead (#402).
+- Setting and getting polars options is now made with `pl$options`, + `pl$set_options()` and `pl$reset_options()` (#384). ### What's changed -- Bump supported R version to 4.2 or later (#435). -- `pl$concat()` now also supports `Series`, `Expr` and `LazyFrame` (#407). -- New method `$unnest()` for `LazyFrame` (#397). -- New method `$sample()` for `DataFrame` (#399). -- New method `$meta$tree_format()` to display an `Expr` as a tree (#401). -- New argument `schema` in `pl$DataFrame()` and `pl$LazyFrame()` to override the - automatic type detection (#385). -- Fix bug when calling R from polars via e.g. `$map()` where query would not - complete in one edge case (#409). -- New method `$cat$get_categories()` to list unique values of categorical - variables (#412). -- New methods `$fold()` and `$reduce()` to apply an R function rowwise (#403). -- New function `pl$raw_list` and class `rpolars_raw_list` a list of R Raw's, where missing is - encoded as `NULL` to aid conversion to polars binary Series. Support back and forth conversion - from polars binary literal and Series to R raw (#417). -- New method `$write_csv()` for `DataFrame` (#414). -- New method `$sink_csv()` for `LazyFrame` (#432). -- New method `$dt$time()` to extract the time from a `datetime` variable (#428). -- Method `$profile()` gains optimization arguments and plot-related arguments (#429). -- New method `pl$read_parquet()` that is a shortcut for `pl$scan_parquet()$collect()` (#434). -- Rename `$str$str_explode()` to `$str$explode()` (#436). -- New method `$transpose()` for `DataFrame` (#440). -- New argument `eager` of `LazyFrame$set_optimization_toggle()` (#439). -- `{polars}` can now be installed with "R source package with Rust library binary", - by a mechanism copied from [the prqlr package](https://CRAN.R-project.org/package=prqlr). - - ```r - Sys.setenv(NOT_CRAN = "true") - install.packages("polars", repos = "https://rpolars.r-universe.dev") - ``` - - The URL and SHA256 hash of the available binaries are recorded in `tools/lib-sums.tsv`. - (#435, #448, #450, #451) +- Bump supported R version to 4.2 or later (#435). +- `pl$concat()` now also supports `Series`, `Expr` and `LazyFrame` (#407). +- New method `$unnest()` for `LazyFrame` (#397). +- New method `$sample()` for `DataFrame` (#399). +- New method `$meta$tree_format()` to display an `Expr` as a tree (#401). +- New argument `schema` in `pl$DataFrame()` and `pl$LazyFrame()` to override the + automatic type detection (#385). +- Fix bug when calling R from polars via e.g. `$map()` where query would not + complete in one edge case (#409). +- New method `$cat$get_categories()` to list unique values of categorical + variables (#412). +- New methods `$fold()` and `$reduce()` to apply an R function rowwise (#403). +- New function `pl$raw_list` and class `rpolars_raw_list` a list of R Raw's, where missing is + encoded as `NULL` to aid conversion to polars binary Series. Support back and forth conversion + from polars binary literal and Series to R raw (#417). +- New method `$write_csv()` for `DataFrame` (#414). +- New method `$sink_csv()` for `LazyFrame` (#432). +- New method `$dt$time()` to extract the time from a `datetime` variable (#428). +- Method `$profile()` gains optimization arguments and plot-related arguments (#429). +- New method `pl$read_parquet()` that is a shortcut for `pl$scan_parquet()$collect()` (#434). +- Rename `$str$str_explode()` to `$str$explode()` (#436). +- New method `$transpose()` for `DataFrame` (#440). 
+- New argument `eager` of `LazyFrame$set_optimization_toggle()` (#439). +- `{polars}` can now be installed with "R source package with Rust library binary", + by a mechanism copied from [the prqlr package](https://CRAN.R-project.org/package=prqlr). + + ```r + Sys.setenv(NOT_CRAN = "true") + install.packages("polars", repos = "https://rpolars.r-universe.dev") + ``` + + The URL and SHA256 hash of the available binaries are recorded in `tools/lib-sums.tsv`. + (#435, #448, #450, #451) ## Polars R Package 0.8.1 ### What's changed -- New string method `to_titlecase()` (#371). -- Although stated in news for PR (#334) `strip = true` was not actually set for the - "release-optimized" compilation profile. Now it is, but the binary sizes seems unchanged (#377). -- New vignette on best practices to improve `polars` performance (#188). -- Subnamespace name "arr" as in `$arr$` & `$arr$` is deprecated - in favor of "list". The subnamespace "arr" will be removed in polars 0.9.0 (#375). +- New string method `to_titlecase()` (#371). +- Although stated in news for PR (#334) `strip = true` was not actually set for the + "release-optimized" compilation profile. Now it is, but the binary sizes seems unchanged (#377). +- New vignette on best practices to improve `polars` performance (#188). +- Subnamespace name "arr" as in `$arr$` & `$arr$` is deprecated + in favor of "list". The subnamespace "arr" will be removed in polars 0.9.0 (#375). ## Polars R Package 0.8.0 @@ -562,354 +666,176 @@ rust-polars was updated to 0.32.0, which comes with many breaking changes and ne features. Unrelated breaking changes and new features are put in separate sections (#334): -- update of rust toolchain: nightly bumped to nightly-2023-07-27 and MSRV is - now >=1.70. -- param `common_subplan_elimination = TRUE` in `` methods `$collect()`, - `$sink_ipc()` and `$sink_parquet()` is renamed and split into - `comm_subplan_elim = TRUE` and `comm_subexpr_elim = TRUE`. -- Series_is_sorted: nulls_last argument is dropped. -- `when-then-otherwise` classes are renamed to `When`, `Then`, `ChainedWhen` - and `ChainedThen`. The syntactically illegal methods have been removed, e.g. - chaining `$when()` twice. -- Github release + R-universe is compiled with `profile=release-optimized`, - which now includes `strip=false`, `lto=fat` & `codegen-units=1`. This should - make the binary a bit smaller and faster. See also FULL_FEATURES=`true` env - flag to enable simd with nightly rust. For development or faster compilation, - use instead `profile=release`. -- `fmt` arg is renamed `format` in `pl$Ptimes` and `$str$strptime`. -- `$approx_unique()` changed name to `$approx_n_unique()`. -- `$str$json_extract` arg `pat` changed to `dtype` and has a new argument - `infer_schema_length = 100`. -- Some arguments in `pl$date_range()` have changed: `low` -> `start`, - `high` -> `end`, `lazy = TRUE` -> `eager = FALSE`. Args `time_zone` and `time_unit` - can no longer be used to implicitly cast time types. These two args can only - be used to annotate a naive time unit. Mixing `time_zone` and `time_unit` for - `start` and `end` is not allowed anymore. -- `$is_in()` operation no longer supported for dtype `null`. -- Various subtle changes: - - `(pl$lit(NA_real_) == pl$lit(NA_real_))$lit_to_s()` renders now to `null` - not `true`. - - `pl$lit(NA_real_)$is_in(pl$lit(NULL))$lit_to_s()` renders now to `false` - and before `true` - - `pl$lit(numeric(0))$sum()$lit_to_s()` now yields `0f64` and not `null`. -- `$all()` and `$any()` have a new arg `drop_nulls = TRUE`. 
-- `$sample()` and `$shuffle()` have a new arg `fix_seed`. -- `$sort()` and `$sort()` have a new arg - `maintain_order = FALSE`. +- update of rust toolchain: nightly bumped to nightly-2023-07-27 and MSRV is + now >=1.70. +- param `common_subplan_elimination = TRUE` in `` methods `$collect()`, + `$sink_ipc()` and `$sink_parquet()` is renamed and split into + `comm_subplan_elim = TRUE` and `comm_subexpr_elim = TRUE`. +- Series_is_sorted: nulls_last argument is dropped. +- `when-then-otherwise` classes are renamed to `When`, `Then`, `ChainedWhen` + and `ChainedThen`. The syntactically illegal methods have been removed, e.g. + chaining `$when()` twice. +- Github release + R-universe is compiled with `profile=release-optimized`, + which now includes `strip=false`, `lto=fat` & `codegen-units=1`. This should + make the binary a bit smaller and faster. See also FULL_FEATURES=`true` env + flag to enable simd with nightly rust. For development or faster compilation, + use instead `profile=release`. +- `fmt` arg is renamed `format` in `pl$Ptimes` and `$str$strptime`. +- `$approx_unique()` changed name to `$approx_n_unique()`. +- `$str$json_extract` arg `pat` changed to `dtype` and has a new argument + `infer_schema_length = 100`. +- Some arguments in `pl$date_range()` have changed: `low` -> `start`, + `high` -> `end`, `lazy = TRUE` -> `eager = FALSE`. Args `time_zone` and `time_unit` + can no longer be used to implicitly cast time types. These two args can only + be used to annotate a naive time unit. Mixing `time_zone` and `time_unit` for + `start` and `end` is not allowed anymore. +- `$is_in()` operation no longer supported for dtype `null`. +- Various subtle changes: + - `(pl$lit(NA_real_) == pl$lit(NA_real_))$lit_to_s()` renders now to `null` + not `true`. + - `pl$lit(NA_real_)$is_in(pl$lit(NULL))$lit_to_s()` renders now to `false` + and before `true` + - `pl$lit(numeric(0))$sum()$lit_to_s()` now yields `0f64` and not `null`. +- `$all()` and `$any()` have a new arg `drop_nulls = TRUE`. +- `$sample()` and `$shuffle()` have a new arg `fix_seed`. +- `$sort()` and `$sort()` have a new arg + `maintain_order = FALSE`. ### OTHER BREAKING CHANGES -- `$rpow()` is removed. It should never have been translated. Use `^` and `$pow()` - instead (#346). -- `$collect_background()` renamed `$collect_in_background()` - and reworked. Likewise `PolarsBackgroundHandle` reworked and renamed to - `RThreadHandle` (#311). -- `pl$scan_arrow_ipc` is now called `pl$scan_ipc` (#343). +- `$rpow()` is removed. It should never have been translated. Use `^` and `$pow()` + instead (#346). +- `$collect_background()` renamed `$collect_in_background()` + and reworked. Likewise `PolarsBackgroundHandle` reworked and renamed to + `RThreadHandle` (#311). +- `pl$scan_arrow_ipc` is now called `pl$scan_ipc` (#343). ### Other changes -- Stream query to file with `pl$sink_ipc()` and `pl$sink_parquet()` (#343) -- New method `$explode()` for `DataFrame` and `LazyFrame` (#314). -- New method `$clone()` for `LazyFrame` (#347). -- New method `$fetch()` for `LazyFrame` (#319). -- New methods `$optimization_toggle()` and `$profile()` for `LazyFrame` (#323). -- `$with_column()` is now deprecated (following upstream `polars`). It will be - removed in 0.9.0. It should be replaced with `$with_columns()` (#313). -- New lazy function translated: `concat_str()` to concatenate several columns - into one (#349). -- New stat functions `pl$cov()`, `pl$rolling_cov()` `pl$corr()`, `pl$rolling_corr()` (#351). 
-- Add functions `pl$set_global_rpool_cap()`, `pl$get_global_rpool_cap()`, class `RThreadHandle` and - `in_background = FALSE` param to `$map()` and `$apply()`. It is now possible to run R code - with `collect_in_background()` and/or let polars parallize R code in an R processes - pool. See `RThreadHandle-class` in reference docs for more info. (#311) -- Internal IPC/shared-mem channel to serialize and send R objects / polars DataFrame across - R processes. (#311) -- Compile environment flag RPOLARS_ALL_FEATURES changes name to RPOLARS_FULL_FEATURES. If 'true' - will trigger something like `Cargo build --features "full_features"` which is not exactly the same - as `Cargo build --all-features`. Some dev features are not included in "full_features" (#311). -- Fix bug to allow using polars without library(polars) (#355). -- New methods `$optimization_toggle()` + `$profile()` and enable rust-polars feature - CSE: "Activate common subplan elimination optimization" (#323) -- Named expression e.g. `pl$select(newname = pl$lit(2))` are no longer experimental - and allowed as default (#357). -- Added methods `pl$enable_string_cache()`, `pl$with_string_cache()` and `pl$using_string_cache()` - for joining/comparing Categorical series/columns (#361). -- Added an S3 generic `as_polars_series()` where users or developers of extensions - can define a custom way to convert their format to Polars format. This generic - must return a Polars series. See #368 for an example (#369). -- Private API Support for Arrow Stream import/export of DataFrame between two R packages that uses - rust-polars. [See R package example here](https://github.com/rpolars/extendrpolarsexamples) - (#326). +- Stream query to file with `pl$sink_ipc()` and `pl$sink_parquet()` (#343) +- New method `$explode()` for `DataFrame` and `LazyFrame` (#314). +- New method `$clone()` for `LazyFrame` (#347). +- New method `$fetch()` for `LazyFrame` (#319). +- New methods `$optimization_toggle()` and `$profile()` for `LazyFrame` (#323). +- `$with_column()` is now deprecated (following upstream `polars`). It will be + removed in 0.9.0. It should be replaced with `$with_columns()` (#313). +- New lazy function translated: `concat_str()` to concatenate several columns + into one (#349). +- New stat functions `pl$cov()`, `pl$rolling_cov()` `pl$corr()`, `pl$rolling_corr()` (#351). +- Add functions `pl$set_global_rpool_cap()`, `pl$get_global_rpool_cap()`, class `RThreadHandle` and + `in_background = FALSE` param to `$map()` and `$apply()`. It is now possible to run R code + with `collect_in_background()` and/or let polars parallize R code in an R processes + pool. See `RThreadHandle-class` in reference docs for more info. (#311) +- Internal IPC/shared-mem channel to serialize and send R objects / polars DataFrame across + R processes. (#311) +- Compile environment flag RPOLARS_ALL_FEATURES changes name to RPOLARS_FULL_FEATURES. If 'true' + will trigger something like `Cargo build --features "full_features"` which is not exactly the same + as `Cargo build --all-features`. Some dev features are not included in "full_features" (#311). +- Fix bug to allow using polars without library(polars) (#355). +- New methods `$optimization_toggle()` + `$profile()` and enable rust-polars feature + CSE: "Activate common subplan elimination optimization" (#323) +- Named expression e.g. `pl$select(newname = pl$lit(2))` are no longer experimental + and allowed as default (#357). 
+- Added methods `pl$enable_string_cache()`, `pl$with_string_cache()` and `pl$using_string_cache()` + for joining/comparing Categorical series/columns (#361). +- Added an S3 generic `as_polars_series()` where users or developers of extensions + can define a custom way to convert their format to Polars format. This generic + must return a Polars series. See #368 for an example (#369). +- Private API Support for Arrow Stream import/export of DataFrame between two R packages that uses + rust-polars. [See R package example here](https://github.com/rpolars/extendrpolarsexamples) + (#326). ## Polars R Package 0.7.0 ### BREAKING CHANGES -- Replace the argument `reverse` by `descending` in all sorting functions. This - is for consistency with the upstream Polars (#291, #293). -- Bump rust-polars from 2023-04-20 unreleased version to version 0.30.0 released in 2023-05-30 (#289). - - Rename `concat_lst` to `concat_list`. - - Rename `$str$explode` to `$str$str_explode`. - - Remove `tz_aware` and `utc` arguments from `str_parse`. - - in `$date_range`'s the `lazy` argument is now `TRUE` by default. -- The functions to read CSV have been renamed `scan_csv` and `read_csv` for - consistency with the upstream Polars. `scan_xxx` and `read_xxx` functions are now accessed via `pl`, - e.g. `pl$scan_csv()` (#305). +- Replace the argument `reverse` by `descending` in all sorting functions. This + is for consistency with the upstream Polars (#291, #293). +- Bump rust-polars from 2023-04-20 unreleased version to version 0.30.0 released in 2023-05-30 (#289). + - Rename `concat_lst` to `concat_list`. + - Rename `$str$explode` to `$str$str_explode`. + - Remove `tz_aware` and `utc` arguments from `str_parse`. + - in `$date_range`'s the `lazy` argument is now `TRUE` by default. +- The functions to read CSV have been renamed `scan_csv` and `read_csv` for + consistency with the upstream Polars. `scan_xxx` and `read_xxx` functions are now accessed via `pl`, + e.g. `pl$scan_csv()` (#305). ### What's changed -- New method `$rename()` for `LazyFrame` and `DataFrame` (#239) -- `$unique()` and `$unique()` gain a `maintain_order` argument (#238). -- New `pl$LazyFrame()` to quickly create a `LazyFrame`, mostly in examples or - for demonstration purposes (#240). -- Polars is internally moving away from string errors to a new error-type called `RPolarsErr` both on rust- and R-side. Final error messages should look very similar (#233). -- `$columns()`, `$schema()`, `$dtypes()` for `LazyFrame` implemented (#250). -- Improvements to internal `RPolarsErr`. Also `RPolarsErr` will now print each context of the error on a separate line (#250). -- Fix memory leak on error bug. Fix printing of `%` bug. Prepare for renaming of polars classes (#252). -- Add helpful reference landing page at `polars.github.io/reference_home` (#223, #264). -- Supports Rust 1.65 (#262, #280) - - rust-polars' `simd` feature is now disabled by default. To enable it, set the environment variable - `RPOLARS_ALL_FEATURES` to `true` when build r-polars (#262). - - `opt-level` of `argminmax` is now set to `1` in the `release` profile to support Rust < 1.66. - The profile can be changed by setting the environment variable `RPOLARS_PROFILE` (when set to `release-optimized`, - `opt-level` of `argminmax` is set to `3`). -- A new function `pl$polars_info()` will tell which features enabled (#271, #285, #305). -- `select()` now accepts lists of expressions. For example, `$select(l_expr)` - works with `l_expr = list(pl$col("a"))` (#265). 
-- LazyFrame gets some new S3 methods: `[`, `dim()`, `dimnames()`, `length()`, `names()` (#301) -- `$glimpse()` is a fast `str()`-like view of a `DataFrame` (#277). -- `$over()` now accepts a vector of column names (#287). -- New method `$describe()` (#268). -- Cross joining is now possible with `how = "cross"` in `$join()` (#310). -- Add license info of all rust crates to `LICENSE.note` (#309). -- With CRAN 0.7.0 release candidate (#308). - - New author accredited, SHIMA Tatsuya (@eitsupi). - - DESCRIPTION revised. +- New method `$rename()` for `LazyFrame` and `DataFrame` (#239) +- `$unique()` and `$unique()` gain a `maintain_order` argument (#238). +- New `pl$LazyFrame()` to quickly create a `LazyFrame`, mostly in examples or + for demonstration purposes (#240). +- Polars is internally moving away from string errors to a new error-type called `RPolarsErr` both on rust- and R-side. Final error messages should look very similar (#233). +- `$columns()`, `$schema()`, `$dtypes()` for `LazyFrame` implemented (#250). +- Improvements to internal `RPolarsErr`. Also `RPolarsErr` will now print each context of the error on a separate line (#250). +- Fix memory leak on error bug. Fix printing of `%` bug. Prepare for renaming of polars classes (#252). +- Add helpful reference landing page at `polars.github.io/reference_home` (#223, #264). +- Supports Rust 1.65 (#262, #280) + - rust-polars' `simd` feature is now disabled by default. To enable it, set the environment variable + `RPOLARS_ALL_FEATURES` to `true` when build r-polars (#262). + - `opt-level` of `argminmax` is now set to `1` in the `release` profile to support Rust < 1.66. + The profile can be changed by setting the environment variable `RPOLARS_PROFILE` (when set to `release-optimized`, + `opt-level` of `argminmax` is set to `3`). +- A new function `pl$polars_info()` will tell which features enabled (#271, #285, #305). +- `select()` now accepts lists of expressions. For example, `$select(l_expr)` + works with `l_expr = list(pl$col("a"))` (#265). +- LazyFrame gets some new S3 methods: `[`, `dim()`, `dimnames()`, `length()`, `names()` (#301) +- `$glimpse()` is a fast `str()`-like view of a `DataFrame` (#277). +- `$over()` now accepts a vector of column names (#287). +- New method `$describe()` (#268). +- Cross joining is now possible with `how = "cross"` in `$join()` (#310). +- Add license info of all rust crates to `LICENSE.note` (#309). +- With CRAN 0.7.0 release candidate (#308). + - New author accredited, SHIMA Tatsuya (@eitsupi). + - DESCRIPTION revised. ## Polars R Package 0.6.1 ### What's changed -- use `pl$set_polars_options(debug_polars = TRUE)` to profile/debug method-calls of a polars query (#193) -- add `$melt(), $pivot() + $melt()` methods (#232) -- lazy functions translated: `pl$implode`, `pl$explode`, `pl$unique`, `pl$approx_unique`, `pl$head`, `pl$tail` (#196) -- `pl$list` is deprecated, use `pl$implode` instead. (#196) -- Docs improvements. (#210, #213) -- Update nix flake. (#227) +- use `pl$set_polars_options(debug_polars = TRUE)` to profile/debug method-calls of a polars query (#193) +- add `$melt(), $pivot() + $melt()` methods (#232) +- lazy functions translated: `pl$implode`, `pl$explode`, `pl$unique`, `pl$approx_unique`, `pl$head`, `pl$tail` (#196) +- `pl$list` is deprecated, use `pl$implode` instead. (#196) +- Docs improvements. (#210, #213) +- Update nix flake. (#227) ## Polars R Package 0.6.0 ### BREAKING CHANGES -- Bump rust-polars from 2023-02-17 unreleased version to 2023-04-20 unreleased version. 
(#183) - - `top_k`'s `reverse` option is removed. Use the new `bottom_k` method instead. - - The name of the `fmt` argument of some methods (e.g. `parse_date`) has been changed to `format`. +- Bump rust-polars from 2023-02-17 unreleased version to 2023-04-20 unreleased version. (#183) + - `top_k`'s `reverse` option is removed. Use the new `bottom_k` method instead. + - The name of the `fmt` argument of some methods (e.g. `parse_date`) has been changed to `format`. ### What's changed -- `DataFrame` objects can be subsetted using brackets like standard R data frames: `pl$DataFrame(mtcars)[2:4, c("mpg", "hp")]` (#140 @vincentarelbundock) -- An experimental `knit_print()` method has been added to DataFrame that outputs HTML tables - (similar to py-polars' HTML output) (#125 @eitsupi) -- `Series` gains new methods: `$mean`, `$median`, `$std`, `$var` (#170 @vincentarelbundock) -- A new option `use_earliest` of `replace_time_zone`. (#183) -- A new option `strict` of `parse_int`. (#183) -- Perform joins on nearest keys with method `join_asof`. (#172) +- `DataFrame` objects can be subsetted using brackets like standard R data frames: `pl$DataFrame(mtcars)[2:4, c("mpg", "hp")]` (#140 @vincentarelbundock) +- An experimental `knit_print()` method has been added to DataFrame that outputs HTML tables + (similar to py-polars' HTML output) (#125 @eitsupi) +- `Series` gains new methods: `$mean`, `$median`, `$std`, `$var` (#170 @vincentarelbundock) +- A new option `use_earliest` of `replace_time_zone`. (#183) +- A new option `strict` of `parse_int`. (#183) +- Perform joins on nearest keys with method `join_asof`. (#172) ## Polars R Package v0.5.0 ### BREAKING CHANGE -- The package name was changed from `rpolars` to `polars`. (#84) +- The package name was changed from `rpolars` to `polars`. (#84) ### What's changed -- Several new methods for DataFrame, LazyFrame & GroupBy translated (#103, #105 @vincentarelbundock) -- Doc fixes (#102, #109 @etiennebacher) -- Experimental opt-in auto completion (#96 @sorhawell) -- Base R functions work on DataFrame and LazyFrame objects via S3 methods: as.data.frame, as.matrix, dim, head, length, max, mean, median, min, na.omit, names, sum, tail, unique, ncol, nrow (#107 @vincentarelbundock). +- Several new methods for DataFrame, LazyFrame & GroupBy translated (#103, #105 @vincentarelbundock) +- Doc fixes (#102, #109 @etiennebacher) +- Experimental opt-in auto completion (#96 @sorhawell) +- Base R functions work on DataFrame and LazyFrame objects via S3 methods: as.data.frame, as.matrix, dim, head, length, max, mean, median, min, na.omit, names, sum, tail, unique, ncol, nrow (#107 @vincentarelbundock). ### New Contributors -- @etiennebacher made their first contribution in #102 -- @vincentarelbundock made their first contribution in #103 +- @etiennebacher made their first contribution in #102 +- @vincentarelbundock made their first contribution in #103 Release date: 2023-04-16. Full changelog: [v0.4.6...v0.5.0](https://github.com/pola-rs/r-polars/compare/v0.4.7...v0.5.0) - -## rpolars v0.4.7 - -### What's changed - -- Revamped docs that includes a new introductory vignette (#81 @grantmcdermott) -- Misc documentation improvements - -## rpolars v0.4.6 - -Release date: 2023-03-13. Full changelog: [v0.4.5...v0.4.6](https://github.com/pola-rs/r-polars/compare/v0.4.5...v0.4.6) - -### What's new - -- Almost all Expr translated, only missing 'binary'-expr now. 
#52 #53 -- Run polars queries in detached background threads, no need for any parallel libraries or cluster config #56 #59 -- Full support for when-then-otherwise-syntax #65 -- **rpolars** now uses bit64 integer64 vectors as input/output for i64 vectors: #68 #69 -- use `pl$from_arrow` to zero-copy(almost) import `Table`/`Array` from **r-arrow**. #67 -- Support inter process connections with `scan_ipc` -- Implement `scan_ipc` by @Sicheng-Pan in #63 -- 'Backend' improvements - - (prepare support for aarch64-linux) Touch libgcc_eh.a by @yutannihilation in #49 - - Use py-polars rust file structure (to help devs) by @sorhawell in #55 - - Refactor Makefiles by @eitsupi in #58 - - Build **rpolars** from Nix by @Sicheng-Pan in #54 - - `extendr_api` 0.4 by @sorhawell in #6 - - Add r-universe URL by @jeroen in #71 - - chore: install **nanoarrow** from cran by @eitsupi in #72 - - chore: install **nanoarrow** from cran (#72) by @sorhawell in #73 - - Fix pdf latex errors by @sorhawell in #74 - - re-enable devel test, **pak** R-devel issue went away by @sorhawell in #75 - - DO NOT MERGE: tracking hello_r_universe branch by @eitsupi in #38 - - revert to nightly by @sorhawell in #78 - -### New Contributors - -- @Sicheng-Pan made their first contribution in #54 -- @jeroen made their first contribution in #71 - -## rpolars v0.4.5 - -Release date: 2023-02-21. Full Changelog: [v0.4.3...v0.4.5](https://github.com/pola-rs/r-polars/compare/v0.4.3...v0.4.5) - -### What's Changed - -- bump rust polars to latest rust-polars and fix all errors by @sorhawell in #42 -- Customize **extendr** to better support cross Rust-R/R-Rust error handling - - bump extendr_api by @sorhawell in #44 - - Str even more by @sorhawell in #47 -- **rpolars** is now available for install from [rpolars.r-universe.dev](https://rpolars.r-universe.dev/polars#install) @eitsupi - - advertise R-universe by @sorhawell in #39 - - Includes reasonably easy pre-compiled installation for arm64-MacBooks -- All string Expressions available - - Expr str strptime by @sorhawell in #40 - - rust_result tests + fixes by @sorhawell in #41 - - Str continued by @sorhawell in #43 - - Str even more by @sorhawell in #47 -- Starting to roll out new error-handling and type-conversions between R and rust. - - - Precise source of error should be very clear even in a long method-chain e.g. - - ```r - pl$lit("hey-you-there")$str$splitn("-",-3)$alias("struct_of_words")$to_r() - > Error: in str$splitn the arg [n] the value -3 cannot be less than zero - when calling : - pl$lit("hey-you-there")$str$splitn("-", -3) - ``` - -- Misc - - Clippy + tiny optimization by @sorhawell in #45 - - Tidying by @sorhawell in #37 - -## rpolars v0.4.3 - -Release date: 2023-02-01. Full Changelog: [v0.4.2...v0.4.3](https://github.com/pola-rs/r-polars/compare/v0.4.2...v0.4.3) - -### What's Changed - -- All DateTime expresssions implemented + update rust-polars to latest commit. - - Arr str by @sorhawell in #32 - - Datetime continued by @sorhawell in #33 - - Datatime remaining tests + tidy util functions by @sorhawell in #36 - -### Developer changes - -- Refactoring GitHub Actions workflows by @eitsupi in #24 -- Fix cache and check scan by @sorhawell in #30 - -## rpolars v0.4.2 - -Release date: 2023-01-17. 
Full Changelog: [V0.4.1...v0.4.2](https://github.com/pola-rs/r-polars/compare/V0.4.1...v0.4.2) - -### What's Changed - -- fix minor Series syntax issue #8 @sorhawell in #22 -- nanoarrow followup: docs + adjust test by @sorhawell in #21 -- Add R CMD check workflow by @eitsupi in #23 -- `usethis::use_mit_license()` by @yutannihilation in #27 -- Fix check errors by @sorhawell in #26 - -### New Contributors - -- @eitsupi made their first contribution in #23 -- @yutannihilation made their first contribution in #27 - -## rpolars v0.4.1 - -Release date: 2023-01-12. Full Changelog: [v0.4.0...V0.4.1](https://github.com/pola-rs/r-polars/compare/v0.4.0...V0.4.1) - -### What's Changed - -- Export ArrowArrayStream from polars data frame by @paleolimbot in #5 -- Minor arithmetics syntax improvement @sorhawell in #20 - -### Dev env - -- Renv is deactivated as default. Renv.lock still defines package stack on build server @sorhawell in #19 - -### Minor stuff - -- Improve docs by @sorhawell in #16 -- Update rust polars to +26.1 by @sorhawell in #18 - -### New Contributors - -- @paleolimbot made their first contribution in #5 - -## rpolars v0.4.0 - -Release date: 2023-01-11. Full Changelog: [v0.3.1...v0.4.0](https://github.com/pola-rs/r-polars/compare/V0.3.1...v0.4.0) - -### Breaking changes - -- Class label "DataType" is now called "RPolarsDataType". Syntax wise 'DataType' can still be used, e.g. `.pr$DataType$` -- try fix name space collision with arrow by @sorhawell in #15 - -### New features - -- all list Expr$arr$list functions have been translated: -- Expr list 2.0 by @sorhawell in #10 -- Expr list 3.0 by @sorhawell in #12 - -### Dev environment - -- update rextendr by @sorhawell in #13 - -## rpolars v0.3.1 - -Release date: 2023-01-07. Full Changelog: [v0.3.0...v0.3.1](https://github.com/pola-rs/r-polars/compare/v0.3.0...V0.3.1) - -### What's Changed - -- drop github action upload pre-release of PR's by @sorhawell in #7 -- Fix readme typo by @erjanmx in #6 -- Expr arr list functions + rework r_to_series by @sorhawell in #2 - -### New Contributors - -- @erjanmx made their first contribution in #6 - -## rpolars v0.3.0 - -Release date: 2022-12-31. Full Changelog: [v0.2.1...v0.3.0](https://github.com/pola-rs/r-polars/compare/v0.2.1...v0.3.0) - -### What's Changed - -- use jemalloc(linux) else mimallac as py-polars by @sorhawell in #1 -- Bump rust polars 26.1 by @sorhawell in #3 -- Expr_interpolate now has two methods, linear, nearest -- Expr_quantile also takes quantile value as an expression -- map_alias improved error handling - -## rpolars v0.2.1 - -Release date: 2022-12-27 - -- **rpolars** is now hosted at . Happy to be here. diff --git a/R/as_polars.R b/R/as_polars.R index a07b5f5df..7c5b9ca1c 100644 --- a/R/as_polars.R +++ b/R/as_polars.R @@ -27,10 +27,10 @@ #' # Convert an arrow Table, with renaming and casting all columns #' as_polars_df( #' at, -#' schema = list(a = pl$Int64, b = pl$String) +#' schema = list(b = pl$Int64, a = pl$String) #' ) #' -#' # Convert an arrow Table, with renaming and casting some columns +#' # Convert an arrow Table, with casting some columns #' as_polars_df( #' at, #' schema_overrides = list(y = pl$String) # cast some columns @@ -202,8 +202,8 @@ as_polars_df.RPolarsLazyGroupBy = function(x, ...) { #' @param rechunk A logical flag (default `TRUE`). #' Make sure that all data of each column is in contiguous memory. #' @param schema named list of DataTypes, or character vector of column names. -#' Should be the same length as the number of columns of `x`. 
-#' If schema names or types do not match `x`, the columns will be renamed/recast. +#' Should match the number of columns in `x` and correspond to each column in `x` by position. +#' If a column in `x` does not match the name or type at the same position, it will be renamed/recast. #' If `NULL` (default), convert columns as is. #' @param schema_overrides named list of DataTypes. Cast some columns to the DataType. #' @export @@ -224,40 +224,43 @@ as_polars_df.ArrowTabular = function( #' @rdname as_polars_df #' @export -as_polars_df.nanoarrow_array_stream = function(x, ...) { - on.exit(x$release()) +as_polars_df.nanoarrow_array = function(x, ...) { + array_type = nanoarrow::infer_nanoarrow_schema(x) |> + nanoarrow::nanoarrow_schema_parse() |> + (\(x) x$type)() - if (!inherits(nanoarrow::infer_nanoarrow_ptype(x$get_schema()), "data.frame")) { - stop("Can't convert non-struct array stream to RPolarsDataFrame") + if (array_type != "struct") { + Err_plain("Can't convert non-struct array to RPolarsDataFrame") |> + unwrap("in as_polars_df():") } - list_of_struct_arrays = nanoarrow::collect_array_stream(x, validate = FALSE) - if (length(list_of_struct_arrays)) { - data_cols = list() + series = as_polars_series.nanoarrow_array(x, name = NULL) - struct_array = list_of_struct_arrays[[1L]] - list_of_arrays = struct_array$children - col_names = names(list_of_arrays) + if (length(series)) { + series$to_frame()$unnest("") + } else { + # TODO: support 0-length array + pl$DataFrame() + } +} - for (i in seq_along(list_of_arrays)) { - data_cols[[col_names[i]]] = as_polars_series.nanoarrow_array(list_of_arrays[[i]]) - } - for (struct_array in list_of_struct_arrays[-1L]) { - list_of_arrays = struct_array$children - col_names = names(list_of_arrays) - for (i in seq_along(list_of_arrays)) { - .pr$Series$append_mut(data_cols[[col_names[i]]], as_polars_series.nanoarrow_array(list_of_arrays[[i]])) |> - unwrap("in as_polars_df():") - } - } +#' @rdname as_polars_df +#' @export +as_polars_df.nanoarrow_array_stream = function(x, ...) { + if (!inherits(nanoarrow::infer_nanoarrow_ptype(x$get_schema()), "data.frame")) { + Err_plain("Can't convert non-struct array stream to RPolarsDataFrame") |> + unwrap("in as_polars_df():") + } - out = do.call(pl$select, data_cols) + series = as_polars_series.nanoarrow_array_stream(x, name = NULL) + + if (length(series)) { + series$to_frame()$unnest("") } else { - out = pl$DataFrame() # TODO: support creating 0-row DataFrame + # TODO: support 0-length array stream + pl$DataFrame() } - - out } @@ -408,9 +411,11 @@ as_polars_series.nanoarrow_array_stream = function(x, name = NULL, ...) { out = pl$Series(NULL, name = name) } else { out = as_polars_series.nanoarrow_array(list_of_arrays[[1L]], name = name) - for (array in list_of_arrays[-1L]) { - .pr$Series$append_mut(out, as_polars_series.nanoarrow_array(array)) - } + lapply( + list_of_arrays[-1L], + \(array) .pr$Series$append_mut(out, as_polars_series.nanoarrow_array(array)) + ) |> + invisible() } out @@ -478,7 +483,7 @@ as_polars_series.clock_time_point = function(x, name = NULL, ...) 
{ pl$col("diff_1")$cast(pl$Int64) )$mul( pl$lit(n_multiply_to_ms)$cast(pl$UInt32) - )$cast(pl$Datetime(tu = target_precision)) + )$cast(pl$Datetime(target_precision)) )$get_column("out")$alias(name %||% "") } diff --git a/R/construction.R b/R/construction.R index 74554fd1a..3b8f968ef 100644 --- a/R/construction.R +++ b/R/construction.R @@ -223,20 +223,10 @@ df_to_rpldf = function(x, ..., schema = NULL, schema_overrides = NULL) { unwrap() } - data_cols = list() + out = lapply(x, as_polars_series) |> + pl$select() - for (i in seq_len(n_cols)) { - column = as_polars_series(x[[i]]) - col_name = col_names[i] - - data_cols[[col_name]] = column - } - - if (length(data_cols)) { - out = do.call(pl$select, data_cols) - } else { - out = pl$DataFrame() - } + out$columns = col_names cast_these_fields = mapply( new_schema, diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index 494104ac0..5f576e2c0 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -60,6 +60,46 @@ #' #' `$width` returns the number of columns in the DataFrame. #' +#' @section Conversion to R data types considerations: +#' When converting Polars objects, such as [DataFrames][DataFrame_class] +#' to R objects, for example via the [`as.data.frame()`][as.data.frame.RPolarsDataFrame] generic function, +#' each type in the Polars object is converted to an R type. +#' In some cases, an error may occur because the conversion is not appropriate. +#' In particular, there is a high possibility of an error when converting +#' a [Datetime][DataType_Datetime] type without a time zone. +#' A [Datetime][DataType_Datetime] type without a time zone in Polars is converted +#' to the [POSIXct] type in R, which takes into account the time zone in which +#' the R session is running (which can be checked with the [Sys.timezone()] +#' function). In this case, if ambiguous times are included, a conversion error +#' will occur. In such cases, change the session time zone using +#' [`Sys.setenv(TZ = "UTC")`][base::Sys.setenv] and then perform the conversion, or use the +#' [`$dt$replace_time_zone()`][ExprDT_replace_time_zone] method on the Datetime type column to +#' explicitly specify the time zone before conversion. +#' +#' ```{r} +#' # Due to daylight savings, clocks were turned forward 1 hour on Sunday, March 8, 2020, 2:00:00 am +#' # so this particular date-time doesn't exist +#' non_existent_time = pl$Series("2020-03-08 02:00:00")$str$strptime(pl$Datetime(), "%F %T") +#' +#' withr::with_envvar( +#' new = c(TZ = "America/New_York"), +#' { +#' tryCatch( +#' # This causes an error due to the time zone (the `TZ` env var is affected). +#' as.vector(non_existent_time), +#' error = function(e) e +#' ) +#' } +#' ) +#' +#' withr::with_envvar( +#' new = c(TZ = "America/New_York"), +#' { +#' # This is safe. +#' as.vector(non_existent_time$dt$replace_time_zone("UTC")) +#' } +#' ) +#' ``` #' @details Check out the source code in #' [R/dataframe_frame.R](https://github.com/pola-rs/r-polars/blob/main/R/dataframe__frame.R) #' to see how public methods are derived from private methods. 
Check out @@ -231,14 +271,20 @@ pl_DataFrame = function(..., make_names_unique = TRUE, schema = NULL) { result() |> uw() - if (!is.null(schema) && !all(names(schema) %in% names(largs))) { + if (length(largs) > 0 && !is.null(schema) && !all(names(schema) %in% names(largs))) { Err_plain("Some columns in `schema` are not in the DataFrame.") |> uw() } - # no args crete empty DataFrame + # no args create empty DataFrame if (length(largs) == 0L) { - return(.pr$DataFrame$default()) + if (!is.null(schema)) { + out = lapply(schema, \(dtype) pl$Series(NULL)$cast(dtype)) |> + pl$select() + } else { + out = .pr$DataFrame$default() + } + return(out) } # pass through if already a DataFrame @@ -838,6 +884,8 @@ DataFrame_filter = function(...) { #' @details Within each group, the order of the rows is always preserved, #' regardless of the `maintain_order` argument. #' @return [GroupBy][GroupBy_class] (a DataFrame with special groupby methods like `$agg()`) +#' @seealso +#' - [`$partition_by()`][DataFrame_partition_by] #' @examples #' df = pl$DataFrame( #' a = c("a", "b", "a", "b", "c"), @@ -885,6 +933,7 @@ DataFrame_group_by = function(..., maintain_order = polars_options()$maintain_or #' * `"string"` converts Int64 values to character. #' #' @return An R data.frame +#' @inheritSection DataFrame_class Conversion to R data types considerations #' @keywords DataFrame #' @examples #' df = pl$DataFrame(iris[1:3, ]) @@ -917,6 +966,7 @@ DataFrame_to_data_frame = function(..., int64_conversion = polars_options()$int6 #' structure is not very typical or efficient in R. #' #' @return R list of vectors +#' @inheritSection DataFrame_class Conversion to R data types considerations #' @keywords DataFrame #' @examples #' pl$DataFrame(iris)$to_list() @@ -1967,8 +2017,7 @@ DataFrame_rolling = function(index_column, period, offset = NULL, closed = "righ #' time = pl$date_range( #' start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), #' end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), -#' interval = "30m", -#' eager = TRUE, +#' interval = "30m" #' ), #' n = 0:6 #' ) @@ -2046,3 +2095,108 @@ DataFrame_group_by_dynamic = function( by, start_by, check_sorted ) } + + +#' Split a DataFrame into multiple DataFrames +#' +#' Similar to [`$group_by()`][DataFrame_group_by]. +#' Group by the given columns and return the groups as separate [DataFrames][DataFrame_class]. +#' It is useful to use this in combination with functions like [lapply()] or `purrr::map()`. +#' @param ... Characters of column names to group by. Passed to [`pl$col()`][pl_col]. +#' @param maintain_order If `TRUE`, ensure that the order of the groups is consistent with the input data. +#' This is slower than a default partition by operation. +#' @param include_key If `TRUE`, include the columns used to partition the DataFrame in the output. +#' @param as_nested_list This affects the format of the output. +#' If `FALSE` (default), the output is a flat [list] of [DataFrames][DataFrame_class]. +#' IF `TRUE` and one of the `maintain_order` or `include_key` argument is `TRUE`, +#' then each element of the output has two children: `key` and `data`. +#' See the examples for more details. +#' @return A list of [DataFrames][DataFrame_class]. See the examples for details. 
+#' @seealso +#' - [`$group_by()`][DataFrame_group_by] +#' @examples +#' df = pl$DataFrame( +#' a = c("a", "b", "a", "b", "c"), +#' b = c(1, 2, 1, 3, 3), +#' c = c(5, 4, 3, 2, 1) +#' ) +#' df +#' +#' # Pass a single column name to partition by that column. +#' df$partition_by("a") +#' +#' # Partition by multiple columns. +#' df$partition_by("a", "b") +#' +#' # Partition by column data type +#' df$partition_by(pl$String) +#' +#' # If `as_nested_list = TRUE`, the output is a list whose elements have a `key` and a `data` field. +#' # The `key` is a named list of the key values, and the `data` is the DataFrame. +#' df$partition_by("a", "b", as_nested_list = TRUE) +#' +#' # `as_nested_list = TRUE` should be used with `maintain_order = TRUE` or `include_key = TRUE`. +#' tryCatch( +#' df$partition_by("a", "b", maintain_order = FALSE, include_key = FALSE, as_nested_list = TRUE), +#' warning = function(w) w +#' ) +#' +#' # Example of using with lapply(), and printing the key and the data summary +#' df$partition_by("a", "b", maintain_order = FALSE, as_nested_list = TRUE) |> +#' lapply(\(x) { +#' sprintf("\nThe key value of `a` is %s and the key value of `b` is %s\n", x$key$a, x$key$b) |> +#' cat() +#' x$data$drop(names(x$key))$describe() |> +#' print() +#' invisible(NULL) +#' }) |> +#' invisible() +DataFrame_partition_by = function( + ..., + maintain_order = TRUE, + include_key = TRUE, + as_nested_list = FALSE) { + uw = \(res) unwrap(res, "in $partition_by():") + + by = result(dots_to_colnames(self, ...)) |> + uw() + + if (!length(by)) { + Err_plain("There is no column to partition by.") |> + uw() + } + + partitions = .pr$DataFrame$partition_by(self, by, maintain_order, include_key) |> + uw() + + if (isTRUE(as_nested_list)) { + if (include_key) { + out = lapply(seq_along(partitions), \(index) { + data = partitions[[index]] + key = data$select(by)$head(1)$to_list() + + list(key = key, data = data) + }) + + return(out) + } else if (maintain_order) { + key_df = self$select(by)$unique(maintain_order = TRUE) + out = lapply(seq_along(partitions), \(index) { + data = partitions[[index]] + key = key_df$slice(index - 1, 1)$to_list() + + list(key = key, data = data) + }) + + return(out) + } else { + warning( + "cannot use `$partition_by` with ", + "`maintain_order = FALSE, include_key = FALSE, as_nested_list = TRUE`. ", + "Fall back to a flat list." + ) + } + } + + partitions +} diff --git a/R/datatype.R b/R/datatype.R index 5d6f2ce6e..5059ffb55 100644 --- a/R/datatype.R +++ b/R/datatype.R @@ -41,6 +41,7 @@ wrap_proto_schema = function(x) { #' @title DataTypes (RPolarsDataType) #' #' @name pl_dtypes +#' @aliases RPolarsDataType #' @description `DataType` any polars type (ported so far) #' @return not applicable #' @examples @@ -160,26 +161,38 @@ DataType_constructors = function() { ) } -# TODO: change the argument name from `tz` to `time_zone` -#' Create Datetime DataType -#' @description Datetime DataType constructor -#' @param tu string option either "ms", "us" or "ns" -#' @param tz string the Time Zone, see details -#' @details all allowed TimeZone designations can be found in `base::OlsonNames()` -#' @keywords pl + +#' Data type representing a calendar date and time of day. +#' +#' The underlying representation of this type is a 64-bit signed integer. +#' The integer indicates the number of time units since the Unix epoch (1970-01-01 00:00:00). +#' The number can be negative to indicate datetimes before the epoch. +#' @aliases pl_Datetime +#' @param time_unit Unit of time. 
One of `"ms"`, `"us"` (default) or `"ns"`. +#' @param time_zone Time zone string, as defined in [OlsonNames()]. +#' Setting `time_zone = "*"` will match any time zone, which can be useful to +#' select all Datetime columns containing a time zone. #' @return Datetime DataType #' @examples #' pl$Datetime("ns", "Pacific/Samoa") -DataType_Datetime = function(tu = "us", tz = NULL) { - if (!is.null(tz) && !isTRUE(tz %in% base::OlsonNames())) { +#' +#' df = pl$DataFrame( +#' naive_time = as.POSIXct("1900-01-01"), +#' zoned_time = as.POSIXct("1900-01-01", "UTC") +#' ) +#' df +#' +#' df$select(pl$col(pl$Datetime("us", "*"))) +DataType_Datetime = function(time_unit = "us", time_zone = NULL) { + if (!is.null(time_zone) && !isTRUE(time_zone %in% c(base::OlsonNames(), "*"))) { sprintf( "The time zone '%s' is not supported in polars. See `base::OlsonNames()` for supported time zones.", - tz + time_zone ) |> Err_plain() |> unwrap("in $Datetime():") } - unwrap(.pr$DataType$new_datetime(tu, tz)) + unwrap(.pr$DataType$new_datetime(time_unit, time_zone)) } #' Create Struct DataType diff --git a/R/dotdotdot.R b/R/dotdotdot.R index 3fb0e41b2..cbf281fd8 100644 --- a/R/dotdotdot.R +++ b/R/dotdotdot.R @@ -45,9 +45,9 @@ unpack_list = function(..., .context = NULL, .call = sys.call(1L), skip_classes l = list2(..., .context = .context, .call = .call) if ( length(l) == 1L && - is.list(l[[1L]]) && - !(!is.null(skip_classes) && inherits(l[[1L]], skip_classes)) && - is.null(names(l)) + is.list(l[[1L]]) && + !(!is.null(skip_classes) && inherits(l[[1L]], skip_classes)) && + is.null(names(l)) ) { l[[1L]] } else { @@ -79,3 +79,13 @@ unpack_bool_expr_result = function(...) { } }) } + + +#' Convert dots to a character vector of column names +#' @param .df [RPolarsDataFrame] +#' @param ... 
Arguments to pass to [`pl$col()`][pl_col] +#' @noRd +dots_to_colnames = function(.df, ..., .call = sys.call(1L)) { + result(pl$DataFrame(schema = .df$schema)$select(pl$col(...))$columns) |> + unwrap(call = .call) +} diff --git a/R/expr__datetime.R b/R/expr__datetime.R index 4b5466c75..dcdef9771 100644 --- a/R/expr__datetime.R +++ b/R/expr__datetime.R @@ -25,7 +25,7 @@ #' @examples #' t1 = as.POSIXct("3040-01-01", tz = "GMT") #' t2 = t1 + as.difftime(25, units = "secs") -#' s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms", eager = TRUE) +#' s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms") #' #' # use a dt namespace function #' df = pl$DataFrame(datetime = s)$with_columns( @@ -73,7 +73,7 @@ ExprDT_truncate = function(every, offset = NULL) { #' @examples #' t1 = as.POSIXct("3040-01-01", tz = "GMT") #' t2 = t1 + as.difftime(25, units = "secs") -#' s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms", eager = TRUE) +#' s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms") #' #' # use a dt namespace function #' df = pl$DataFrame(datetime = s)$with_columns( @@ -122,7 +122,7 @@ ExprDT_round = function(every, offset = NULL) { #' #' # if needed to convert back to R it is more intuitive to set a specific time zone #' expr = pl$lit(as.Date("2021-01-01"))$dt$combine(3600 * 1.5E6 + 123, tu = "us") -#' expr$cast(pl$Datetime(tu = "us", tz = "GMT"))$to_r() +#' expr$cast(pl$Datetime("us", "GMT"))$to_r() ExprDT_combine = function(tm, tu = "us") { if (inherits(tm, "PTime")) tu = "ns" # PTime implicitly gets converted to "ns" if (!is_string(tu)) stop("combine: input tu is not a string, [%s ]", str_string(tu)) @@ -166,8 +166,7 @@ ExprDT_strftime = function(format) { #' as.Date("2020-12-25"), #' as.Date("2021-1-05"), #' interval = "1d", -#' time_zone = "GMT", -#' eager = TRUE +#' time_zone = "GMT" #' ) #' ) #' df$with_columns( @@ -196,8 +195,7 @@ ExprDT_year = function() { #' as.Date("2020-12-25"), #' as.Date("2021-1-05"), #' interval = "1d", -#' time_zone = "GMT", -#' eager = TRUE +#' time_zone = "GMT" #' ) #' ) #' df$with_columns( @@ -224,8 +222,7 @@ ExprDT_iso_year = function() { #' as.Date("2020-12-25"), #' as.Date("2021-1-05"), #' interval = "1d", -#' time_zone = "GMT", -#' eager = TRUE +#' time_zone = "GMT" #' ) #' ) #' df$with_columns( @@ -251,8 +248,7 @@ ExprDT_quarter = function() { #' as.Date("2020-12-25"), #' as.Date("2021-1-05"), #' interval = "1d", -#' time_zone = "GMT", -#' eager = TRUE +#' time_zone = "GMT" #' ) #' ) #' df$with_columns( @@ -279,8 +275,7 @@ ExprDT_month = function() { #' as.Date("2020-12-25"), #' as.Date("2021-1-05"), #' interval = "1d", -#' time_zone = "GMT", -#' eager = TRUE +#' time_zone = "GMT" #' ) #' ) #' df$with_columns( @@ -305,8 +300,7 @@ ExprDT_week = function() { #' as.Date("2020-12-25"), #' as.Date("2021-1-05"), #' interval = "1d", -#' time_zone = "GMT", -#' eager = TRUE +#' time_zone = "GMT" #' ) #' ) #' df$with_columns( @@ -333,8 +327,7 @@ ExprDT_weekday = function() { #' as.Date("2020-12-25"), #' as.Date("2021-1-05"), #' interval = "1d", -#' time_zone = "GMT", -#' eager = TRUE +#' time_zone = "GMT" #' ) #' ) #' df$with_columns( @@ -360,8 +353,7 @@ ExprDT_day = function() { #' as.Date("2020-12-25"), #' as.Date("2021-1-05"), #' interval = "1d", -#' time_zone = "GMT", -#' eager = TRUE +#' time_zone = "GMT" #' ) #' ) #' df$with_columns( @@ -387,8 +379,7 @@ ExprDT_ordinal_day = function() { #' as.Date("2020-12-25"), #' as.Date("2021-1-05"), #' interval = "1d2h", -#' time_zone = "GMT", -#' eager = TRUE +#' time_zone = "GMT" #' ) #' ) #' 
df$with_columns( @@ -413,8 +404,7 @@ ExprDT_hour = function() { #' as.Date("2020-12-25"), #' as.Date("2021-1-05"), #' interval = "1d5s", -#' time_zone = "GMT", -#' eager = TRUE +#' time_zone = "GMT" #' ) #' ) #' df$with_columns( @@ -443,7 +433,6 @@ ExprDT_minute = function() { #' as.numeric(as.POSIXct("2001-1-1 00:00:6")) * 1E6, #' interval = "2s654321us", #' time_unit = "us", # instruct polars input is us, and store as us -#' eager = TRUE #' ))$with_columns( #' pl$col("date")$dt$second()$alias("second"), #' pl$col("date")$dt$second(fractional = TRUE)$alias("second_frac") @@ -471,7 +460,6 @@ ExprDT_second = function(fractional = FALSE) { #' as.numeric(as.POSIXct("2001-1-1 00:00:6")) * 1E6, #' interval = "2s654321us", #' time_unit = "us", # instruct polars input is us, and store as us -#' eager = TRUE #' ))$with_columns( #' pl$col("date")$cast(pl$Int64)$alias("datetime int64"), #' pl$col("date")$dt$millisecond()$alias("millisecond") @@ -494,8 +482,7 @@ ExprDT_millisecond = function() { #' as.numeric(as.POSIXct("2001-1-1")) * 1E6 + 456789, # manually convert to us #' as.numeric(as.POSIXct("2001-1-1 00:00:6")) * 1E6, #' interval = "2s654321us", -#' time_unit = "us", # instruct polars input is us, and store as us -#' eager = TRUE +#' time_unit = "us" # instruct polars input is us, and store as us #' ) #' )$with_columns( #' pl$col("date")$cast(pl$Int64)$alias("datetime int64"), @@ -523,8 +510,7 @@ ExprDT_microsecond = function() { #' as.numeric(as.POSIXct("2001-1-1")) * 1E9 + 123456789, # manually convert to us #' as.numeric(as.POSIXct("2001-1-1 00:00:6")) * 1E9, #' interval = "1s987654321ns", -#' time_unit = "ns", # instruct polars input is us, and store as us -#' eager = TRUE +#' time_unit = "ns" # instruct polars input is us, and store as us #' ))$with_columns( #' pl$col("date")$cast(pl$Int64)$alias("datetime int64"), #' pl$col("date")$dt$nanosecond()$alias("nanosecond") @@ -546,10 +532,10 @@ ExprDT_nanosecond = function() { #' @keywords ExprDT #' @aliases (Expr)$dt$epoch #' @examples -#' pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$epoch("ns")$to_series() -#' pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$epoch("ms")$to_series() -#' pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$epoch("s")$to_series() -#' pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$epoch("d")$to_series() +#' pl$date_range(as.Date("2022-1-1"))$dt$epoch("ns")$to_series() +#' pl$date_range(as.Date("2022-1-1"))$dt$epoch("ms")$to_series() +#' pl$date_range(as.Date("2022-1-1"))$dt$epoch("s")$to_series() +#' pl$date_range(as.Date("2022-1-1"))$dt$epoch("d")$to_series() ExprDT_epoch = function(tu = c("us", "ns", "ms", "s", "d")) { tu = tu[1] @@ -582,8 +568,7 @@ ExprDT_epoch = function(tu = c("us", "ns", "ms", "s", "d")) { #' date = pl$date_range( #' start = as.Date("2001-1-1"), #' end = as.Date("2001-1-3"), -#' interval = "1d1s", -#' eager = TRUE +#' interval = "1d1s" #' ) #' ) #' df$select( @@ -612,8 +597,7 @@ ExprDT_timestamp = function(tu = c("ns", "us", "ms")) { #' date = pl$date_range( #' start = as.Date("2001-1-1"), #' end = as.Date("2001-1-3"), -#' interval = "1d1s", -#' eager = TRUE +#' interval = "1d1s" #' ) #' ) #' df$select( @@ -643,8 +627,7 @@ ExprDT_with_time_unit = function(tu = c("ns", "us", "ms")) { #' date = pl$date_range( #' start = as.Date("2001-1-1"), #' end = as.Date("2001-1-3"), -#' interval = "1d1s", -#' eager = TRUE +#' interval = "1d1s" #' ) #' ) #' df$select( @@ -669,20 +652,52 @@ ExprDT_cast_time_unit = function(tu = c("ns", "us", "ms")) { #' @aliases 
(Expr)$dt$convert_time_zone #' @examples #' df = pl$DataFrame( -#' date = pl$date_range( -#' start = as.Date("2001-3-1"), -#' end = as.Date("2001-5-1"), -#' interval = "1mo12m34s", -#' eager = TRUE -#' ) +#' london_timezone = pl$date_range( +#' as.POSIXct("2020-03-01", tz = "UTC"), +#' as.POSIXct("2020-07-01", tz = "UTC"), +#' "1mo", +#' time_zone = "UTC" +#' )$dt$convert_time_zone("Europe/London") #' ) +#' #' df$select( -#' pl$col("date"), -#' pl$col("date") -#' $dt$replace_time_zone("Europe/Amsterdam") -#' $dt$convert_time_zone("Europe/London") -#' $alias("London_with") +#' "london_timezone", +#' London_to_Amsterdam = pl$col( +#' "london_timezone" +#' )$dt$replace_time_zone("Europe/Amsterdam") +#' ) +#' +#' # You can use `ambiguous` to deal with ambiguous datetimes: +#' dates = c( +#' "2018-10-28 01:30", +#' "2018-10-28 02:00", +#' "2018-10-28 02:30", +#' "2018-10-28 02:00" +#' ) +#' +#' df = pl$DataFrame( +#' ts = pl$Series(dates)$str$strptime(pl$Datetime("us"), "%F %H:%M"), +#' ambiguous = c("earliest", "earliest", "latest", "latest") #' ) +#' +#' df$with_columns( +#' ts_localized = pl$col("ts")$dt$replace_time_zone( +#' "Europe/Brussels", +#' ambiguous = pl$col("ambiguous") +#' ) +#' ) +#' +#' # Polars Datetime type without a time zone will be converted to R +#' # with respect to the session time zone. If ambiguous times are present +#' # an error will be raised. It is recommended to add a time zone before +#' # converting to R. +#' s_without_tz = pl$Series(dates)$str$strptime(pl$Datetime("us"), "%F %H:%M") +#' s_without_tz +#' +#' s_with_tz = s_without_tz$dt$replace_time_zone("UTC") +#' s_with_tz +#' +#' as.vector(s_with_tz) ExprDT_convert_time_zone = function(tz) { check_tz_to_result(tz) |> map(\(valid_tz) .pr$Expr$dt_convert_time_zone(self, valid_tz)) |> @@ -743,8 +758,7 @@ ExprDT_replace_time_zone = function(tz, ambiguous = "raise") { #' date = pl$date_range( #' start = as.Date("2020-3-1"), #' end = as.Date("2020-5-1"), -#' interval = "1mo", -#' eager = TRUE +#' interval = "1mo" #' ) #' ) #' df$select( @@ -765,8 +779,7 @@ ExprDT_total_days = function() { #' date = pl$date_range( #' start = as.Date("2020-1-1"), #' end = as.Date("2020-1-4"), -#' interval = "1d", -#' eager = TRUE +#' interval = "1d" #' ) #' ) #' df$select( @@ -787,8 +800,7 @@ ExprDT_total_hours = function() { #' date = pl$date_range( #' start = as.Date("2020-1-1"), #' end = as.Date("2020-1-4"), -#' interval = "1d", -#' eager = TRUE +#' interval = "1d" #' ) #' ) #' df$select( @@ -808,8 +820,7 @@ ExprDT_total_minutes = function() { #' df = pl$DataFrame(date = pl$date_range( #' start = as.POSIXct("2020-1-1", tz = "GMT"), #' end = as.POSIXct("2020-1-1 00:04:00", tz = "GMT"), -#' interval = "1m", -#' eager = TRUE +#' interval = "1m" #' )) #' df$select( #' pl$col("date"), @@ -828,8 +839,7 @@ ExprDT_total_seconds = function() { #' df = pl$DataFrame(date = pl$date_range( #' start = as.POSIXct("2020-1-1", tz = "GMT"), #' end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), -#' interval = "1ms", -#' eager = TRUE +#' interval = "1ms" #' )) #' df$select( #' pl$col("date"), @@ -848,8 +858,7 @@ ExprDT_total_milliseconds = function() { #' df = pl$DataFrame(date = pl$date_range( #' start = as.POSIXct("2020-1-1", tz = "GMT"), #' end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), -#' interval = "1ms", -#' eager = TRUE +#' interval = "1ms" #' )) #' df$select( #' pl$col("date"), @@ -868,8 +877,7 @@ ExprDT_total_microseconds = function() { #' df = pl$DataFrame(date = pl$date_range( #' start = as.POSIXct("2020-1-1", tz = "GMT"), #' end = 
as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), -#' interval = "1ms", -#' eager = TRUE +#' interval = "1ms" #' )) #' df$select( #' pl$col("date"), @@ -915,8 +923,7 @@ ExprDT_total_nanoseconds = function() { #' dates = pl$date_range( #' as.Date("2000-1-1"), #' as.Date("2005-1-1"), -#' "1y", -#' eager = TRUE +#' "1y" #' ) #' ) #' df$select( @@ -954,8 +961,7 @@ ExprDT_offset_by = function(by) { #' df = pl$DataFrame(dates = pl$date_range( #' as.Date("2000-1-1"), #' as.Date("2000-1-2"), -#' "1h", -#' eager = TRUE +#' "1h" #' )) #' #' df$with_columns(times = pl$col("dates")$dt$time()) diff --git a/R/expr__expr.R b/R/expr__expr.R index f7101ed56..605e06cab 100644 --- a/R/expr__expr.R +++ b/R/expr__expr.R @@ -203,184 +203,166 @@ wrap_elist_result = function(elist, str_to_lit = TRUE) { #' Add two expressions #' -#' The RHS can either be an Expr or an object that can be converted to a literal -#' (e.g an integer). -#' -#' @param other Literal or object that can be converted to a literal -#' @return Expr +#' Method equivalent of addition operator `expr + other`. +#' @param other numeric or string value; accepts expression input. +#' @return [Expr][Expr_class] +#' @seealso +#' - [Arithmetic operators][S3_arithmetic] #' @examples -#' pl$lit(5) + 10 -#' pl$lit(5) + pl$lit(10) -#' pl$lit(5)$add(pl$lit(10)) -#' +pl$lit(5) # unary use resolves to same as pl$lit(5) +#' df = pl$DataFrame(x = 1:5) +#' +#' df$with_columns( +#' `x+int` = pl$col("x")$add(2L), +#' `x+expr` = pl$col("x")$add(pl$col("x")$cum_prod()) +#' ) +#' +#' df = pl$DataFrame( +#' x = c("a", "d", "g"), +#' y = c("b", "e", "h"), +#' z = c("c", "f", "i") +#' ) +#' +#' df$with_columns( +#' pl$col("x")$add(pl$col("y"))$add(pl$col("z"))$alias("xyz") +#' ) Expr_add = function(other) { .pr$Expr$add(self, other) |> unwrap("in $add()") } -# TODO: move S3 methods to other documents -#' @export -#' @rdname Expr_add -#' @param e1 Expr only -#' @param e2 Expr or anything that can be converted to a literal -`+.RPolarsExpr` = function(e1, e2) { - if (missing(e2)) { - return(e1) - } - result(wrap_e(e1)$add(e2)) |> unwrap("using the '+'-operator") -} - -#' @export -`+.RPolarsThen` = `+.RPolarsExpr` - -#' @export -`+.RPolarsChainedThen` = `+.RPolarsExpr` - #' Divide two expressions #' -#' @inherit Expr_add description params return +#' Method equivalent of float division operator `expr / other`. #' +#' Zero-division behaviour follows IEEE-754: +#' - `0/0`: Invalid operation - mathematically undefined, returns `NaN`. +#' - `n/0`: On finite operands gives an exact infinite result, e.g.: ±infinity. +#' @inherit Expr_add return +#' @param other Numeric literal or expression value. +#' @seealso +#' - [Arithmetic operators][S3_arithmetic] +#' - [`$floor_div()`][Expr_floor_div] #' @examples -#' pl$lit(5) / 10 -#' pl$lit(5) / pl$lit(10) -#' pl$lit(5)$div(pl$lit(10)) +#' df = pl$DataFrame( +#' x = -2:2, +#' y = c(0.5, 0, 0, -4, -0.5) +#' ) +#' +#' df$with_columns( +#' `x/2` = pl$col("x")$div(2), +#' `x/y` = pl$col("x")$div(pl$col("y")) +#' ) Expr_div = function(other) { .pr$Expr$div(self, other) |> unwrap("in $div()") } -#' @export -#' @rdname Expr_div -#' @inheritParams Expr_add -`/.RPolarsExpr` = function(e1, e2) result(wrap_e(e1)$div(e2)) |> unwrap("using the '/'-operator") - -#' @export -`/.RPolarsThen` = `/.RPolarsExpr` - -#' @export -`/.RPolarsChainedThen` = `/.RPolarsExpr` - #' Floor divide two expressions #' -#' @inherit Expr_add description params return -#' +#' Method equivalent of floor division operator `expr %/% other`. 
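Since the reworked `$div()` documentation above spells out the IEEE-754 zero-division rules, a small hedged sketch of what they mean in practice; the column name is illustrative and the expected values follow from the rules as stated, not from running the patch.

```r
# 0/0 is mathematically undefined -> NaN; a finite non-zero numerator over 0 -> +/-Inf.
pl$DataFrame(x = c(0, 1, -1))$with_columns(
  x_div_0 = pl$col("x")$div(0)
)
# expected values in `x_div_0`: NaN, Inf, -Inf
```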
+#' @inherit Expr_div params return +#' @seealso +#' - [Arithmetic operators][S3_arithmetic] +#' - [`$div()`][Expr_div] +#' - [`$mod()`][Expr_mod] #' @examples -#' pl$lit(5) %/% 10 -#' pl$lit(5) %/% pl$lit(10) -#' pl$lit(5)$floor_div(pl$lit(10)) +#' df = pl$DataFrame(x = 1:5) +#' +#' df$with_columns( +#' `x/2` = pl$col("x")$div(2), +#' `x%/%2` = pl$col("x")$floor_div(2) +#' ) Expr_floor_div = function(other) { .pr$Expr$floor_div(self, other) |> unwrap("in $floor_div()") } -#' @export -#' @rdname Expr_floor_div -#' @inheritParams Expr_add -`%/%.RPolarsExpr` = function(e1, e2) result(wrap_e(e1)$floor_div(e2)) |> unwrap("using the '%/%'-operator") - -#' @export -`%/%.RPolarsThen` = `%/%.RPolarsExpr` - -#' @export -`%/%.RPolarsChainedThen` = `%/%.RPolarsExpr` - #' Modulo two expressions #' -#' @inherit Expr_add description params return -#' -#' @details Currently, the modulo operator behaves differently than in R, -#' and not guaranteed `x == (x %% y) + y * (x %/% y)`. +#' Method equivalent of modulus operator `expr %% other`. +#' @inherit Expr_div params return +#' @seealso +#' - [Arithmetic operators][S3_arithmetic] +#' - [`$floor_div()`][Expr_floor_div] #' @examples -#' pl$select(pl$lit(-1:12) %% 3)$to_series()$to_vector() -#' -#' # The example is **NOT** equivalent to the followings: -#' -1:12 %% 3 -#' pl$select(-1:12 %% 3)$to_series()$to_vector() +#' df = pl$DataFrame(x = -5L:5L) #' -#' # Not guaranteed `x == (x %% y) + y * (x %/% y)` -#' x = pl$lit(-1:12) -#' y = pl$lit(3) -#' pl$select(x == (x %% y) + y * (x %/% y)) +#' df$with_columns( +#' `x%%2` = pl$col("x")$mod(2) +#' ) Expr_mod = function(other) { .pr$Expr$rem(self, other) |> unwrap("in $mod()") } -#' @export -#' @rdname Expr_mod -#' @inheritParams Expr_add -`%%.RPolarsExpr` = function(e1, e2) result(wrap_e(e1)$mod(e2)) |> unwrap("using the '%%'-operator") - -#' @export -`%%.RPolarsThen` = `%%.RPolarsExpr` - -#' @export -`%%.RPolarsChainedThen` = `%%.RPolarsExpr` - #' Substract two expressions #' -#' @inherit Expr_add description params return -#' +#' Method equivalent of subtraction operator `expr - other`. +#' @inherit Expr_div params return +#' @seealso +#' - [Arithmetic operators][S3_arithmetic] #' @examples -#' pl$lit(5) - 10 -#' pl$lit(5) - pl$lit(10) -#' pl$lit(5)$sub(pl$lit(10)) -#' -pl$lit(5) +#' df = pl$DataFrame(x = 0:4) +#' +#' df$with_columns( +#' `x-2` = pl$col("x")$sub(2), +#' `x-expr` = pl$col("x")$sub(pl$col("x")$cum_sum()) +#' ) Expr_sub = function(other) { .pr$Expr$sub(self, other) |> unwrap("in $sub()") } -#' @export -#' @rdname Expr_sub -#' @inheritParams Expr_add -`-.RPolarsExpr` = function(e1, e2) { - result( - if (missing(e2)) wrap_e(0L)$sub(e1) else wrap_e(e1)$sub(e2) - ) |> unwrap("using the '-'-operator") -} - -#' @export -`-.RPolarsThen` = `-.RPolarsExpr` - -#' @export -`-.RPolarsChainedThen` = `-.RPolarsExpr` - #' Multiply two expressions #' -#' @inherit Expr_add description params return -#' +#' Method equivalent of multiplication operator `expr * other`. 
+#' @inherit Expr_div params return +#' @seealso +#' - [Arithmetic operators][S3_arithmetic] #' @examples -#' pl$lit(5) * 10 -#' pl$lit(5) * pl$lit(10) -#' pl$lit(5)$mul(pl$lit(10)) +#' df = pl$DataFrame(x = c(1, 2, 4, 8, 16)) +#' +#' df$with_columns( +#' `x*2` = pl$col("x")$mul(2), +#' `x * xlog2` = pl$col("x")$mul(pl$col("x")$log(2)) +#' ) Expr_mul = Expr_mul = function(other) { .pr$Expr$mul(self, other) |> unwrap("in $mul()") } -#' @export -#' @rdname Expr_mul -#' @inheritParams Expr_add -`*.RPolarsExpr` = function(e1, e2) result(wrap_e(e1)$mul(e2)) |> unwrap("using the '*'-operator") -#' @export -`*.RPolarsThen` = `*.RPolarsExpr` - -#' @export -`*.RPolarsChainedThen` = `*.RPolarsExpr` +#' Exponentiation two expressions +#' +#' Method equivalent of exponentiation operator `expr ^ exponent`. +#' +#' @param exponent Numeric literal or expression value. +#' @inherit Expr_div return +#' @seealso +#' - [Arithmetic operators][S3_arithmetic] +#' @examples +#' df = pl$DataFrame(x = c(1, 2, 4, 8)) +#' +#' df$with_columns( +#' cube = pl$col("x")$pow(3), +#' `x^xlog2` = pl$col("x")$pow(pl$col("x")$log(2)) +#' ) +Expr_pow = function(exponent) { + .pr$Expr$pow(self, exponent) |> unwrap("in $pow()") +} #' Negate a boolean expression #' -#' @inherit Expr_add description return +#' Method equivalent of negation operator `!expr`. +#' @inherit Expr_add return #' @examples #' # two syntaxes same result #' pl$lit(TRUE)$not() #' !pl$lit(TRUE) Expr_not = use_extendr_wrapper + +# TODO: move to another file and create `S3_logic.Rd` #' @export -#' @rdname Expr_not -#' @param x Expr `!.RPolarsExpr` = function(x) x$not() #' @export @@ -402,8 +384,6 @@ Expr_lt = function(other) { .pr$Expr$lt(self, other) |> unwrap("in $lt()") } #' @export -#' @inheritParams Expr_add -#' @rdname Expr_lt `<.RPolarsExpr` = function(e1, e2) result(wrap_e(e1)$lt(e2)) |> unwrap("using the '<'-operator") #' @export @@ -426,8 +406,6 @@ Expr_gt = function(other) { } #' @export -#' @inheritParams Expr_add -#' @rdname Expr_gt `>.RPolarsExpr` = function(e1, e2) result(wrap_e(e1)$gt(e2)) |> unwrap("using the '>'-operator") #' @export @@ -451,8 +429,6 @@ Expr_eq = function(other) { } #' @export -#' @inheritParams Expr_add -#' @rdname Expr_eq `==.RPolarsExpr` = function(e1, e2) result(wrap_e(e1)$eq(e2)) |> unwrap("using the '=='-operator") #' @export @@ -492,8 +468,6 @@ Expr_neq = function(other) { } #' @export -#' @inheritParams Expr_add -#' @rdname Expr_neq `!=.RPolarsExpr` = function(e1, e2) result(wrap_e(e1)$neq(e2)) |> unwrap("using the '!='-operator") #' @export @@ -532,8 +506,6 @@ Expr_lt_eq = function(other) { } #' @export -#' @inheritParams Expr_add -#' @rdname Expr_lt_eq `<=.RPolarsExpr` = function(e1, e2) result(wrap_e(e1)$lt_eq(e2)) |> unwrap("using the '<='-operator") #' @export @@ -556,8 +528,6 @@ Expr_gt_eq = function(other) { } #' @export -#' @inheritParams Expr_add -#' @rdname Expr_gt_eq `>=.RPolarsExpr` = function(e1, e2) result(wrap_e(e1)$gt_eq(e2)) |> unwrap("using the '>='-operator") #' @export @@ -2090,32 +2060,6 @@ Expr_limit = function(n = 10) { } - -#' Exponentiation -#' -#' Raise expression to the power of exponent. -#' -#' @param exponent Exponent value. 
-#' @return Expr -#' @examples -#' # use via `pow`-method and the `^`-operator -#' pl$DataFrame(a = -1:3, b = 2:6)$with_columns( -#' x = pl$col("a")$pow(2), -#' y = pl$col("a")^3 -#' ) -Expr_pow = function(exponent) { - .pr$Expr$pow(self, exponent) |> unwrap("in $pow()") -} - -#' @export -`^.RPolarsExpr` = function(e1, e2) result(wrap_e(e1)$pow(e2)) |> unwrap("using '^'-operator") - -#' @export -`^.RPolarsThen` = `^.RPolarsExpr` - -#' @export -`^.RPolarsChainedThen` = `^.RPolarsExpr` - #' Check whether a value is in a vector #' #' Notice that to check whether a factor value is in a vector of strings, you @@ -2840,7 +2784,7 @@ Expr_sinh = use_extendr_wrapper #' #' @return Expr #' @examples -#' pl$DataFrame(a = c(-1, acosh(0.5), 0, 1, NA_real_))$ +#' pl$DataFrame(a = c(-1, acosh(2), 0, 1, NA_real_))$ #' with_columns(cosh = pl$col("a")$cosh()) Expr_cosh = use_extendr_wrapper @@ -3410,7 +3354,7 @@ Expr_peak_max = function() { #' #' df = pl$DataFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$ #' with_columns( -#' pl$col("dt")$str$strptime(pl$Datetime(tu = "us"), format = "%Y-%m-%d %H:%M:%S")$set_sorted() +#' pl$col("dt")$str$strptime(pl$Datetime("us"), format = "%Y-%m-%d %H:%M:%S")$set_sorted() #' ) #' #' df$with_columns( diff --git a/R/expr__string.R b/R/expr__string.R index 27d8e5aba..e8eb7ca77 100644 --- a/R/expr__string.R +++ b/R/expr__string.R @@ -4,32 +4,53 @@ # expr_str_make_sub_ns = macro_new_subnamespace("^ExprStr_", "RPolarsExprStrNameSpace") +# TODO for 0.16.0: rename arguments, should not allow positional arguments except for the first two #' Convert a String column into a Date/Datetime/Time column. #' +#' Similar to the [strptime()] function. #' +#' When parsing a Datetime the column precision will be inferred from the format +#' string, if given, e.g.: `"%F %T%.3f"` => [`pl$Datetime("ms")`][pl_Datetime]. +#' If no fractional second component is found then the default is `"us"` (microsecond). #' @param datatype The data type to convert into. Can be either Date, Datetime, #' or Time. -#' @param format Format to use for conversion. See `?strptime` for possible -#' values. Example: "%Y-%m-%d %H:%M:%S". If `NULL` (default), the format is -#' inferred from the data. Notice that time zone `%Z` is not supported and will -#' just ignore timezones. Numeric time zones like `%z` or `%:z` are supported. +#' @param format Format to use for conversion. Refer to +#' [the chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) +#' for the full specification. Example: `"%Y-%m-%d %H:%M:%S"`. +#' If `NULL` (default), the format is inferred from the data. +#' Notice that time zone `%Z` is not supported and will just ignore timezones. +#' Numeric time zones like `%z` or `%:z` are supported. #' @param strict If `TRUE` (default), raise an error if a single string cannot -#' be parsed. Otherwise, produce a polars `null`. -#' @param exact If `TRUE` (default), require an exact format match. Otherwise, +#' be parsed. If `FALSE`, produce a polars `null`. +#' @param exact If `TRUE` (default), require an exact format match. If `FALSE`, #' allow the format to match anywhere in the target string. +#' Conversion to the Time type is always exact. +#' Note that using `exact = FALSE` introduces a performance penalty - +#' cleaning your data beforehand will almost certainly be more performant. #' @param cache Use a cache of unique, converted dates to apply the datetime #' conversion. 
#' @param ambiguous Determine how to deal with ambiguous datetimes: #' * `"raise"` (default): raise #' * `"earliest"`: use the earliest datetime #' * `"latest"`: use the latest datetime -#' @details -#' When parsing a Datetime the column precision will be inferred from the format -#' string, if given, eg: “%F %T%.3f" => Datetime("ms"). If no fractional second -#' component is found then the default is "us" (microsecond). -#' @keywords ExprStr -#' @return Expr of a Date, Datetime or Time Series +#' @return [Expr][Expr_class] of Date, Datetime or Time type +#' @seealso +#' - [`$str$to_date()`][ExprStr_to_date] +#' - [`$str$to_datetime()`][ExprStr_to_datetime] +#' - [`$str$to_time()`][ExprStr_to_time] #' @examples +#' # Dealing with a consistent format +#' s = pl$Series(c("2020-01-01 01:00Z", "2020-01-01 02:00Z")) +#' +#' s$str$strptime(pl$Datetime(), "%Y-%m-%d %H:%M%#z") +#' +#' # Auto infer format +#' s$str$strptime(pl$Datetime()) +#' +#' # Datetime with timezone is interpreted as UTC timezone +#' pl$Series("2020-01-01T01:00:00+09:00")$str$strptime(pl$Datetime()) +#' +#' # Dealing with different formats. #' s = pl$Series( #' c( #' "2021-04-22", @@ -39,38 +60,41 @@ #' ), #' "date" #' ) -#' #' #join multiple passes with different format -#' s$to_frame()$with_columns( -#' pl$col("date") -#' $str$strptime(pl$Date, "%F", strict = FALSE) -#' $fill_null(pl$col("date")$str$strptime(pl$Date, "%F %T", strict = FALSE)) -#' $fill_null(pl$col("date")$str$strptime(pl$Date, "%D", strict = FALSE)) -#' $fill_null(pl$col("date")$str$strptime(pl$Date, "%c", strict = FALSE)) +#' +#' s$to_frame()$select( +#' pl$coalesce( +#' pl$col("date")$str$strptime(pl$Date, "%F", strict = FALSE), +#' pl$col("date")$str$strptime(pl$Date, "%F %T", strict = FALSE), +#' pl$col("date")$str$strptime(pl$Date, "%D", strict = FALSE), +#' pl$col("date")$str$strptime(pl$Date, "%c", strict = FALSE) +#' ) #' ) #' -#' txt_datetimes = c( -#' "2023-01-01 11:22:33 -0100", -#' "2023-01-01 11:22:33 +0300", -#' "invalid time" +#' # Ignore invalid time +#' s = pl$Series( +#' c( +#' "2023-01-01 11:22:33 -0100", +#' "2023-01-01 11:22:33 +0300", +#' "invalid time" +#' ) #' ) #' -#' pl$lit(txt_datetimes)$str$strptime( +#' s$str$strptime( #' pl$Datetime("ns"), -#' format = "%Y-%m-%d %H:%M:%S %z", strict = FALSE, -#' )$to_series() +#' format = "%Y-%m-%d %H:%M:%S %z", +#' strict = FALSE, +#' ) ExprStr_strptime = function( datatype, - format, + format = NULL, strict = TRUE, exact = TRUE, cache = TRUE, ambiguous = "raise") { pcase( - # not a datatype !is_polars_dtype(datatype), Err_plain("arg datatype is not an RPolarsDataType"), - # Datetime pl$same_outer_dt(datatype, pl$Datetime()), { @@ -81,93 +105,78 @@ ExprStr_strptime = function( \(expr) .pr$Expr$dt_cast_time_unit(expr, datetime_type$tu) # cast if not an err ) }, - # Date datatype == pl$Date, .pr$Expr$str_to_date(self, format, strict, exact, cache), - # Time datatype == pl$Time, .pr$Expr$str_to_time(self, format, strict, cache), - # Other or_else = Err_plain("datatype should be of type {Date, Datetime, Time}") ) |> unwrap("in str$strptime():") } +# TODO for 0.16.0: should not allow positional arguments except for the first one #' Convert a String column into a Date column #' -#' @param format Format to use for conversion. See `?strptime` for possible -#' values. Example: "%Y-%m-%d". If `NULL` (default), the format is -#' inferred from the data. Notice that time zone `%Z` is not supported and will -#' just ignore timezones. Numeric time zones like `%z` or `%:z` are supported. 
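To make the precision-inference rule described in the `$str$strptime()` docs above concrete, here is a hedged sketch; the inferred time units are the documented behaviour (`"%F %T%.3f"` => `"ms"`, otherwise `"us"`), not results verified against this patch.

```r
# A fractional-second directive in the format string selects a matching time unit...
pl$Series("2024-01-01 12:00:00.123")$str$strptime(pl$Datetime(), "%F %T%.3f")$dtype
# ...while a format without one falls back to the "us" default.
pl$Series("2024-01-01 12:00:00")$str$strptime(pl$Datetime(), "%F %T")$dtype
```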
-#' @param strict If `TRUE` (default), raise an error if a single string cannot -#' be parsed. If `FALSE`, parsing failure will produce a polars `null`. -#' @param exact If `TRUE` (default), require an exact format match. Otherwise, -#' allow the format to match anywhere in the target string. -#' @param cache Use a cache of unique, converted dates to apply the datetime -#' conversion. -#' -#' @return Expr -#' -#' +#' @inheritParams ExprStr_strptime +#' @format Format to use for conversion. Refer to +#' [the chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) +#' for the full specification. Example: `"%Y-%m-%d"`. +#' If `NULL` (default), the format is inferred from the data. +#' @return [Expr][Expr_class] of Date type +#' @seealso +#' - [`$str$strptime()`][ExprStr_strptime] #' @examples -#' pl$DataFrame(str_date = c("2009-01-02", "2009-01-03", "2009-1-4", "2009 05 01"))$ -#' with_columns(date = pl$col("str_date")$str$to_date(strict = FALSE)) +#' s = pl$Series(c("2020/01/01", "2020/02/01", "2020/03/01")) +#' +#' s$str$to_date() ExprStr_to_date = function(format = NULL, strict = TRUE, exact = TRUE, cache = TRUE) { .pr$Expr$str_to_date(self, format, strict, exact, cache) |> unwrap("in $str$to_date():") } +# TODO for 0.16.0: should not allow positional arguments except for the first one #' Convert a String column into a Time column #' -#' @param format Format to use for conversion. See `?strptime` for possible -#' values. Example: "%H:%M:%S". If `NULL` (default), the format is -#' inferred from the data. Notice that time zone `%Z` is not supported and will -#' just ignore timezones. Numeric time zones like `%z` or `%:z` are supported. -#' @param strict If `TRUE` (default), raise an error if a single string cannot -#' be parsed. If `FALSE`, parsing failure will produce a polars `null`. -#' @param cache Use a cache of unique, converted dates to apply the datetime -#' conversion. -#' -#' @return Expr -#' -#' +#' @inheritParams ExprStr_strptime +#' @format Format to use for conversion. Refer to +#' [the chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) +#' for the full specification. Example: `"%H:%M:%S"`. +#' If `NULL` (default), the format is inferred from the data. +#' @return [Expr][Expr_class] of Time type +#' @seealso +#' - [`$str$strptime()`][ExprStr_strptime] #' @examples -#' pl$DataFrame(str_time = c("01:20:01", "28:00:02", "03:00:02"))$ -#' with_columns(time = pl$col("str_time")$str$to_time(strict = FALSE)) +#' s = pl$Series(c("01:00", "02:00", "03:00")) +#' +#' s$str$to_time("%H:%M") ExprStr_to_time = function(format = NULL, strict = TRUE, cache = TRUE) { .pr$Expr$str_to_time(self, format, strict, cache) |> unwrap("in $str$to_time():") } +# TODO for 0.16.0: should not allow positional arguments except for the first one #' Convert a String column into a Datetime column #' -#' @param format Format to use for conversion. See `?strptime` for possible -#' values. Example: "%Y-%m-%d %H:%M:%S". If `NULL` (default), the format is -#' inferred from the data. Notice that time zone `%Z` is not supported and will -#' just ignore timezones. Numeric time zones like `%z` or `%:z` are supported. -#' @param time_unit String (`"ns"`, `"us"`, `"ms"`) or integer. -#' @param time_zone String describing a timezone. If `NULL` (default), `"GMT` is -#' used. -#' @param strict If `TRUE` (default), raise an error if a single string cannot -#' be parsed. If `FALSE`, parsing failure will produce a polars `null`. 
-#' @param exact If `TRUE` (default), require an exact format match. Otherwise, -#' allow the format to match anywhere in the target string. -#' @param cache Use a cache of unique, converted dates to apply the datetime -#' conversion. -#' @param ambiguous Determine how to deal with ambiguous datetimes: -#' * `"raise"` (default): raise -#' * `"earliest"`: use the earliest datetime -#' * `"latest"`: use the latest datetime -#' -#' @return Expr -#' -#' -#' @examples -#' pl$DataFrame(str_date = c("2009-01-02 01:00", "2009-01-03 02:00", "2009-1-4 3:00"))$ -#' with_columns(datetime = pl$col("str_date")$str$to_datetime(strict = FALSE)) +#' @inheritParams ExprStr_strptime +#' @param time_unit Unit of time for the resulting Datetime column. If `NULL` (default), +#' the time unit is inferred from the format string if given, +#' e.g.: `"%F %T%.3f"` => [`pl$Datetime("ms")`][pl_Datetime]. +#' If no fractional second component is found, the default is `"us"` (microsecond). +#' @param time_zone for the resulting [Datetime][pl_Datetime] column. +#' @param exact If `TRUE` (default), require an exact format match. If `FALSE`, allow the format to match +#' anywhere in the target string. Note that using `exact = FALSE` introduces a performance +#' penalty - cleaning your data beforehand will almost certainly be more performant. +#' @return [Expr][Expr_class] of [Datetime][pl_Datetime] type +#' @seealso +#' - [`$str$strptime()`][ExprStr_strptime] +#' @examples +#' s = pl$Series(c("2020-01-01 01:00Z", "2020-01-01 02:00Z")) +#' +#' s$str$to_datetime("%Y-%m-%d %H:%M%#z") +#' s$str$to_datetime(time_unit = "ms") ExprStr_to_datetime = function( format = NULL, time_unit = NULL, diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index 60e999f8d..d2e54ff8f 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -178,6 +178,8 @@ RPolarsDataFrame$to_struct <- function(name) .Call(wrap__RPolarsDataFrame__to_st RPolarsDataFrame$unnest <- function(names) .Call(wrap__RPolarsDataFrame__unnest, self, names) +RPolarsDataFrame$partition_by <- function(by, maintain_order, include_key) .Call(wrap__RPolarsDataFrame__partition_by, self, by, maintain_order, include_key) + RPolarsDataFrame$export_stream <- function(stream_ptr) invisible(.Call(wrap__RPolarsDataFrame__export_stream, self, stream_ptr)) RPolarsDataFrame$from_arrow_record_batches <- function(rbr) .Call(wrap__RPolarsDataFrame__from_arrow_record_batches, rbr) diff --git a/R/functions__eager.R b/R/functions__eager.R index c9a819c33..a33ac7360 100644 --- a/R/functions__eager.R +++ b/R/functions__eager.R @@ -163,13 +163,12 @@ pl_concat = function( #' New date range +#' #' @param start POSIXt or Date preferably with time_zone or double or integer #' @param end POSIXt or Date preferably with time_zone or double or integer. If #' `end` and `interval` are missing, then a single datetime is constructed. #' @param interval String, a Polars `duration` or R [difftime()]. Can be missing #' if `end` is missing also. -#' @param eager If `FALSE` (default), return an `Expr`. Otherwise, returns a -#' `Series`. #' @param closed One of `"both"` (default), `"left"`, `"none"` or `"right"`. #' @param time_unit String (`"ns"`, `"us"`, `"ms"`) or integer. #' @param time_zone String describing a timezone. If `NULL` (default), `"GMT` is @@ -188,6 +187,10 @@ pl_concat = function( #' In R/r-polars it is perfectly fine to mix timezones of params `time_zone`, #' `start` and `end`. 
#' +#' Compared to the Python implementation, `pl$date_range()` doesn't have the +#' argument `eager` and always returns an Expr. Use `$to_series()` to return a +#' Series. +#' #' @return A datetime #' #' @examples @@ -218,7 +221,6 @@ pl_date_range = function( start, end, interval, - eager = FALSE, closed = "both", time_unit = "us", time_zone = NULL, @@ -227,20 +229,10 @@ pl_date_range = function( end = start interval = "1h" } - - f_eager_eval = \(lit) { - if (isTRUE(eager)) { - result(lit$to_series()) - } else { - Ok(lit) - } - } - start = cast_naive_value_to_datetime_expr(start) end = cast_naive_value_to_datetime_expr(end) r_date_range_lazy(start, end, interval, closed, time_unit, time_zone, explode) |> - and_then(f_eager_eval) |> unwrap("in pl$date_range()") } diff --git a/R/functions__lazy.R b/R/functions__lazy.R index cd06d4c96..f43b451ea 100644 --- a/R/functions__lazy.R +++ b/R/functions__lazy.R @@ -572,27 +572,24 @@ pl_concat_list = function(exprs) { unwrap(" in pl$concat_list():") } -#' struct -#' @aliases struct -#' @description Collect several columns into a Series of dtype Struct. +#' Collect columns into a struct column +#' #' @param exprs Columns/Expressions to collect into a Struct. -#' @param eager Evaluate immediately. -#' @param schema Optional schema named list that explicitly defines the struct field dtypes. -#' Each name must match a column name wrapped in the struct. Can only be used to cast some or all -#' dtypes, not to change the names. NULL means to include keep columns into the struct by their -#' current DataType. If a column is not included in the schema it is removed from the final struct. -#' -#' @details pl$struct creates Expr or Series of DataType Struct() -#' pl$Struct creates the DataType Struct() -#' In polars a schema is a named list of DataTypes. #' A schema describes e.g. a DataFrame. -#' More formally schemas consist of Fields. -#' A Field is an object describing the name and DataType of a column/Series, but same same. -#' A struct is a DataFrame wrapped into a Series, the DataType is Struct, and each -#' sub-datatype within are Fields. -#' In a dynamic language schema and a Struct (the DataType) are quite the same, except -#' schemas describe DataFrame and Struct's describe some Series. -#' -#' @return Eager=FALSE: Expr of Series with dtype Struct | Eager=TRUE: Series with dtype Struct +#' @param schema Optional schema named list that explicitly defines the struct +#' field dtypes. Each name must match a column name wrapped in the struct. Can +#' only be used to cast some or all dtypes, not to change the names. If `NULL` +#' (default), columns datatype are not modified. Columns that do not exist are +#' silently ignored and not included in the final struct. +#' +#' @details +#' +#' `pl$struct()` creates an Expr of DataType [`Struct()`][DataType_Struct]. +#' +#' Compared to the Python implementation, `pl$struct()` doesn't have the +#' argument `eager` and always returns an Expr. Use `$to_series()` to return a +#' Series. 
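Both `pl$date_range()` and `pl$struct()` lose their `eager` argument in this patch and now always return an Expr; a hedged sketch of the replacement pattern the docs above recommend (the dates are illustrative).

```r
# Materialize the lazy result explicitly instead of passing `eager = TRUE`.
rng = pl$date_range(as.Date("2024-01-01"), as.Date("2024-01-03"), "1d")
rng              # an Expr
rng$to_series()  # a Series, i.e. the old `eager = TRUE` behaviour
```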
+#' +#' @return Expr with dtype Struct #' #' @examples #' # isolated expression to wrap all columns in a struct aliased 'my_struct' @@ -635,10 +632,7 @@ pl_concat_list = function(exprs) { #' #' df$select(e2) #' df$select(e2)$to_data_frame() -pl_struct = function( - exprs, # list of exprs, str or Series or Expr or Series, - eager = FALSE, - schema = NULL) { +pl_struct = function(exprs, schema = NULL) { # convert any non expr to expr and catch error in a result as_struct(wrap_elist_result(exprs, str_to_lit = FALSE)) |> and_then(\(struct_expr) { # if no errors continue struct_expr @@ -646,14 +640,7 @@ pl_struct = function( if (!is.null(schema)) { struct_expr = struct_expr$cast(pl$Struct(schema)) } - if (!is_scalar_bool(eager)) { - return(Err("arg [eager] is not a bool")) - } - if (eager) { - result(pl$select(struct_expr)$to_series()) - } else { - Ok(struct_expr) - } + Ok(struct_expr) }) |> unwrap( # raise any error with context "in pl$struct:" @@ -1057,6 +1044,6 @@ pl_from_epoch = function(column, time_unit = "s") { switch(time_unit, "d" = column$cast(pl$Date), "s" = (column$cast(pl$Int64) * 1000000L)$cast(pl$Datetime("us")), - column$cast(pl$Datetime(tu = time_unit)) + column$cast(pl$Datetime(time_unit)) ) } diff --git a/R/group_by_dynamic.R b/R/group_by_dynamic.R index b53e61120..bb9983a7e 100644 --- a/R/group_by_dynamic.R +++ b/R/group_by_dynamic.R @@ -8,8 +8,7 @@ #' time = pl$date_range( #' start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), #' end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), -#' interval = "30m", -#' eager = TRUE, +#' interval = "30m" #' ), #' n = 0:6 #' ) @@ -115,8 +114,7 @@ DynamicGroupBy_agg = function(...) { #' time = pl$date_range( #' start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), #' end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), -#' interval = "30m", -#' eager = TRUE, +#' interval = "30m" #' ), #' n = 0:6 #' ) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index e0b139507..27308a874 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -46,6 +46,7 @@ #' #' `$width` returns the number of columns in the LazyFrame. #' +#' @inheritSection DataFrame_class Conversion to R data types considerations #' @keywords LazyFrame #' @examples #' # see all exported methods @@ -1861,8 +1862,7 @@ LazyFrame_rolling = function( #' time = pl$date_range( #' start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), #' end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), -#' interval = "30m", -#' eager = TRUE, +#' interval = "30m" #' ), #' n = 0:6 #' ) diff --git a/R/s3-methods-operator.R b/R/s3-methods-operator.R new file mode 100644 index 000000000..3f026113e --- /dev/null +++ b/R/s3-methods-operator.R @@ -0,0 +1,206 @@ +#' Arithmetic operators for RPolars objects +#' +#' @name S3_arithmetic +#' @param x,y numeric type of RPolars objects or objects that can be coerced such. +#' Only `+` can take strings. +#' @return A Polars object the same type as the input. 
+#' @seealso +#' - [`$add()`][Expr_add] +#' - [`$sub()`][Expr_sub] +#' - [`$mul()`][Expr_mul] +#' - [`$div()`][Expr_div] +#' - [`$pow()`][Expr_pow] +#' - [`$mod()`][Expr_mod] +#' - [`$floor_div()`][Expr_floor_div] +#' - [`$add()`][Series_add] +#' - [`$sub()`][Series_sub] +#' - [`$mul()`][Series_mul] +#' - [`$div()`][Series_div] +#' - [`$pow()`][Series_pow] +#' - [`$mod()`][Series_mod] +#' - [`$floor_div()`][Series_floor_div] +#' @examples +#' pl$lit(5) + 10 +#' 5 + pl$lit(10) +#' pl$lit(5) + pl$lit(10) +#' +pl$lit(1) +#' +#' # This will not raise an error as it is not actually evaluated. +#' expr = pl$lit(5) + "10" +#' expr +#' +#' # Will raise an error as it is evaluated. +#' tryCatch( +#' expr$to_series(), +#' error = function(e) e +#' ) +#' +#' pl$Series(5) + 10 +#' +pl$Series(5) +#' -pl$Series(5) +NULL + + +#' @export +#' @rdname S3_arithmetic +`+.RPolarsExpr` = function(x, y) { + if (missing(y)) { + return(x) + } + result(wrap_e(x)$add(y)) |> + unwrap("using the `+`-operator") +} + +#' @export +`+.RPolarsThen` = `+.RPolarsExpr` + +#' @export +`+.RPolarsChainedThen` = `+.RPolarsExpr` + + +#' @export +#' @rdname S3_arithmetic +`-.RPolarsExpr` = function(x, y) { + result( + if (missing(y)) wrap_e(0L)$sub(x) else wrap_e(x)$sub(y) + ) |> unwrap("using the '-'-operator") +} + +#' @export +`-.RPolarsThen` = `-.RPolarsExpr` + +#' @export +`-.RPolarsChainedThen` = `-.RPolarsExpr` + + +#' @export +#' @rdname S3_arithmetic +`*.RPolarsExpr` = function(x, y) { + result(wrap_e(x)$mul(y)) |> + unwrap("using the `*`-operator") +} + +#' @export +`*.RPolarsThen` = `*.RPolarsExpr` + +#' @export +`*.RPolarsChainedThen` = `*.RPolarsExpr` + + +#' @export +#' @rdname S3_arithmetic +`/.RPolarsExpr` = function(x, y) { + result(wrap_e(x)$div(y)) |> + unwrap("using the `/`-operator") +} + +#' @export +`/.RPolarsThen` = `/.RPolarsExpr` + +#' @export +`/.RPolarsChainedThen` = `/.RPolarsExpr` + + +#' @export +#' @rdname S3_arithmetic +`^.RPolarsExpr` = function(x, y) { + result(wrap_e(x)$pow(y)) |> + unwrap("using `^`-operator") +} + +#' @export +`^.RPolarsThen` = `^.RPolarsExpr` + +#' @export +`^.RPolarsChainedThen` = `^.RPolarsExpr` + + +#' @export +#' @rdname S3_arithmetic +`%%.RPolarsExpr` = function(x, y) { + result(wrap_e(x)$mod(y)) |> + unwrap("using the `%%`-operator") +} + +#' @export +`%%.RPolarsThen` = `%%.RPolarsExpr` + +#' @export +`%%.RPolarsChainedThen` = `%%.RPolarsExpr` + + +#' @export +#' @rdname S3_arithmetic +`%/%.RPolarsExpr` = function(x, y) { + result(wrap_e(x)$floor_div(y)) |> + unwrap("using the `%/%`-operator") +} + +#' @export +`%/%.RPolarsThen` = `%/%.RPolarsExpr` + +#' @export +`%/%.RPolarsChainedThen` = `%/%.RPolarsExpr` + + +#' @export +#' @rdname S3_arithmetic +`+.RPolarsSeries` = function(x, y) { + if (missing(y)) { + return(x) + } + result(as_polars_series(x)$add(y)) |> + unwrap("using the `+`-operator") +} + + +#' @export +#' @rdname S3_arithmetic +`-.RPolarsSeries` = function(x, y) { + result(if (missing(y)) { + pl$Series(0L)$sub(as_polars_series(x)) + } else { + as_polars_series(x)$sub(y) + }) |> + unwrap("using the `-`-operator") +} + + +#' @export +#' @rdname S3_arithmetic +`*.RPolarsSeries` = function(x, y) { + result(as_polars_series(x)$mul(y)) |> + unwrap("using the `*`-operator") +} + + +#' @export +#' @rdname S3_arithmetic +`/.RPolarsSeries` = function(x, y) { + result(as_polars_series(x)$div(y)) |> + unwrap("using the `/`-operator") +} + + +#' @export +#' @rdname S3_arithmetic +`^.RPolarsSeries` = function(x, y) { + result(as_polars_series(x)$pow(y)) |> + unwrap("using 
the `^`-operator") +} + + +#' @export +#' @rdname S3_arithmetic +`%%.RPolarsSeries` = function(x, y) { + result(as_polars_series(x)$mod(y)) |> + unwrap("using the `%%`-operator") +} + + +#' @export +#' @rdname S3_arithmetic +`%/%.RPolarsSeries` = function(x, y) { + result(as_polars_series(x)$floor_div(y)) |> + unwrap("using the `%/%`-operator") +} diff --git a/R/s3_methods.R b/R/s3-methods.R similarity index 98% rename from R/s3_methods.R rename to R/s3-methods.R index 8a2b8dd04..409fea2fe 100644 --- a/R/s3_methods.R +++ b/R/s3-methods.R @@ -255,6 +255,7 @@ dimnames.RPolarsLazyFrame = function(x) list(NULL, names(x)) #' @param x An object to convert to a [data.frame]. #' @param ... Additional arguments passed to methods. #' @inheritParams DataFrame_to_data_frame +#' @inheritSection DataFrame_class Conversion to R data types considerations #' @seealso #' - [as_polars_df()] #' - [`$to_data_frame()`][DataFrame_to_data_frame] @@ -409,6 +410,7 @@ sum.RPolarsSeries = function(x, ...) x$sum() #' #' @param x A Polars Series #' @param mode Not used. +#' @inheritSection DataFrame_class Conversion to R data types considerations #' @export #' @rdname S3_as.vector as.vector.RPolarsSeries = function(x, mode) x$to_vector() @@ -481,7 +483,7 @@ c.RPolarsSeries = \(x, ...) { l = list2(...) x = x$clone() # clone to retain an immutable api, append_mut is not immutable for (i in seq_along(l)) { # append each element of i being either Series or Into - unwrap(.pr$Series$append_mut(x, wrap_s(l[[i]])), "in $c:") + unwrap(.pr$Series$append_mut(x, as_polars_series(l[[i]])), "in $c():") } x } diff --git a/R/series__series.R b/R/series__series.R index 3c14f4845..aa1d69855 100644 --- a/R/series__series.R +++ b/R/series__series.R @@ -88,6 +88,7 @@ #' #' `$struct` stores all struct related methods. #' +#' @inheritSection DataFrame_class Conversion to R data types considerations #' @keywords Series #' #' @examples @@ -234,30 +235,62 @@ Series_str = method_as_active_binding(\() series_make_sub_ns(self, expr_str_make Series_struct = method_as_active_binding(\() series_make_sub_ns(self, expr_struct_make_sub_ns)) -#' Wrap as Series -#' @noRd -#' @description input is either already a Series of will be passed to the Series constructor -#' @param x a Series or something-turned-into-Series -#' @return Series -wrap_s = function(x) { - if (inherits(x, "RPolarsSeries")) x else pl$Series(x) -} - +# TODO: change the arguments to be match to Python Polars before 0.16.0 #' Create new Series -#' @description found in api as pl$Series named Series_constructor internally #' +#' This function is a simple way to convert basic types of vectors provided by base R to +#' [the Series class object][Series_class]. +#' For converting more types properly, use the generic function [as_polars_series()]. #' @param x any vector -#' @param name string -#' @name pl_Series -#' @keywords Series_new -#' @return Series +#' @param name Name of the Series. If `NULL`, an empty string is used. +#' @param dtype One of [polars data type][pl_dtypes] or `NULL`. +#' If not `NULL`, that data type is used to [cast][Expr_cast] the Series created from the vector +#' to a specific data type internally. +#' @param ... Ignored. +#' @param nan_to_null If `TRUE`, `NaN` values contained in the Series are replaced to `null`. +#' Using the [`$fill_nan()`][Expr_fill_nan] method internally. 
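The new `dtype` and `nan_to_null` arguments documented above are easy to show side by side; a hedged sketch, with the expected output inferred from the described casting and `$fill_nan()` behaviour rather than taken from the patch.

```r
# Cast at construction time, and turn NaN into null (via $fill_nan() internally).
pl$Series(c(1, NaN, 3), "a", dtype = pl$Float32, nan_to_null = TRUE)
# expected: a Float32 Series with values 1.0, null, 3.0
```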
+#' @return [Series][Series_class] #' @aliases Series -#' +#' @seealso +#' - [as_polars_series()] #' @examples -#' pl$Series(1:4) -pl_Series = function(x, name = NULL) { - .pr$Series$new(x, name) |> - unwrap("in pl$Series()") +#' # Constructing a Series by specifying name and values positionally: +#' s = pl$Series(1:3, "a") +#' s +#' +#' # Notice that the dtype is automatically inferred as a polars Int32: +#' s$dtype +#' +#' # Constructing a Series with a specific dtype: +#' s2 = pl$Series(1:3, "a", dtype = pl$Float32) +#' s2 +pl_Series = function( + x, + name = NULL, + dtype = NULL, + ..., + nan_to_null = FALSE) { + uw = function(x) unwrap(x, "in pl$Series():") + + if (!is.null(dtype) && !isTRUE(is_polars_dtype(dtype))) { + Err_plain("The dtype argument is not a valid Polars data type and cannot be converted into one.") |> + uw() + } + + out = .pr$Series$new(x, name) |> + uw() + + if (!is.null(dtype)) { + out = result(out$cast(dtype)) |> + uw() + } + + if (isTRUE(nan_to_null)) { + out = result(out$fill_nan(NULL)) |> + uw() + } + + out } #' Print Series @@ -270,35 +303,34 @@ Series_print = function() { invisible(self) } -#' add Series -#' @name Series_add -#' @description Series arithmetics -#' @param other Series or into Series -#' @return Series -#' @aliases add -#' @keywords Series + +#' Add Series +#' +#' Method equivalent of addition operator `series + other`. +#' @param other [Series][Series_class] like object of numeric or string values. +#' Converted to [Series][Series_class] by [as_polars_series()] in this method. +#' @return [Series][Series_class] +#' @seealso +#' - [Arithmetic operators][S3_arithmetic] #' @examples -#' pl$Series(1:3)$add(11:13) #' pl$Series(1:3)$add(pl$Series(11:13)) +#' pl$Series(1:3)$add(11:13) #' pl$Series(1:3)$add(1L) -#' 1L + pl$Series(1:3) -#' pl$Series(1:3) + 1L +#' +#' pl$Series("a")$add("-z") Series_add = function(other) { - .pr$Series$add(self, wrap_s(other)) + .pr$Series$add(self, as_polars_series(other)) } -#' @export -#' @rdname Series_add -#' @param s1 lhs Series -#' @param s2 rhs Series or any into Series -"+.RPolarsSeries" = function(s1, s2) wrap_s(s1)$add(s2) -#' sub Series -#' @name Series_sub -#' @description Series arithmetics -#' @param other Series or into Series -#' @return Series -#' @aliases sub -#' @keywords Series + +#' Subtract Series +#' +#' Method equivalent of subtraction operator `series - other`. +#' @inherit Series_add return +#' @param other [Series][Series_class] like object of numeric. +#' Converted to [Series][Series_class] by [as_polars_series()] in this method. +#' @seealso +#' - [Arithmetic operators][S3_arithmetic] #' @examples #' pl$Series(1:3)$sub(11:13) #' pl$Series(1:3)$sub(pl$Series(11:13)) @@ -306,73 +338,87 @@ Series_add = function(other) { #' 1L - pl$Series(1:3) #' pl$Series(1:3) - 1L Series_sub = function(other) { - .pr$Series$sub(self, wrap_s(other)) + .pr$Series$sub(self, as_polars_series(other)) } -#' @export -#' @rdname Series_sub -#' @param s1 lhs Series -#' @param s2 rhs Series or any into Series -"-.RPolarsSeries" = function(s1, s2) wrap_s(s1)$sub(s2) -#' div Series -#' @name Series_div -#' @description Series arithmetics -#' @param other Series or into Series -#' @return Series -#' @aliases div -#' @keywords Series + +#' Divide Series +#' +#' Method equivalent of division operator `series / other`. 
+#' @inherit Series_sub params return +#' @seealso +#' - [Arithmetic operators][S3_arithmetic] #' @examples #' pl$Series(1:3)$div(11:13) #' pl$Series(1:3)$div(pl$Series(11:13)) #' pl$Series(1:3)$div(1L) -#' 2L / pl$Series(1:3) -#' pl$Series(1:3) / 2L Series_div = function(other) { - .pr$Series$div(self, wrap_s(other)) + .pr$Series$div(self, as_polars_series(other)) } -#' @export -#' @rdname Series_div -#' @param s1 lhs Series -#' @param s2 rhs Series or any into Series -"/.RPolarsSeries" = function(s1, s2) wrap_s(s1)$div(s2) -#' mul Series -#' @name Series_mul -#' @description Series arithmetics -#' @param other Series or into Series -#' @return Series -#' @aliases mul -#' @keywords Series + +#' Floor Divide Series +#' +#' Method equivalent of floor division operator `series %/% other`. +#' @inherit Series_sub params return +#' @seealso +#' - [Arithmetic operators][S3_arithmetic] +#' @examples +#' pl$Series(1:3)$floor_div(11:13) +#' pl$Series(1:3)$floor_div(pl$Series(11:13)) +#' pl$Series(1:3)$floor_div(1L) +Series_floor_div = function(other) { + self$to_frame()$select(pl$col(self$name)$floor_div(as_polars_series(other)))$to_series(0) +} + + +#' Multiply Series +#' +#' Method equivalent of multiplication operator `series * other`. +#' @inherit Series_sub params return +#' @seealso +#' - [Arithmetic operators][S3_arithmetic] #' @examples #' pl$Series(1:3)$mul(11:13) #' pl$Series(1:3)$mul(pl$Series(11:13)) #' pl$Series(1:3)$mul(1L) -#' 2L * pl$Series(1:3) -#' pl$Series(1:3) * 2L Series_mul = function(other) { - .pr$Series$mul(self, wrap_s(other)) + .pr$Series$mul(self, as_polars_series(other)) } -#' @export -#' @rdname Series_mul -#' @param s1 lhs Series -#' @param s2 rhs Series or any into Series -"*.RPolarsSeries" = function(s1, s2) wrap_s(s1)$mul(s2) -#' rem Series -#' @description Series arithmetics, remainder -#' @param other Series or into Series -#' @return Series -#' @keywords Series -#' @aliases rem -#' @name Series_rem + +#' Modulo Series +#' +#' Method equivalent of modulo operator `series %% other`. +#' @inherit Series_sub params return +#' @seealso +#' - [Arithmetic operators][S3_arithmetic] +#' @examples +#' pl$Series(1:4)$mod(2L) +#' pl$Series(1:3)$mod(pl$Series(11:13)) +#' pl$Series(1:3)$mod(1L) +Series_mod = function(other) { + .pr$Series$rem(self, as_polars_series(other)) +} + + +#' Power Series +#' +#' Method equivalent of power operator `series ^ other`. +#' @inherit Series_sub return +#' @param exponent [Series][Series_class] like object of numeric. +#' Converted to [Series][Series_class] by [as_polars_series()] in this method. 
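Rounding out the new Series arithmetic methods above, a hedged sketch of how they should line up with the operators registered in R/s3-methods-operator.R earlier in this patch; the stated equivalences are assumptions based on those S3 definitions.

```r
s = pl$Series(1:4, "a")

s$floor_div(3L)$to_vector()  # assumed equal to (s %/% 3L)$to_vector()
s$mod(3L)$to_vector()        # assumed equal to (s %% 3L)$to_vector()
s$pow(2L)$to_vector()        # assumed equal to (s ^ 2L)$to_vector()
```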
+#' @seealso +#' - [Arithmetic operators][S3_arithmetic] #' @examples -#' pl$Series(1:4)$rem(2L) -#' pl$Series(1:3)$rem(pl$Series(11:13)) -#' pl$Series(1:3)$rem(1L) -Series_rem = function(other) { - .pr$Series$rem(self, wrap_s(other)) +#' s = as_polars_series(1:4, name = "foo") +#' +#' s$pow(3L) +Series_pow = function(exponent) { + self$to_frame()$select(pl$col(self$name)$pow(as_polars_series(exponent)))$to_series(0) } + #' Compare Series #' @name Series_compare #' @description compare two Series @@ -384,7 +430,7 @@ Series_rem = function(other) { #' @examples #' pl$Series(1:5) == pl$Series(c(1:3, NA_integer_, 10L)) Series_compare = function(other, op) { - other_s = wrap_s(other) + other_s = as_polars_series(other) s_len = self$len() o_len = other_s$len() if ( @@ -394,28 +440,31 @@ Series_compare = function(other, op) { ) { stop("in compare Series: not same length or either of length 1.") } - .pr$Series$compare(self, wrap_s(other), op) + .pr$Series$compare(self, as_polars_series(other), op) } + + +# TODO: move to the other file #' @export #' @rdname Series_compare #' @param s1 lhs Series #' @param s2 rhs Series or any into Series -"==.RPolarsSeries" = function(s1, s2) unwrap(wrap_s(s1)$compare(s2, "equal")) +"==.RPolarsSeries" = function(s1, s2) unwrap(as_polars_series(s1)$compare(s2, "equal")) #' @export #' @rdname Series_compare -"!=.RPolarsSeries" = function(s1, s2) unwrap(wrap_s(s1)$compare(s2, "not_equal")) +"!=.RPolarsSeries" = function(s1, s2) unwrap(as_polars_series(s1)$compare(s2, "not_equal")) #' @export #' @rdname Series_compare -"<.RPolarsSeries" = function(s1, s2) unwrap(wrap_s(s1)$compare(s2, "lt")) +"<.RPolarsSeries" = function(s1, s2) unwrap(as_polars_series(s1)$compare(s2, "lt")) #' @export #' @rdname Series_compare -">.RPolarsSeries" = function(s1, s2) unwrap(wrap_s(s1)$compare(s2, "gt")) +">.RPolarsSeries" = function(s1, s2) unwrap(as_polars_series(s1)$compare(s2, "gt")) #' @export #' @rdname Series_compare -"<=.RPolarsSeries" = function(s1, s2) unwrap(wrap_s(s1)$compare(s2, "lt_eq")) +"<=.RPolarsSeries" = function(s1, s2) unwrap(as_polars_series(s1)$compare(s2, "lt_eq")) #' @export #' @rdname Series_compare -">=.RPolarsSeries" = function(s1, s2) unwrap(wrap_s(s1)$compare(s2, "gt_eq")) +">=.RPolarsSeries" = function(s1, s2) unwrap(as_polars_series(s1)$compare(s2, "gt_eq")) #' Get r vector/list @@ -428,7 +477,7 @@ Series_compare = function(other, op) { #' @details #' Fun fact: Nested polars Series list must have same inner type, e.g. List(List(Int32)) #' Thus every leaf(non list type) will be placed on the same depth of the tree, and be the same type. -#' +#' @inheritSection DataFrame_class Conversion to R data types considerations #' @examples #' #' series_vec = pl$Series(letters[1:3]) diff --git a/Taskfile.yml b/Taskfile.yml index 284463d8b..c864beb21 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -80,8 +80,7 @@ tasks: - setup-venv cmds: - "{{.VENV_BIN}}/python -m pip install --upgrade uv" - - "{{.VENV_BIN}}/uv pip install --upgrade mkdocs" - - "{{.VENV_BIN}}/uv pip install --upgrade mkdocs-material" + - "{{.VENV_BIN}}/uv pip install --upgrade mkdocs-material mdx_truly_sane_lists" build-lib-sums: desc: Build lib-sums.tsv. 
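Returning to the reworked `pl$Series()` constructor and the new Series arithmetic methods above, a minimal sketch of the intended usage, assuming only the signatures shown in R/series__series.R (output omitted; the literal values are illustrative):

# `dtype` casts the inferred values, `nan_to_null` maps NaN to null via fill_nan(NULL)
s = pl$Series(c(1, NaN, 3), "a", dtype = pl$Float32, nan_to_null = TRUE)
s$dtype

# The added methods accept anything convertible via as_polars_series()
pl$Series(1:3)$floor_div(2L)
pl$Series(1:3)$mod(2L)
pl$Series(1:4, "foo")$pow(2L)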
diff --git a/altdoc/mkdocs_static.yml b/altdoc/mkdocs_static.yml index f918478d8..79ca642f3 100644 --- a/altdoc/mkdocs_static.yml +++ b/altdoc/mkdocs_static.yml @@ -33,6 +33,7 @@ theme: - toc.follow markdown_extensions: + - mdx_truly_sane_lists - footnotes - toc: baselevel: 1 diff --git a/altdoc/preamble_man_qmd.yml b/altdoc/preamble_man_qmd.yml index f7f071b77..819aee267 100644 --- a/altdoc/preamble_man_qmd.yml +++ b/altdoc/preamble_man_qmd.yml @@ -6,3 +6,4 @@ knitr: opts_chunk: comment: "#>" --- + diff --git a/altdoc/preamble_vignettes_qmd.yml b/altdoc/preamble_vignettes_qmd.yml index 28ecf9c9f..1000b63cc 100644 --- a/altdoc/preamble_vignettes_qmd.yml +++ b/altdoc/preamble_vignettes_qmd.yml @@ -7,3 +7,4 @@ knitr: opts_chunk: comment: "#>" --- + diff --git a/altdoc/preamble_vignettes_rmd.yml b/altdoc/preamble_vignettes_rmd.yml index b86e330e2..2d4968420 100644 --- a/altdoc/preamble_vignettes_rmd.yml +++ b/altdoc/preamble_vignettes_rmd.yml @@ -5,3 +5,4 @@ knitr: opts_chunk: comment: "#>" --- + diff --git a/altdoc/reference_home.Rmd b/altdoc/reference_home.Rmd index e0d58279d..6cf055242 100644 --- a/altdoc/reference_home.Rmd +++ b/altdoc/reference_home.Rmd @@ -26,11 +26,11 @@ to choose between eager and lazy evaluation, that require respectively a for grouped data). We can apply functions directly on a `DataFrame` or `LazyFrame`, such as `rename()` -or `drop()`. Most functions that can be applied to `DataFrame`s can also be used -on `LazyFrame`s, but some are specific to one or the other. For example: +or `drop()`. Most functions that can be applied to `DataFrame`s can also be used +on `LazyFrame`s, but some are specific to one or the other. For example: * `$equals()` exists for `DataFrame` but not for `LazyFrame`; -* `$collect()` executes a lazy query, which means it can only be applied on +* `$collect()` executes a lazy query, which means it can only be applied on a `LazyFrame`. Another common data structure is the `Series`, which can be considered as the @@ -89,7 +89,7 @@ test$group_by(pl$col("cyl"))$agg( ## Expressions Expressions are the building blocks that give all the flexibility we need to -modify or create new columns. +modify or create new columns. Two important expressions starters are `pl$col()` (names a column in the context) and `pl$lit()` (wraps a literal value or vector/series in an Expr). Most other @@ -118,7 +118,7 @@ when it is applied on binary data or on string data. To be able to distinguish those usages and to check the validity of a query, `polars` stores methods in subnamespaces. For each datatype other than numeric (floats and integers), there is a subnamespace containing the available methods: -`dt` (datetime), `list` (list), `str` (strings), `struct` (structs), `cat` +`dt` (datetime), `list` (list), `str` (strings), `struct` (structs), `cat` (categoricals) and `bin` (binary). As a sidenote, there is also an exotic subnamespace called `meta` which is rarely used to manipulate the expressions themselves. 
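As a quick illustration of that `meta` subnamespace, the sketch below assumes the `$meta$output_name()` and `$meta$root_names()` methods, which inspect the expression itself rather than any data:

```{r}
e = pl$col("mpg")$alias("miles_per_gallon")
e$meta$output_name()
e$meta$root_names()
```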
Each subsection in the "Expressions" section lists all operations @@ -133,8 +133,7 @@ df = pl$DataFrame( date = pl$date_range( as.Date("2020-01-01"), as.Date("2023-01-02"), - interval = "1y", - eager = TRUE + interval = "1y" ) ) df @@ -149,7 +148,7 @@ df$with_columns( ) ``` -Similarly, to convert a string column to uppercase, we use the `str` prefix +Similarly, to convert a string column to uppercase, we use the `str` prefix before using `to_uppercase()`: ```{r} diff --git a/man/DataFrame_class.Rd b/man/DataFrame_class.Rd index 2d54a867d..4cf0f36f2 100644 --- a/man/DataFrame_class.Rd +++ b/man/DataFrame_class.Rd @@ -84,6 +84,50 @@ the number of columns. } } +\section{Conversion to R data types considerations}{ + +When converting Polars objects, such as \link[=DataFrame_class]{DataFrames} +to R objects, for example via the \code{\link[=as.data.frame.RPolarsDataFrame]{as.data.frame()}} generic function, +each type in the Polars object is converted to an R type. +In some cases, an error may occur because the conversion is not appropriate. +In particular, there is a high possibility of an error when converting +a \link[=DataType_Datetime]{Datetime} type without a time zone. +A \link[=DataType_Datetime]{Datetime} type without a time zone in Polars is converted +to the \link{POSIXct} type in R, which takes into account the time zone in which +the R session is running (which can be checked with the \code{\link[=Sys.timezone]{Sys.timezone()}} +function). In this case, if ambiguous times are included, a conversion error +will occur. In such cases, change the session time zone using +\code{\link[base:Sys.setenv]{Sys.setenv(TZ = "UTC")}} and then perform the conversion, or use the +\code{\link[=ExprDT_replace_time_zone]{$dt$replace_time_zone()}} method on the Datetime type column to +explicitly specify the time zone before conversion. + +\if{html}{\out{
}}\preformatted{# Due to daylight savings, clocks were turned forward 1 hour on Sunday, March 8, 2020, 2:00:00 am +# so this particular date-time doesn't exist +non_existent_time = pl$Series("2020-03-08 02:00:00")$str$strptime(pl$Datetime(), "\%F \%T") + +withr::with_envvar( + new = c(TZ = "America/New_York"), + \{ + tryCatch( + # This causes an error due to the time zone (the `TZ` env var is affected). + as.vector(non_existent_time), + error = function(e) e + ) + \} +) +#> + +withr::with_envvar( + new = c(TZ = "America/New_York"), + \{ + # This is safe. + as.vector(non_existent_time$dt$replace_time_zone("UTC")) + \} +) +#> [1] "2020-03-08 02:00:00 UTC" +}\if{html}{\out{
}} +} + \examples{ # see all public exported method names (normally accessed via a class # instance with $) diff --git a/man/DataFrame_group_by.Rd b/man/DataFrame_group_by.Rd index d4535326a..1a2fbddb1 100644 --- a/man/DataFrame_group_by.Rd +++ b/man/DataFrame_group_by.Rd @@ -53,3 +53,8 @@ df$group_by(d = "a", e = pl$col("b") \%/\% 2)$agg( pl$col("c")$mean() ) } +\seealso{ +\itemize{ +\item \code{\link[=DataFrame_partition_by]{$partition_by()}} +} +} diff --git a/man/DataFrame_group_by_dynamic.Rd b/man/DataFrame_group_by_dynamic.Rd index ac240d21b..f4205b183 100644 --- a/man/DataFrame_group_by_dynamic.Rd +++ b/man/DataFrame_group_by_dynamic.Rd @@ -120,8 +120,7 @@ df = pl$DataFrame( time = pl$date_range( start = strptime("2021-12-16 00:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), - interval = "30m", - eager = TRUE, + interval = "30m" ), n = 0:6 ) diff --git a/man/DataFrame_partition_by.Rd b/man/DataFrame_partition_by.Rd new file mode 100644 index 000000000..d2fdcea3b --- /dev/null +++ b/man/DataFrame_partition_by.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe__frame.R +\name{DataFrame_partition_by} +\alias{DataFrame_partition_by} +\title{Split a DataFrame into multiple DataFrames} +\usage{ +DataFrame_partition_by( + ..., + maintain_order = TRUE, + include_key = TRUE, + as_nested_list = FALSE +) +} +\arguments{ +\item{...}{Characters of column names to group by. Passed to \code{\link[=pl_col]{pl$col()}}.} + +\item{maintain_order}{If \code{TRUE}, ensure that the order of the groups is consistent with the input data. +This is slower than a default partition by operation.} + +\item{include_key}{If \code{TRUE}, include the columns used to partition the DataFrame in the output.} + +\item{as_nested_list}{This affects the format of the output. +If \code{FALSE} (default), the output is a flat \link{list} of \link[=DataFrame_class]{DataFrames}. +IF \code{TRUE} and one of the \code{maintain_order} or \code{include_key} argument is \code{TRUE}, +then each element of the output has two children: \code{key} and \code{data}. +See the examples for more details.} +} +\value{ +A list of \link[=DataFrame_class]{DataFrames}. See the examples for details. +} +\description{ +Similar to \code{\link[=DataFrame_group_by]{$group_by()}}. +Group by the given columns and return the groups as separate \link[=DataFrame_class]{DataFrames}. +It is useful to use this in combination with functions like \code{\link[=lapply]{lapply()}} or \code{purrr::map()}. +} +\examples{ +df = pl$DataFrame( + a = c("a", "b", "a", "b", "c"), + b = c(1, 2, 1, 3, 3), + c = c(5, 4, 3, 2, 1) +) +df + +# Pass a single column name to partition by that column. +df$partition_by("a") + +# Partition by multiple columns. +df$partition_by("a", "b") + +# Partition by column data type +df$partition_by(pl$String) + +# If `as_nested_list = TRUE`, the output is a list whose elements have a `key` and a `data` field. +# The `key` is a named list of the key values, and the `data` is the DataFrame. +df$partition_by("a", "b", as_nested_list = TRUE) + +# `as_nested_list = TRUE` should be used with `maintain_order = TRUE` or `include_key = TRUE`. 
+tryCatch( + df$partition_by("a", "b", maintain_order = FALSE, include_key = FALSE, as_nested_list = TRUE), + warning = function(w) w +) + +# Example of using with lapply(), and printing the key and the data summary +df$partition_by("a", "b", maintain_order = FALSE, as_nested_list = TRUE) |> + lapply(\(x) { + sprintf("\nThe key value of `a` is \%s and the key value of `b` is \%s\n", x$key$a, x$key$b) |> + cat() + x$data$drop(names(x$key))$describe() |> + print() + invisible(NULL) + }) |> + invisible() +} +\seealso{ +\itemize{ +\item \code{\link[=DataFrame_group_by]{$group_by()}} +} +} diff --git a/man/DataFrame_to_data_frame.Rd b/man/DataFrame_to_data_frame.Rd index 31aecb59f..02a789622 100644 --- a/man/DataFrame_to_data_frame.Rd +++ b/man/DataFrame_to_data_frame.Rd @@ -27,6 +27,50 @@ An R data.frame \description{ Return Polars DataFrame as R data.frame } +\section{Conversion to R data types considerations}{ + +When converting Polars objects, such as \link[=DataFrame_class]{DataFrames} +to R objects, for example via the \code{\link[=as.data.frame.RPolarsDataFrame]{as.data.frame()}} generic function, +each type in the Polars object is converted to an R type. +In some cases, an error may occur because the conversion is not appropriate. +In particular, there is a high possibility of an error when converting +a \link[=DataType_Datetime]{Datetime} type without a time zone. +A \link[=DataType_Datetime]{Datetime} type without a time zone in Polars is converted +to the \link{POSIXct} type in R, which takes into account the time zone in which +the R session is running (which can be checked with the \code{\link[=Sys.timezone]{Sys.timezone()}} +function). In this case, if ambiguous times are included, a conversion error +will occur. In such cases, change the session time zone using +\code{\link[base:Sys.setenv]{Sys.setenv(TZ = "UTC")}} and then perform the conversion, or use the +\code{\link[=ExprDT_replace_time_zone]{$dt$replace_time_zone()}} method on the Datetime type column to +explicitly specify the time zone before conversion. + +\if{html}{\out{
}}\preformatted{# Due to daylight savings, clocks were turned forward 1 hour on Sunday, March 8, 2020, 2:00:00 am +# so this particular date-time doesn't exist +non_existent_time = pl$Series("2020-03-08 02:00:00")$str$strptime(pl$Datetime(), "\%F \%T") + +withr::with_envvar( + new = c(TZ = "America/New_York"), + \{ + tryCatch( + # This causes an error due to the time zone (the `TZ` env var is affected). + as.vector(non_existent_time), + error = function(e) e + ) + \} +) +#> + +withr::with_envvar( + new = c(TZ = "America/New_York"), + \{ + # This is safe. + as.vector(non_existent_time$dt$replace_time_zone("UTC")) + \} +) +#> [1] "2020-03-08 02:00:00 UTC" +}\if{html}{\out{
}} +} + \examples{ df = pl$DataFrame(iris[1:3, ]) df$to_data_frame() diff --git a/man/DataFrame_to_list.Rd b/man/DataFrame_to_list.Rd index 3ae7b6ce9..86f1057f4 100644 --- a/man/DataFrame_to_list.Rd +++ b/man/DataFrame_to_list.Rd @@ -37,6 +37,50 @@ before exporting to R. If \code{unnest_structs = FALSE}, then \code{struct} colu will be returned as nested lists, where each row is a list of values. Such a structure is not very typical or efficient in R. } +\section{Conversion to R data types considerations}{ + +When converting Polars objects, such as \link[=DataFrame_class]{DataFrames} +to R objects, for example via the \code{\link[=as.data.frame.RPolarsDataFrame]{as.data.frame()}} generic function, +each type in the Polars object is converted to an R type. +In some cases, an error may occur because the conversion is not appropriate. +In particular, there is a high possibility of an error when converting +a \link[=DataType_Datetime]{Datetime} type without a time zone. +A \link[=DataType_Datetime]{Datetime} type without a time zone in Polars is converted +to the \link{POSIXct} type in R, which takes into account the time zone in which +the R session is running (which can be checked with the \code{\link[=Sys.timezone]{Sys.timezone()}} +function). In this case, if ambiguous times are included, a conversion error +will occur. In such cases, change the session time zone using +\code{\link[base:Sys.setenv]{Sys.setenv(TZ = "UTC")}} and then perform the conversion, or use the +\code{\link[=ExprDT_replace_time_zone]{$dt$replace_time_zone()}} method on the Datetime type column to +explicitly specify the time zone before conversion. + +\if{html}{\out{
}}\preformatted{# Due to daylight savings, clocks were turned forward 1 hour on Sunday, March 8, 2020, 2:00:00 am +# so this particular date-time doesn't exist +non_existent_time = pl$Series("2020-03-08 02:00:00")$str$strptime(pl$Datetime(), "\%F \%T") + +withr::with_envvar( + new = c(TZ = "America/New_York"), + \{ + tryCatch( + # This causes an error due to the time zone (the `TZ` env var is affected). + as.vector(non_existent_time), + error = function(e) e + ) + \} +) +#> + +withr::with_envvar( + new = c(TZ = "America/New_York"), + \{ + # This is safe. + as.vector(non_existent_time$dt$replace_time_zone("UTC")) + \} +) +#> [1] "2020-03-08 02:00:00 UTC" +}\if{html}{\out{
}} +} + \examples{ pl$DataFrame(iris)$to_list() } diff --git a/man/DataType_Datetime.Rd b/man/DataType_Datetime.Rd index b23ef34fc..0bda0b1b4 100644 --- a/man/DataType_Datetime.Rd +++ b/man/DataType_Datetime.Rd @@ -2,25 +2,34 @@ % Please edit documentation in R/datatype.R \name{DataType_Datetime} \alias{DataType_Datetime} -\title{Create Datetime DataType} +\alias{pl_Datetime} +\title{Data type representing a calendar date and time of day.} \usage{ -DataType_Datetime(tu = "us", tz = NULL) +DataType_Datetime(time_unit = "us", time_zone = NULL) } \arguments{ -\item{tu}{string option either "ms", "us" or "ns"} +\item{time_unit}{Unit of time. One of \code{"ms"}, \code{"us"} (default) or \code{"ns"}.} -\item{tz}{string the Time Zone, see details} +\item{time_zone}{Time zone string, as defined in \code{\link[=OlsonNames]{OlsonNames()}}. +Setting \code{timezone = "*"} will match any timezone, which can be useful to +select all Datetime columns containing a timezone.} } \value{ Datetime DataType } \description{ -Datetime DataType constructor -} -\details{ -all allowed TimeZone designations can be found in \code{base::OlsonNames()} +The underlying representation of this type is a 64-bit signed integer. +The integer indicates the number of time units since the Unix epoch (1970-01-01 00:00:00). +The number can be negative to indicate datetimes before the epoch. } \examples{ pl$Datetime("ns", "Pacific/Samoa") + +df = pl$DataFrame( + naive_time = as.POSIXct("1900-01-01"), + zoned_time = as.POSIXct("1900-01-01", "UTC") +) +df + +df$select(pl$col(pl$Datetime("us", "*"))) } -\keyword{pl} diff --git a/man/DynamicGroupBy_agg.Rd b/man/DynamicGroupBy_agg.Rd index 3d6a9cc47..7c84ee88c 100644 --- a/man/DynamicGroupBy_agg.Rd +++ b/man/DynamicGroupBy_agg.Rd @@ -22,8 +22,7 @@ df = pl$DataFrame( time = pl$date_range( start = strptime("2021-12-16 00:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), - interval = "30m", - eager = TRUE, + interval = "30m" ), n = 0:6 ) diff --git a/man/DynamicGroupBy_class.Rd b/man/DynamicGroupBy_class.Rd index e7e207320..66fbf6939 100644 --- a/man/DynamicGroupBy_class.Rd +++ b/man/DynamicGroupBy_class.Rd @@ -12,8 +12,7 @@ df = pl$DataFrame( time = pl$date_range( start = strptime("2021-12-16 00:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), - interval = "30m", - eager = TRUE, + interval = "30m" ), n = 0:6 ) diff --git a/man/DynamicGroupBy_ungroup.Rd b/man/DynamicGroupBy_ungroup.Rd index ccb7e605a..3e042c876 100644 --- a/man/DynamicGroupBy_ungroup.Rd +++ b/man/DynamicGroupBy_ungroup.Rd @@ -18,8 +18,7 @@ df = pl$DataFrame( time = pl$date_range( start = strptime("2021-12-16 00:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), - interval = "30m", - eager = TRUE, + interval = "30m" ), n = 0:6 ) diff --git a/man/ExprDT_cast_time_unit.Rd b/man/ExprDT_cast_time_unit.Rd index 11ef7a1f2..8532d0a9e 100644 --- a/man/ExprDT_cast_time_unit.Rd +++ b/man/ExprDT_cast_time_unit.Rd @@ -22,8 +22,7 @@ df = pl$DataFrame( date = pl$date_range( start = as.Date("2001-1-1"), end = as.Date("2001-1-3"), - interval = "1d1s", - eager = TRUE + interval = "1d1s" ) ) df$select( diff --git a/man/ExprDT_combine.Rd b/man/ExprDT_combine.Rd index 66d4d548c..65dad8894 100644 --- a/man/ExprDT_combine.Rd +++ b/man/ExprDT_combine.Rd @@ -43,6 +43,6 
@@ pl$lit(as.Date("2021-01-01"))$dt$combine(3600 * 1.5E6 + 123, tu = "us")$to_serie # if needed to convert back to R it is more intuitive to set a specific time zone expr = pl$lit(as.Date("2021-01-01"))$dt$combine(3600 * 1.5E6 + 123, tu = "us") -expr$cast(pl$Datetime(tu = "us", tz = "GMT"))$to_r() +expr$cast(pl$Datetime("us", "GMT"))$to_r() } \keyword{ExprDT} diff --git a/man/ExprDT_convert_time_zone.Rd b/man/ExprDT_convert_time_zone.Rd index e29b3146b..27fae6770 100644 --- a/man/ExprDT_convert_time_zone.Rd +++ b/man/ExprDT_convert_time_zone.Rd @@ -22,19 +22,51 @@ corresponds to in R manually modifying the tzone attribute of POSIXt objects } \examples{ df = pl$DataFrame( - date = pl$date_range( - start = as.Date("2001-3-1"), - end = as.Date("2001-5-1"), - interval = "1mo12m34s", - eager = TRUE - ) + london_timezone = pl$date_range( + as.POSIXct("2020-03-01", tz = "UTC"), + as.POSIXct("2020-07-01", tz = "UTC"), + "1mo", + time_zone = "UTC" + )$dt$convert_time_zone("Europe/London") ) + df$select( - pl$col("date"), - pl$col("date") - $dt$replace_time_zone("Europe/Amsterdam") - $dt$convert_time_zone("Europe/London") - $alias("London_with") + "london_timezone", + London_to_Amsterdam = pl$col( + "london_timezone" + )$dt$replace_time_zone("Europe/Amsterdam") +) + +# You can use `ambiguous` to deal with ambiguous datetimes: +dates = c( + "2018-10-28 01:30", + "2018-10-28 02:00", + "2018-10-28 02:30", + "2018-10-28 02:00" +) + +df = pl$DataFrame( + ts = pl$Series(dates)$str$strptime(pl$Datetime("us"), "\%F \%H:\%M"), + ambiguous = c("earliest", "earliest", "latest", "latest") +) + +df$with_columns( + ts_localized = pl$col("ts")$dt$replace_time_zone( + "Europe/Brussels", + ambiguous = pl$col("ambiguous") + ) ) + +# Polars Datetime type without a time zone will be converted to R +# with respect to the session time zone. If ambiguous times are present +# an error will be raised. It is recommended to add a time zone before +# converting to R. +s_without_tz = pl$Series(dates)$str$strptime(pl$Datetime("us"), "\%F \%H:\%M") +s_without_tz + +s_with_tz = s_without_tz$dt$replace_time_zone("UTC") +s_with_tz + +as.vector(s_with_tz) } \keyword{ExprDT} diff --git a/man/ExprDT_day.Rd b/man/ExprDT_day.Rd index 402f432ab..36e789435 100644 --- a/man/ExprDT_day.Rd +++ b/man/ExprDT_day.Rd @@ -22,8 +22,7 @@ df = pl$DataFrame( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", - time_zone = "GMT", - eager = TRUE + time_zone = "GMT" ) ) df$with_columns( diff --git a/man/ExprDT_epoch.Rd b/man/ExprDT_epoch.Rd index 7d6962dca..37e1d8007 100644 --- a/man/ExprDT_epoch.Rd +++ b/man/ExprDT_epoch.Rd @@ -21,9 +21,9 @@ ns and perhaps us will exceed integerish limit if returning to R as flaot64/double. 
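A brief sketch of that integerish-limit caveat: an R double represents integers exactly only up to 2^53 (about 9.0e15), while a nanosecond epoch for 2022-01-01 is roughly 1.64e18, so returning such values to R as doubles can silently lose precision. The call below assumes the single-argument `pl$date_range()` form used in the examples that follow:

# nanosecond epoch converted to an R double; the trailing digits are not exact
pl$date_range(as.Date("2022-1-1"))$dt$epoch("ns")$to_series()$to_vector()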
} \examples{ -pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$epoch("ns")$to_series() -pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$epoch("ms")$to_series() -pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$epoch("s")$to_series() -pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$epoch("d")$to_series() +pl$date_range(as.Date("2022-1-1"))$dt$epoch("ns")$to_series() +pl$date_range(as.Date("2022-1-1"))$dt$epoch("ms")$to_series() +pl$date_range(as.Date("2022-1-1"))$dt$epoch("s")$to_series() +pl$date_range(as.Date("2022-1-1"))$dt$epoch("d")$to_series() } \keyword{ExprDT} diff --git a/man/ExprDT_hour.Rd b/man/ExprDT_hour.Rd index 2a9c3ff02..f16784847 100644 --- a/man/ExprDT_hour.Rd +++ b/man/ExprDT_hour.Rd @@ -21,8 +21,7 @@ df = pl$DataFrame( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d2h", - time_zone = "GMT", - eager = TRUE + time_zone = "GMT" ) ) df$with_columns( diff --git a/man/ExprDT_iso_year.Rd b/man/ExprDT_iso_year.Rd index 9f9c06bc1..99b891be5 100644 --- a/man/ExprDT_iso_year.Rd +++ b/man/ExprDT_iso_year.Rd @@ -22,8 +22,7 @@ df = pl$DataFrame( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", - time_zone = "GMT", - eager = TRUE + time_zone = "GMT" ) ) df$with_columns( diff --git a/man/ExprDT_microsecond.Rd b/man/ExprDT_microsecond.Rd index 090d3c655..dc4b995d0 100644 --- a/man/ExprDT_microsecond.Rd +++ b/man/ExprDT_microsecond.Rd @@ -20,8 +20,7 @@ pl$DataFrame( as.numeric(as.POSIXct("2001-1-1")) * 1E6 + 456789, # manually convert to us as.numeric(as.POSIXct("2001-1-1 00:00:6")) * 1E6, interval = "2s654321us", - time_unit = "us", # instruct polars input is us, and store as us - eager = TRUE + time_unit = "us" # instruct polars input is us, and store as us ) )$with_columns( pl$col("date")$cast(pl$Int64)$alias("datetime int64"), diff --git a/man/ExprDT_millisecond.Rd b/man/ExprDT_millisecond.Rd index 06810d98a..ab86a2432 100644 --- a/man/ExprDT_millisecond.Rd +++ b/man/ExprDT_millisecond.Rd @@ -20,7 +20,6 @@ pl$DataFrame(date = pl$date_range( as.numeric(as.POSIXct("2001-1-1 00:00:6")) * 1E6, interval = "2s654321us", time_unit = "us", # instruct polars input is us, and store as us - eager = TRUE ))$with_columns( pl$col("date")$cast(pl$Int64)$alias("datetime int64"), pl$col("date")$dt$millisecond()$alias("millisecond") diff --git a/man/ExprDT_minute.Rd b/man/ExprDT_minute.Rd index d08c75558..0d6ef7da8 100644 --- a/man/ExprDT_minute.Rd +++ b/man/ExprDT_minute.Rd @@ -21,8 +21,7 @@ df = pl$DataFrame( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d5s", - time_zone = "GMT", - eager = TRUE + time_zone = "GMT" ) ) df$with_columns( diff --git a/man/ExprDT_month.Rd b/man/ExprDT_month.Rd index adedf2503..be707dd79 100644 --- a/man/ExprDT_month.Rd +++ b/man/ExprDT_month.Rd @@ -22,8 +22,7 @@ df = pl$DataFrame( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", - time_zone = "GMT", - eager = TRUE + time_zone = "GMT" ) ) df$with_columns( diff --git a/man/ExprDT_nanosecond.Rd b/man/ExprDT_nanosecond.Rd index a0f0ca7ee..2ccd1aa8f 100644 --- a/man/ExprDT_nanosecond.Rd +++ b/man/ExprDT_nanosecond.Rd @@ -22,8 +22,7 @@ pl$DataFrame(date = pl$date_range( as.numeric(as.POSIXct("2001-1-1")) * 1E9 + 123456789, # manually convert to us as.numeric(as.POSIXct("2001-1-1 00:00:6")) * 1E9, interval = "1s987654321ns", - time_unit = "ns", # instruct polars input is us, and store as us - eager = TRUE + time_unit = "ns" # instruct polars input is us, and store as us ))$with_columns( pl$col("date")$cast(pl$Int64)$alias("datetime int64"), 
pl$col("date")$dt$nanosecond()$alias("nanosecond") diff --git a/man/ExprDT_offset_by.Rd b/man/ExprDT_offset_by.Rd index 5f947ab8f..a5e44a488 100644 --- a/man/ExprDT_offset_by.Rd +++ b/man/ExprDT_offset_by.Rd @@ -45,8 +45,7 @@ df = pl$DataFrame( dates = pl$date_range( as.Date("2000-1-1"), as.Date("2005-1-1"), - "1y", - eager = TRUE + "1y" ) ) df$select( diff --git a/man/ExprDT_ordinal_day.Rd b/man/ExprDT_ordinal_day.Rd index 9a915c583..2b033f1fe 100644 --- a/man/ExprDT_ordinal_day.Rd +++ b/man/ExprDT_ordinal_day.Rd @@ -22,8 +22,7 @@ df = pl$DataFrame( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", - time_zone = "GMT", - eager = TRUE + time_zone = "GMT" ) ) df$with_columns( diff --git a/man/ExprDT_quarter.Rd b/man/ExprDT_quarter.Rd index 7e2a1633f..275b121e7 100644 --- a/man/ExprDT_quarter.Rd +++ b/man/ExprDT_quarter.Rd @@ -21,8 +21,7 @@ df = pl$DataFrame( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", - time_zone = "GMT", - eager = TRUE + time_zone = "GMT" ) ) df$with_columns( diff --git a/man/ExprDT_round.Rd b/man/ExprDT_round.Rd index 3d2cad2b3..b5ede4cda 100644 --- a/man/ExprDT_round.Rd +++ b/man/ExprDT_round.Rd @@ -48,7 +48,7 @@ change without it being considered a breaking change. \examples{ t1 = as.POSIXct("3040-01-01", tz = "GMT") t2 = t1 + as.difftime(25, units = "secs") -s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms", eager = TRUE) +s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms") # use a dt namespace function df = pl$DataFrame(datetime = s)$with_columns( diff --git a/man/ExprDT_second.Rd b/man/ExprDT_second.Rd index c6bdd2b2a..4598e2ed4 100644 --- a/man/ExprDT_second.Rd +++ b/man/ExprDT_second.Rd @@ -26,7 +26,6 @@ pl$DataFrame(date = pl$date_range( as.numeric(as.POSIXct("2001-1-1 00:00:6")) * 1E6, interval = "2s654321us", time_unit = "us", # instruct polars input is us, and store as us - eager = TRUE ))$with_columns( pl$col("date")$dt$second()$alias("second"), pl$col("date")$dt$second(fractional = TRUE)$alias("second_frac") diff --git a/man/ExprDT_time.Rd b/man/ExprDT_time.Rd index a50c4128a..6aebb4ceb 100644 --- a/man/ExprDT_time.Rd +++ b/man/ExprDT_time.Rd @@ -16,8 +16,7 @@ This only works on Datetime Series, it will error on Date Series. 
df = pl$DataFrame(dates = pl$date_range( as.Date("2000-1-1"), as.Date("2000-1-2"), - "1h", - eager = TRUE + "1h" )) df$with_columns(times = pl$col("dates")$dt$time()) diff --git a/man/ExprDT_timestamp.Rd b/man/ExprDT_timestamp.Rd index 2ed36246b..6892e5aa7 100644 --- a/man/ExprDT_timestamp.Rd +++ b/man/ExprDT_timestamp.Rd @@ -21,8 +21,7 @@ df = pl$DataFrame( date = pl$date_range( start = as.Date("2001-1-1"), end = as.Date("2001-1-3"), - interval = "1d1s", - eager = TRUE + interval = "1d1s" ) ) df$select( diff --git a/man/ExprDT_total_days.Rd b/man/ExprDT_total_days.Rd index 47bac8dc2..b7dc388a0 100644 --- a/man/ExprDT_total_days.Rd +++ b/man/ExprDT_total_days.Rd @@ -17,8 +17,7 @@ df = pl$DataFrame( date = pl$date_range( start = as.Date("2020-3-1"), end = as.Date("2020-5-1"), - interval = "1mo", - eager = TRUE + interval = "1mo" ) ) df$select( diff --git a/man/ExprDT_total_hours.Rd b/man/ExprDT_total_hours.Rd index 0032f3186..72df5a912 100644 --- a/man/ExprDT_total_hours.Rd +++ b/man/ExprDT_total_hours.Rd @@ -17,8 +17,7 @@ df = pl$DataFrame( date = pl$date_range( start = as.Date("2020-1-1"), end = as.Date("2020-1-4"), - interval = "1d", - eager = TRUE + interval = "1d" ) ) df$select( diff --git a/man/ExprDT_total_microseconds.Rd b/man/ExprDT_total_microseconds.Rd index 9906c52fc..0e69df198 100644 --- a/man/ExprDT_total_microseconds.Rd +++ b/man/ExprDT_total_microseconds.Rd @@ -16,8 +16,7 @@ Extract the microseconds from a Duration type. df = pl$DataFrame(date = pl$date_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), - interval = "1ms", - eager = TRUE + interval = "1ms" )) df$select( pl$col("date"), diff --git a/man/ExprDT_total_milliseconds.Rd b/man/ExprDT_total_milliseconds.Rd index ca49f9ed1..9cf2b3f54 100644 --- a/man/ExprDT_total_milliseconds.Rd +++ b/man/ExprDT_total_milliseconds.Rd @@ -16,8 +16,7 @@ Extract the milliseconds from a Duration type. df = pl$DataFrame(date = pl$date_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), - interval = "1ms", - eager = TRUE + interval = "1ms" )) df$select( pl$col("date"), diff --git a/man/ExprDT_total_minutes.Rd b/man/ExprDT_total_minutes.Rd index 50fcafaf5..93eeed1e9 100644 --- a/man/ExprDT_total_minutes.Rd +++ b/man/ExprDT_total_minutes.Rd @@ -17,8 +17,7 @@ df = pl$DataFrame( date = pl$date_range( start = as.Date("2020-1-1"), end = as.Date("2020-1-4"), - interval = "1d", - eager = TRUE + interval = "1d" ) ) df$select( diff --git a/man/ExprDT_total_nanoseconds.Rd b/man/ExprDT_total_nanoseconds.Rd index dac22b2c6..dda7bfc41 100644 --- a/man/ExprDT_total_nanoseconds.Rd +++ b/man/ExprDT_total_nanoseconds.Rd @@ -16,8 +16,7 @@ Extract the nanoseconds from a Duration type. df = pl$DataFrame(date = pl$date_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), - interval = "1ms", - eager = TRUE + interval = "1ms" )) df$select( pl$col("date"), diff --git a/man/ExprDT_total_seconds.Rd b/man/ExprDT_total_seconds.Rd index 44f6b5f2e..9bdabdff5 100644 --- a/man/ExprDT_total_seconds.Rd +++ b/man/ExprDT_total_seconds.Rd @@ -16,8 +16,7 @@ Extract the seconds from a Duration type. 
df = pl$DataFrame(date = pl$date_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:04:00", tz = "GMT"), - interval = "1m", - eager = TRUE + interval = "1m" )) df$select( pl$col("date"), diff --git a/man/ExprDT_truncate.Rd b/man/ExprDT_truncate.Rd index 96ce42909..273c7efff 100644 --- a/man/ExprDT_truncate.Rd +++ b/man/ExprDT_truncate.Rd @@ -42,7 +42,7 @@ These strings can be combined: \examples{ t1 = as.POSIXct("3040-01-01", tz = "GMT") t2 = t1 + as.difftime(25, units = "secs") -s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms", eager = TRUE) +s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms") # use a dt namespace function df = pl$DataFrame(datetime = s)$with_columns( diff --git a/man/ExprDT_week.Rd b/man/ExprDT_week.Rd index 72fb3554e..3d0ad1be3 100644 --- a/man/ExprDT_week.Rd +++ b/man/ExprDT_week.Rd @@ -22,8 +22,7 @@ df = pl$DataFrame( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", - time_zone = "GMT", - eager = TRUE + time_zone = "GMT" ) ) df$with_columns( diff --git a/man/ExprDT_weekday.Rd b/man/ExprDT_weekday.Rd index 73296dc20..f42fdf17b 100644 --- a/man/ExprDT_weekday.Rd +++ b/man/ExprDT_weekday.Rd @@ -21,8 +21,7 @@ df = pl$DataFrame( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", - time_zone = "GMT", - eager = TRUE + time_zone = "GMT" ) ) df$with_columns( diff --git a/man/ExprDT_with_time_unit.Rd b/man/ExprDT_with_time_unit.Rd index 714678b0c..65d85a261 100644 --- a/man/ExprDT_with_time_unit.Rd +++ b/man/ExprDT_with_time_unit.Rd @@ -23,8 +23,7 @@ df = pl$DataFrame( date = pl$date_range( start = as.Date("2001-1-1"), end = as.Date("2001-1-3"), - interval = "1d1s", - eager = TRUE + interval = "1d1s" ) ) df$select( diff --git a/man/ExprDT_year.Rd b/man/ExprDT_year.Rd index e32c4aae6..9a6eac3b0 100644 --- a/man/ExprDT_year.Rd +++ b/man/ExprDT_year.Rd @@ -21,8 +21,7 @@ df = pl$DataFrame( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", - time_zone = "GMT", - eager = TRUE + time_zone = "GMT" ) ) df$with_columns( diff --git a/man/ExprStr_strptime.Rd b/man/ExprStr_strptime.Rd index 9603aebc1..f501c7d26 100644 --- a/man/ExprStr_strptime.Rd +++ b/man/ExprStr_strptime.Rd @@ -6,7 +6,7 @@ \usage{ ExprStr_strptime( datatype, - format, + format = NULL, strict = TRUE, exact = TRUE, cache = TRUE, @@ -17,16 +17,21 @@ ExprStr_strptime( \item{datatype}{The data type to convert into. Can be either Date, Datetime, or Time.} -\item{format}{Format to use for conversion. See \code{?strptime} for possible -values. Example: "\%Y-\%m-\%d \%H:\%M:\%S". If \code{NULL} (default), the format is -inferred from the data. Notice that time zone \verb{\%Z} is not supported and will -just ignore timezones. Numeric time zones like \verb{\%z} or \verb{\%:z} are supported.} +\item{format}{Format to use for conversion. Refer to +\href{https://docs.rs/chrono/latest/chrono/format/strftime/index.html}{the chrono crate documentation} +for the full specification. Example: \code{"\%Y-\%m-\%d \%H:\%M:\%S"}. +If \code{NULL} (default), the format is inferred from the data. +Notice that time zone \verb{\%Z} is not supported and will just ignore timezones. +Numeric time zones like \verb{\%z} or \verb{\%:z} are supported.} \item{strict}{If \code{TRUE} (default), raise an error if a single string cannot -be parsed. Otherwise, produce a polars \code{null}.} +be parsed. If \code{FALSE}, produce a polars \code{null}.} -\item{exact}{If \code{TRUE} (default), require an exact format match. 
Otherwise, -allow the format to match anywhere in the target string.} +\item{exact}{If \code{TRUE} (default), require an exact format match. If \code{FALSE}, +allow the format to match anywhere in the target string. +Conversion to the Time type is always exact. +Note that using \code{exact = FALSE} introduces a performance penalty - +cleaning your data beforehand will almost certainly be more performant.} \item{cache}{Use a cache of unique, converted dates to apply the datetime conversion.} @@ -39,17 +44,29 @@ conversion.} }} } \value{ -Expr of a Date, Datetime or Time Series +\link[=Expr_class]{Expr} of Date, Datetime or Time type } \description{ -Convert a String column into a Date/Datetime/Time column. +Similar to the \code{\link[=strptime]{strptime()}} function. } \details{ When parsing a Datetime the column precision will be inferred from the format -string, if given, eg: “\%F \%T\%.3f" => Datetime("ms"). If no fractional second -component is found then the default is "us" (microsecond). +string, if given, e.g.: \code{"\%F \%T\%.3f"} => \code{\link[=pl_Datetime]{pl$Datetime("ms")}}. +If no fractional second component is found then the default is \code{"us"} (microsecond). } \examples{ +# Dealing with a consistent format +s = pl$Series(c("2020-01-01 01:00Z", "2020-01-01 02:00Z")) + +s$str$strptime(pl$Datetime(), "\%Y-\%m-\%d \%H:\%M\%#z") + +# Auto infer format +s$str$strptime(pl$Datetime()) + +# Datetime with timezone is interpreted as UTC timezone +pl$Series("2020-01-01T01:00:00+09:00")$str$strptime(pl$Datetime()) + +# Dealing with different formats. s = pl$Series( c( "2021-04-22", @@ -59,24 +76,35 @@ s = pl$Series( ), "date" ) -#' #join multiple passes with different format -s$to_frame()$with_columns( - pl$col("date") - $str$strptime(pl$Date, "\%F", strict = FALSE) - $fill_null(pl$col("date")$str$strptime(pl$Date, "\%F \%T", strict = FALSE)) - $fill_null(pl$col("date")$str$strptime(pl$Date, "\%D", strict = FALSE)) - $fill_null(pl$col("date")$str$strptime(pl$Date, "\%c", strict = FALSE)) + +s$to_frame()$select( + pl$coalesce( + pl$col("date")$str$strptime(pl$Date, "\%F", strict = FALSE), + pl$col("date")$str$strptime(pl$Date, "\%F \%T", strict = FALSE), + pl$col("date")$str$strptime(pl$Date, "\%D", strict = FALSE), + pl$col("date")$str$strptime(pl$Date, "\%c", strict = FALSE) + ) ) -txt_datetimes = c( - "2023-01-01 11:22:33 -0100", - "2023-01-01 11:22:33 +0300", - "invalid time" +# Ignore invalid time +s = pl$Series( + c( + "2023-01-01 11:22:33 -0100", + "2023-01-01 11:22:33 +0300", + "invalid time" + ) ) -pl$lit(txt_datetimes)$str$strptime( +s$str$strptime( pl$Datetime("ns"), - format = "\%Y-\%m-\%d \%H:\%M:\%S \%z", strict = FALSE, -)$to_series() + format = "\%Y-\%m-\%d \%H:\%M:\%S \%z", + strict = FALSE, +) +} +\seealso{ +\itemize{ +\item \code{\link[=ExprStr_to_date]{$str$to_date()}} +\item \code{\link[=ExprStr_to_datetime]{$str$to_datetime()}} +\item \code{\link[=ExprStr_to_time]{$str$to_time()}} +} } -\keyword{ExprStr} diff --git a/man/ExprStr_to_date.Rd b/man/ExprStr_to_date.Rd index d1ec18061..68a1779b3 100644 --- a/man/ExprStr_to_date.Rd +++ b/man/ExprStr_to_date.Rd @@ -3,31 +3,48 @@ \name{ExprStr_to_date} \alias{ExprStr_to_date} \title{Convert a String column into a Date column} +\format{ +Format to use for conversion. Refer to +\href{https://docs.rs/chrono/latest/chrono/format/strftime/index.html}{the chrono crate documentation} +for the full specification. Example: \code{"\%Y-\%m-\%d"}. +If \code{NULL} (default), the format is inferred from the data. 
+} \usage{ ExprStr_to_date(format = NULL, strict = TRUE, exact = TRUE, cache = TRUE) } \arguments{ -\item{format}{Format to use for conversion. See \code{?strptime} for possible -values. Example: "\%Y-\%m-\%d". If \code{NULL} (default), the format is -inferred from the data. Notice that time zone \verb{\%Z} is not supported and will -just ignore timezones. Numeric time zones like \verb{\%z} or \verb{\%:z} are supported.} +\item{format}{Format to use for conversion. Refer to +\href{https://docs.rs/chrono/latest/chrono/format/strftime/index.html}{the chrono crate documentation} +for the full specification. Example: \code{"\%Y-\%m-\%d \%H:\%M:\%S"}. +If \code{NULL} (default), the format is inferred from the data. +Notice that time zone \verb{\%Z} is not supported and will just ignore timezones. +Numeric time zones like \verb{\%z} or \verb{\%:z} are supported.} \item{strict}{If \code{TRUE} (default), raise an error if a single string cannot -be parsed. If \code{FALSE}, parsing failure will produce a polars \code{null}.} +be parsed. If \code{FALSE}, produce a polars \code{null}.} -\item{exact}{If \code{TRUE} (default), require an exact format match. Otherwise, -allow the format to match anywhere in the target string.} +\item{exact}{If \code{TRUE} (default), require an exact format match. If \code{FALSE}, +allow the format to match anywhere in the target string. +Conversion to the Time type is always exact. +Note that using \code{exact = FALSE} introduces a performance penalty - +cleaning your data beforehand will almost certainly be more performant.} \item{cache}{Use a cache of unique, converted dates to apply the datetime conversion.} } \value{ -Expr +\link[=Expr_class]{Expr} of Date type } \description{ Convert a String column into a Date column } \examples{ -pl$DataFrame(str_date = c("2009-01-02", "2009-01-03", "2009-1-4", "2009 05 01"))$ - with_columns(date = pl$col("str_date")$str$to_date(strict = FALSE)) +s = pl$Series(c("2020/01/01", "2020/02/01", "2020/03/01")) + +s$str$to_date() +} +\seealso{ +\itemize{ +\item \code{\link[=ExprStr_strptime]{$str$strptime()}} +} } diff --git a/man/ExprStr_to_datetime.Rd b/man/ExprStr_to_datetime.Rd index 452aed605..86a0cbcc8 100644 --- a/man/ExprStr_to_datetime.Rd +++ b/man/ExprStr_to_datetime.Rd @@ -15,21 +15,26 @@ ExprStr_to_datetime( ) } \arguments{ -\item{format}{Format to use for conversion. See \code{?strptime} for possible -values. Example: "\%Y-\%m-\%d \%H:\%M:\%S". If \code{NULL} (default), the format is -inferred from the data. Notice that time zone \verb{\%Z} is not supported and will -just ignore timezones. Numeric time zones like \verb{\%z} or \verb{\%:z} are supported.} +\item{format}{Format to use for conversion. Refer to +\href{https://docs.rs/chrono/latest/chrono/format/strftime/index.html}{the chrono crate documentation} +for the full specification. Example: \code{"\%Y-\%m-\%d \%H:\%M:\%S"}. +If \code{NULL} (default), the format is inferred from the data. +Notice that time zone \verb{\%Z} is not supported and will just ignore timezones. +Numeric time zones like \verb{\%z} or \verb{\%:z} are supported.} -\item{time_unit}{String (\code{"ns"}, \code{"us"}, \code{"ms"}) or integer.} +\item{time_unit}{Unit of time for the resulting Datetime column. If \code{NULL} (default), +the time unit is inferred from the format string if given, +e.g.: \code{"\%F \%T\%.3f"} => \code{\link[=pl_Datetime]{pl$Datetime("ms")}}. 
+If no fractional second component is found, the default is \code{"us"} (microsecond).} -\item{time_zone}{String describing a timezone. If \code{NULL} (default), \verb{"GMT} is -used.} +\item{time_zone}{for the resulting \link[=pl_Datetime]{Datetime} column.} \item{strict}{If \code{TRUE} (default), raise an error if a single string cannot -be parsed. If \code{FALSE}, parsing failure will produce a polars \code{null}.} +be parsed. If \code{FALSE}, produce a polars \code{null}.} -\item{exact}{If \code{TRUE} (default), require an exact format match. Otherwise, -allow the format to match anywhere in the target string.} +\item{exact}{If \code{TRUE} (default), require an exact format match. If \code{FALSE}, allow the format to match +anywhere in the target string. Note that using \code{exact = FALSE} introduces a performance +penalty - cleaning your data beforehand will almost certainly be more performant.} \item{cache}{Use a cache of unique, converted dates to apply the datetime conversion.} @@ -42,12 +47,19 @@ conversion.} }} } \value{ -Expr +\link[=Expr_class]{Expr} of \link[=pl_Datetime]{Datetime} type } \description{ Convert a String column into a Datetime column } \examples{ -pl$DataFrame(str_date = c("2009-01-02 01:00", "2009-01-03 02:00", "2009-1-4 3:00"))$ - with_columns(datetime = pl$col("str_date")$str$to_datetime(strict = FALSE)) +s = pl$Series(c("2020-01-01 01:00Z", "2020-01-01 02:00Z")) + +s$str$to_datetime("\%Y-\%m-\%d \%H:\%M\%#z") +s$str$to_datetime(time_unit = "ms") +} +\seealso{ +\itemize{ +\item \code{\link[=ExprStr_strptime]{$str$strptime()}} +} } diff --git a/man/ExprStr_to_time.Rd b/man/ExprStr_to_time.Rd index 0b63c5eae..826abf58a 100644 --- a/man/ExprStr_to_time.Rd +++ b/man/ExprStr_to_time.Rd @@ -3,28 +3,42 @@ \name{ExprStr_to_time} \alias{ExprStr_to_time} \title{Convert a String column into a Time column} +\format{ +Format to use for conversion. Refer to +\href{https://docs.rs/chrono/latest/chrono/format/strftime/index.html}{the chrono crate documentation} +for the full specification. Example: \code{"\%H:\%M:\%S"}. +If \code{NULL} (default), the format is inferred from the data. +} \usage{ ExprStr_to_time(format = NULL, strict = TRUE, cache = TRUE) } \arguments{ -\item{format}{Format to use for conversion. See \code{?strptime} for possible -values. Example: "\%H:\%M:\%S". If \code{NULL} (default), the format is -inferred from the data. Notice that time zone \verb{\%Z} is not supported and will -just ignore timezones. Numeric time zones like \verb{\%z} or \verb{\%:z} are supported.} +\item{format}{Format to use for conversion. Refer to +\href{https://docs.rs/chrono/latest/chrono/format/strftime/index.html}{the chrono crate documentation} +for the full specification. Example: \code{"\%Y-\%m-\%d \%H:\%M:\%S"}. +If \code{NULL} (default), the format is inferred from the data. +Notice that time zone \verb{\%Z} is not supported and will just ignore timezones. +Numeric time zones like \verb{\%z} or \verb{\%:z} are supported.} \item{strict}{If \code{TRUE} (default), raise an error if a single string cannot -be parsed. If \code{FALSE}, parsing failure will produce a polars \code{null}.} +be parsed. 
If \code{FALSE}, produce a polars \code{null}.} \item{cache}{Use a cache of unique, converted dates to apply the datetime conversion.} } \value{ -Expr +\link[=Expr_class]{Expr} of Time type } \description{ Convert a String column into a Time column } \examples{ -pl$DataFrame(str_time = c("01:20:01", "28:00:02", "03:00:02"))$ - with_columns(time = pl$col("str_time")$str$to_time(strict = FALSE)) +s = pl$Series(c("01:00", "02:00", "03:00")) + +s$str$to_time("\%H:\%M") +} +\seealso{ +\itemize{ +\item \code{\link[=ExprStr_strptime]{$str$strptime()}} +} } diff --git a/man/Expr_add.Rd b/man/Expr_add.Rd index 67ba02a64..ddd8ddcc7 100644 --- a/man/Expr_add.Rd +++ b/man/Expr_add.Rd @@ -2,30 +2,39 @@ % Please edit documentation in R/expr__expr.R \name{Expr_add} \alias{Expr_add} -\alias{+.RPolarsExpr} \title{Add two expressions} \usage{ Expr_add(other) - -\method{+}{RPolarsExpr}(e1, e2) } \arguments{ -\item{other}{Literal or object that can be converted to a literal} - -\item{e1}{Expr only} - -\item{e2}{Expr or anything that can be converted to a literal} +\item{other}{numeric or string value; accepts expression input.} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ -The RHS can either be an Expr or an object that can be converted to a literal -(e.g an integer). +Method equivalent of addition operator \code{expr + other}. } \examples{ -pl$lit(5) + 10 -pl$lit(5) + pl$lit(10) -pl$lit(5)$add(pl$lit(10)) -+pl$lit(5) # unary use resolves to same as pl$lit(5) +df = pl$DataFrame(x = 1:5) + +df$with_columns( + `x+int` = pl$col("x")$add(2L), + `x+expr` = pl$col("x")$add(pl$col("x")$cum_prod()) +) + +df = pl$DataFrame( + x = c("a", "d", "g"), + y = c("b", "e", "h"), + z = c("c", "f", "i") +) + +df$with_columns( + pl$col("x")$add(pl$col("y"))$add(pl$col("z"))$alias("xyz") +) +} +\seealso{ +\itemize{ +\item \link[=S3_arithmetic]{Arithmetic operators} +} } diff --git a/man/Expr_and.Rd b/man/Expr_and.Rd index 6b1357fbb..df528d838 100644 --- a/man/Expr_and.Rd +++ b/man/Expr_and.Rd @@ -7,10 +7,10 @@ Expr_and(other) } \arguments{ -\item{other}{Literal or object that can be converted to a literal} +\item{other}{numeric or string value; accepts expression input.} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ Combine two boolean expressions with AND. diff --git a/man/Expr_cosh.Rd b/man/Expr_cosh.Rd index e6b0f6545..1025ae072 100644 --- a/man/Expr_cosh.Rd +++ b/man/Expr_cosh.Rd @@ -13,6 +13,6 @@ Expr Compute hyperbolic cosine } \examples{ -pl$DataFrame(a = c(-1, acosh(0.5), 0, 1, NA_real_))$ +pl$DataFrame(a = c(-1, acosh(2), 0, 1, NA_real_))$ with_columns(cosh = pl$col("a")$cosh()) } diff --git a/man/Expr_div.Rd b/man/Expr_div.Rd index 723e8c771..afb5996f5 100644 --- a/man/Expr_div.Rd +++ b/man/Expr_div.Rd @@ -2,29 +2,40 @@ % Please edit documentation in R/expr__expr.R \name{Expr_div} \alias{Expr_div} -\alias{/.RPolarsExpr} \title{Divide two expressions} \usage{ Expr_div(other) - -\method{/}{RPolarsExpr}(e1, e2) } \arguments{ -\item{other}{Literal or object that can be converted to a literal} - -\item{e1}{Expr only} - -\item{e2}{Expr or anything that can be converted to a literal} +\item{other}{Numeric literal or expression value.} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ -The RHS can either be an Expr or an object that can be converted to a literal -(e.g an integer). +Method equivalent of float division operator \code{expr / other}. 
+} +\details{ +Zero-division behaviour follows IEEE-754: +\itemize{ +\item \code{0/0}: Invalid operation - mathematically undefined, returns \code{NaN}. +\item \code{n/0}: Dividing a finite non-zero value by zero gives a signed infinity (±Inf). +} } \examples{ -pl$lit(5) / 10 -pl$lit(5) / pl$lit(10) -pl$lit(5)$div(pl$lit(10)) +df = pl$DataFrame( + x = -2:2, + y = c(0.5, 0, 0, -4, -0.5) ) + +df$with_columns( + `x/2` = pl$col("x")$div(2), + `x/y` = pl$col("x")$div(pl$col("y")) +) +} +\seealso{ +\itemize{ +\item \link[=S3_arithmetic]{Arithmetic operators} +\item \code{\link[=Expr_floor_div]{$floor_div()}} +} } diff --git a/man/Expr_dot.Rd b/man/Expr_dot.Rd index ddd22a3ab..9679b6ff2 100644 --- a/man/Expr_dot.Rd +++ b/man/Expr_dot.Rd @@ -7,10 +7,10 @@ Expr_dot(other) } \arguments{ -\item{other}{Literal or object that can be converted to a literal} +\item{other}{numeric or string value; accepts expression input.} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ Compute the dot/inner product between two Expressions. diff --git a/man/Expr_eq.Rd b/man/Expr_eq.Rd index 27530e676..877bbccc6 100644 --- a/man/Expr_eq.Rd +++ b/man/Expr_eq.Rd @@ -2,26 +2,18 @@ % Please edit documentation in R/expr__expr.R \name{Expr_eq} \alias{Expr_eq} -\alias{==.RPolarsExpr} \title{Check equality} \usage{ Expr_eq(other) - -\method{==}{RPolarsExpr}(e1, e2) } \arguments{ -\item{other}{Literal or object that can be converted to a literal} - -\item{e1}{Expr only} - -\item{e2}{Expr or anything that can be converted to a literal} +\item{other}{numeric or string value; accepts expression input.} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ -The RHS can either be an Expr or an object that can be converted to a literal -(e.g an integer). +Method equivalent of equality operator \code{expr == other}. } \examples{ pl$lit(2) == 2 diff --git a/man/Expr_eq_missing.Rd b/man/Expr_eq_missing.Rd index 71f9d87a3..7e4d2f05c 100644 --- a/man/Expr_eq_missing.Rd +++ b/man/Expr_eq_missing.Rd @@ -7,14 +7,13 @@ Expr_eq_missing(other) } \arguments{ -\item{other}{Literal or object that can be converted to a literal} +\item{other}{numeric or string value; accepts expression input.} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ -The RHS can either be an Expr or an object that can be converted to a literal -(e.g an integer). +Method equivalent of equality operator \code{expr == other}, where \code{null} values compare as equal. } \examples{ df = pl$DataFrame(x = c(NA, FALSE, TRUE), y = c(TRUE, TRUE, TRUE)) diff --git a/man/Expr_floor_div.Rd b/man/Expr_floor_div.Rd index ee11539b1..769ad26c6 100644 --- a/man/Expr_floor_div.Rd +++ b/man/Expr_floor_div.Rd @@ -2,29 +2,31 @@ % Please edit documentation in R/expr__expr.R \name{Expr_floor_div} \alias{Expr_floor_div} -\alias{\%/\%.RPolarsExpr} \title{Floor divide two expressions} \usage{ Expr_floor_div(other) - -\method{\%/\%}{RPolarsExpr}(e1, e2) } \arguments{ -\item{other}{Literal or object that can be converted to a literal} - -\item{e1}{Expr only} - -\item{e2}{Expr or anything that can be converted to a literal} +\item{other}{Numeric literal or expression value.} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ -The RHS can either be an Expr or an object that can be converted to a literal -(e.g an integer). +Method equivalent of floor division operator \code{expr \%/\% other}.
} \examples{ -pl$lit(5) \%/\% 10 -pl$lit(5) \%/\% pl$lit(10) -pl$lit(5)$floor_div(pl$lit(10)) +df = pl$DataFrame(x = 1:5) + +df$with_columns( + `x/2` = pl$col("x")$div(2), + `x\%/\%2` = pl$col("x")$floor_div(2) +) +} +\seealso{ +\itemize{ +\item \link[=S3_arithmetic]{Arithmetic operators} +\item \code{\link[=Expr_div]{$div()}} +\item \code{\link[=Expr_mod]{$mod()}} +} } diff --git a/man/Expr_gt.Rd b/man/Expr_gt.Rd index b13cdff3b..b53e442a5 100644 --- a/man/Expr_gt.Rd +++ b/man/Expr_gt.Rd @@ -2,26 +2,18 @@ % Please edit documentation in R/expr__expr.R \name{Expr_gt} \alias{Expr_gt} -\alias{>.RPolarsExpr} \title{Check strictly greater inequality} \usage{ Expr_gt(other) - -\method{>}{RPolarsExpr}(e1, e2) } \arguments{ -\item{other}{Literal or object that can be converted to a literal} - -\item{e1}{Expr only} - -\item{e2}{Expr or anything that can be converted to a literal} +\item{other}{numeric or string value; accepts expression input.} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ -The RHS can either be an Expr or an object that can be converted to a literal -(e.g an integer). +Method equivalent of greater than operator \code{expr > other}. } \examples{ pl$lit(2) > 1 diff --git a/man/Expr_gt_eq.Rd b/man/Expr_gt_eq.Rd index 5e1afad23..42473c42d 100644 --- a/man/Expr_gt_eq.Rd +++ b/man/Expr_gt_eq.Rd @@ -2,26 +2,18 @@ % Please edit documentation in R/expr__expr.R \name{Expr_gt_eq} \alias{Expr_gt_eq} -\alias{>=.RPolarsExpr} \title{Check greater or equal inequality} \usage{ Expr_gt_eq(other) - -\method{>=}{RPolarsExpr}(e1, e2) } \arguments{ -\item{other}{Literal or object that can be converted to a literal} - -\item{e1}{Expr only} - -\item{e2}{Expr or anything that can be converted to a literal} +\item{other}{numeric or string value; accepts expression input.} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ -The RHS can either be an Expr or an object that can be converted to a literal -(e.g an integer). +Method equivalent of greater than or equal operator \code{expr >= other}. } \examples{ pl$lit(2) >= 2 diff --git a/man/Expr_is_in.Rd b/man/Expr_is_in.Rd index f6d2d69be..e62cf384b 100644 --- a/man/Expr_is_in.Rd +++ b/man/Expr_is_in.Rd @@ -7,7 +7,7 @@ Expr_is_in(other) } \arguments{ -\item{other}{Literal or object that can be converted to a literal} +\item{other}{numeric or string value; accepts expression input.} } \value{ Expr diff --git a/man/Expr_lt.Rd b/man/Expr_lt.Rd index 2e143f41b..ed75050b8 100644 --- a/man/Expr_lt.Rd +++ b/man/Expr_lt.Rd @@ -2,26 +2,18 @@ % Please edit documentation in R/expr__expr.R \name{Expr_lt} \alias{Expr_lt} -\alias{<.RPolarsExpr} \title{Check strictly lower inequality} \usage{ Expr_lt(other) - -\method{<}{RPolarsExpr}(e1, e2) } \arguments{ -\item{other}{Literal or object that can be converted to a literal} - -\item{e1}{Expr only} - -\item{e2}{Expr or anything that can be converted to a literal} +\item{other}{numeric or string value; accepts expression input.} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ -The RHS can either be an Expr or an object that can be converted to a literal -(e.g an integer). +Method equivalent of less than operator \code{expr < other}.
} \examples{ pl$lit(5) < 10 diff --git a/man/Expr_lt_eq.Rd b/man/Expr_lt_eq.Rd index 906846774..d31f18eee 100644 --- a/man/Expr_lt_eq.Rd +++ b/man/Expr_lt_eq.Rd @@ -2,26 +2,18 @@ % Please edit documentation in R/expr__expr.R \name{Expr_lt_eq} \alias{Expr_lt_eq} -\alias{<=.RPolarsExpr} \title{Check lower or equal inequality} \usage{ Expr_lt_eq(other) - -\method{<=}{RPolarsExpr}(e1, e2) } \arguments{ -\item{other}{Literal or object that can be converted to a literal} - -\item{e1}{Expr only} - -\item{e2}{Expr or anything that can be converted to a literal} +\item{other}{numeric or string value; accepts expression input.} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ -The RHS can either be an Expr or an object that can be converted to a literal -(e.g an integer). +Method equivalent of less than or equal operator \code{expr <= other}. } \examples{ pl$lit(2) <= 2 diff --git a/man/Expr_mod.Rd b/man/Expr_mod.Rd index ca538c3e1..5e4916eeb 100644 --- a/man/Expr_mod.Rd +++ b/man/Expr_mod.Rd @@ -2,40 +2,29 @@ % Please edit documentation in R/expr__expr.R \name{Expr_mod} \alias{Expr_mod} -\alias{\%\%.RPolarsExpr} \title{Modulo two expressions} \usage{ Expr_mod(other) - -\method{\%\%}{RPolarsExpr}(e1, e2) } \arguments{ -\item{other}{Literal or object that can be converted to a literal} - -\item{e1}{Expr only} - -\item{e2}{Expr or anything that can be converted to a literal} +\item{other}{Numeric literal or expression value.} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ -The RHS can either be an Expr or an object that can be converted to a literal -(e.g an integer). -} -\details{ -Currently, the modulo operator behaves differently than in R, -and not guaranteed \code{x == (x \%\% y) + y * (x \%/\% y)}. +Method equivalent of modulus operator \code{expr \%\% other}. } \examples{ -pl$select(pl$lit(-1:12) \%\% 3)$to_series()$to_vector() +df = pl$DataFrame(x = -5L:5L) -# The example is **NOT** equivalent to the followings: --1:12 \%\% 3 -pl$select(-1:12 \%\% 3)$to_series()$to_vector() - -# Not guaranteed `x == (x \%\% y) + y * (x \%/\% y)` -x = pl$lit(-1:12) -y = pl$lit(3) -pl$select(x == (x \%\% y) + y * (x \%/\% y)) +df$with_columns( + `x\%\%2` = pl$col("x")$mod(2) +) +} +\seealso{ +\itemize{ +\item \link[=S3_arithmetic]{Arithmetic operators} +\item \code{\link[=Expr_floor_div]{$floor_div()}} +} } diff --git a/man/Expr_mul.Rd b/man/Expr_mul.Rd index ea46e2d3a..592f7bcbd 100644 --- a/man/Expr_mul.Rd +++ b/man/Expr_mul.Rd @@ -2,29 +2,29 @@ % Please edit documentation in R/expr__expr.R \name{Expr_mul} \alias{Expr_mul} -\alias{*.RPolarsExpr} \title{Multiply two expressions} \usage{ Expr_mul(other) - -\method{*}{RPolarsExpr}(e1, e2) } \arguments{ -\item{other}{Literal or object that can be converted to a literal} - -\item{e1}{Expr only} - -\item{e2}{Expr or anything that can be converted to a literal} +\item{other}{Numeric literal or expression value.} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ -The RHS can either be an Expr or an object that can be converted to a literal -(e.g an integer). +Method equivalent of multiplication operator \code{expr * other}.
} \examples{ -pl$lit(5) * 10 -pl$lit(5) * pl$lit(10) -pl$lit(5)$mul(pl$lit(10)) +df = pl$DataFrame(x = c(1, 2, 4, 8, 16)) + +df$with_columns( + `x*2` = pl$col("x")$mul(2), + `x * xlog2` = pl$col("x")$mul(pl$col("x")$log(2)) +) +} +\seealso{ +\itemize{ +\item \link[=S3_arithmetic]{Arithmetic operators} +} } diff --git a/man/Expr_neq.Rd b/man/Expr_neq.Rd index 7d9a585d3..67f08b750 100644 --- a/man/Expr_neq.Rd +++ b/man/Expr_neq.Rd @@ -2,26 +2,18 @@ % Please edit documentation in R/expr__expr.R \name{Expr_neq} \alias{Expr_neq} -\alias{!=.RPolarsExpr} \title{Check inequality} \usage{ Expr_neq(other) - -\method{!=}{RPolarsExpr}(e1, e2) } \arguments{ -\item{other}{Literal or object that can be converted to a literal} - -\item{e1}{Expr only} - -\item{e2}{Expr or anything that can be converted to a literal} +\item{other}{numeric or string value; accepts expression input.} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ -The RHS can either be an Expr or an object that can be converted to a literal -(e.g an integer). +Method equivalent of inequality operator \code{expr != other}. } \examples{ pl$lit(1) != 2 diff --git a/man/Expr_neq_missing.Rd b/man/Expr_neq_missing.Rd index a23ac6bcc..10048887b 100644 --- a/man/Expr_neq_missing.Rd +++ b/man/Expr_neq_missing.Rd @@ -7,14 +7,13 @@ Expr_neq_missing(other) } \arguments{ -\item{other}{Literal or object that can be converted to a literal} +\item{other}{numeric or string value; accepts expression input.} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ -The RHS can either be an Expr or an object that can be converted to a literal -(e.g an integer). +Method equivalent of inequality operator \code{expr != other}, where \code{null} values are not propagated. } \examples{ df = pl$DataFrame(x = c(NA, FALSE, TRUE), y = c(TRUE, TRUE, TRUE)) diff --git a/man/Expr_not.Rd b/man/Expr_not.Rd index d5dfc86e7..1129d8a46 100644 --- a/man/Expr_not.Rd +++ b/man/Expr_not.Rd @@ -2,22 +2,15 @@ % Please edit documentation in R/expr__expr.R \name{Expr_not} \alias{Expr_not} -\alias{!.RPolarsExpr} \title{Negate a boolean expression} \usage{ Expr_not() - -\method{!}{RPolarsExpr}(x) -} -\arguments{ -\item{x}{Expr} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ -The RHS can either be an Expr or an object that can be converted to a literal -(e.g an integer). +Method equivalent of negation operator \code{!expr}. } \examples{ # two syntaxes same result diff --git a/man/Expr_or.Rd b/man/Expr_or.Rd index 9ab2ee3f9..6ea800463 100644 --- a/man/Expr_or.Rd +++ b/man/Expr_or.Rd @@ -7,10 +7,10 @@ Expr_or(other) } \arguments{ -\item{other}{Literal or object that can be converted to a literal} +\item{other}{numeric or string value; accepts expression input.} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ Combine two boolean expressions with OR. diff --git a/man/Expr_pow.Rd b/man/Expr_pow.Rd index c09428668..0588275fe 100644 --- a/man/Expr_pow.Rd +++ b/man/Expr_pow.Rd @@ -2,23 +2,29 @@ % Please edit documentation in R/expr__expr.R \name{Expr_pow} \alias{Expr_pow} -\title{Exponentiation} +\title{Exponentiate two expressions} \usage{ Expr_pow(exponent) } \arguments{ -\item{exponent}{Exponent value.} +\item{exponent}{Numeric literal or expression value.} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ -Raise expression to the power of exponent. +Method equivalent of exponentiation operator \code{expr ^ exponent}.
} \examples{ -# use via `pow`-method and the `^`-operator -pl$DataFrame(a = -1:3, b = 2:6)$with_columns( - x = pl$col("a")$pow(2), - y = pl$col("a")^3 +df = pl$DataFrame(x = c(1, 2, 4, 8)) + +df$with_columns( + cube = pl$col("x")$pow(3), + `x^xlog2` = pl$col("x")$pow(pl$col("x")$log(2)) ) } +\seealso{ +\itemize{ +\item \link[=S3_arithmetic]{Arithmetic operators} +} +} diff --git a/man/Expr_rolling.Rd b/man/Expr_rolling.Rd index 0f106ab95..79c0eb9e4 100644 --- a/man/Expr_rolling.Rd +++ b/man/Expr_rolling.Rd @@ -95,7 +95,7 @@ dates = c( df = pl$DataFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$ with_columns( - pl$col("dt")$str$strptime(pl$Datetime(tu = "us"), format = "\%Y-\%m-\%d \%H:\%M:\%S")$set_sorted() + pl$col("dt")$str$strptime(pl$Datetime("us"), format = "\%Y-\%m-\%d \%H:\%M:\%S")$set_sorted() ) df$with_columns( diff --git a/man/Expr_sub.Rd b/man/Expr_sub.Rd index 2ce082db0..2ba5e235d 100644 --- a/man/Expr_sub.Rd +++ b/man/Expr_sub.Rd @@ -2,30 +2,29 @@ % Please edit documentation in R/expr__expr.R \name{Expr_sub} \alias{Expr_sub} -\alias{-.RPolarsExpr} \title{Substract two expressions} \usage{ Expr_sub(other) - -\method{-}{RPolarsExpr}(e1, e2) } \arguments{ -\item{other}{Literal or object that can be converted to a literal} - -\item{e1}{Expr only} - -\item{e2}{Expr or anything that can be converted to a literal} +\item{other}{Numeric literal or expression value.} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ -The RHS can either be an Expr or an object that can be converted to a literal -(e.g an integer). +Method equivalent of subtraction operator \code{expr - other}. } \examples{ -pl$lit(5) - 10 -pl$lit(5) - pl$lit(10) -pl$lit(5)$sub(pl$lit(10)) --pl$lit(5) +df = pl$DataFrame(x = 0:4) + +df$with_columns( + `x-2` = pl$col("x")$sub(2), + `x-expr` = pl$col("x")$sub(pl$col("x")$cum_sum()) +) +} +\seealso{ +\itemize{ +\item \link[=S3_arithmetic]{Arithmetic operators} +} } diff --git a/man/Expr_xor.Rd b/man/Expr_xor.Rd index 5bceb032b..20bd977d2 100644 --- a/man/Expr_xor.Rd +++ b/man/Expr_xor.Rd @@ -7,10 +7,10 @@ Expr_xor(other) } \arguments{ -\item{other}{Literal or object that can be converted to a literal} +\item{other}{numeric or string value; accepts expression input.} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ Combine two boolean expressions with XOR. diff --git a/man/LazyFrame_class.Rd b/man/LazyFrame_class.Rd index 308c27783..936f079d9 100644 --- a/man/LazyFrame_class.Rd +++ b/man/LazyFrame_class.Rd @@ -54,6 +54,50 @@ SQL DBs and other data sources such parquet files simultaneously. } } +\section{Conversion to R data types considerations}{ + +When converting Polars objects, such as \link[=DataFrame_class]{DataFrames} +to R objects, for example via the \code{\link[=as.data.frame.RPolarsDataFrame]{as.data.frame()}} generic function, +each type in the Polars object is converted to an R type. +In some cases, an error may occur because the conversion is not appropriate. +In particular, there is a high possibility of an error when converting +a \link[=DataType_Datetime]{Datetime} type without a time zone. +A \link[=DataType_Datetime]{Datetime} type without a time zone in Polars is converted +to the \link{POSIXct} type in R, which takes into account the time zone in which +the R session is running (which can be checked with the \code{\link[=Sys.timezone]{Sys.timezone()}} +function). In this case, if ambiguous times are included, a conversion error +will occur. 
In such cases, change the session time zone using +\code{\link[base:Sys.setenv]{Sys.setenv(TZ = "UTC")}} and then perform the conversion, or use the +\code{\link[=ExprDT_replace_time_zone]{$dt$replace_time_zone()}} method on the Datetime type column to +explicitly specify the time zone before conversion. + +\if{html}{\out{
}}\preformatted{# Due to daylight savings, clocks were turned forward 1 hour on Sunday, March 8, 2020, 2:00:00 am +# so this particular date-time doesn't exist +non_existent_time = pl$Series("2020-03-08 02:00:00")$str$strptime(pl$Datetime(), "\%F \%T") + +withr::with_envvar( + new = c(TZ = "America/New_York"), + \{ + tryCatch( + # This causes an error due to the time zone (the `TZ` env var is affected). + as.vector(non_existent_time), + error = function(e) e + ) + \} +) +#> + +withr::with_envvar( + new = c(TZ = "America/New_York"), + \{ + # This is safe. + as.vector(non_existent_time$dt$replace_time_zone("UTC")) + \} +) +#> [1] "2020-03-08 02:00:00 UTC" +}\if{html}{\out{
}} +} + \examples{ # see all exported methods ls(.pr$env$RPolarsLazyFrame) diff --git a/man/LazyFrame_group_by_dynamic.Rd b/man/LazyFrame_group_by_dynamic.Rd index c24614559..fab7c82ef 100644 --- a/man/LazyFrame_group_by_dynamic.Rd +++ b/man/LazyFrame_group_by_dynamic.Rd @@ -123,8 +123,7 @@ lf = pl$LazyFrame( time = pl$date_range( start = strptime("2021-12-16 00:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), - interval = "30m", - eager = TRUE, + interval = "30m" ), n = 0:6 ) diff --git a/man/S3_arithmetic.Rd b/man/S3_arithmetic.Rd new file mode 100644 index 000000000..2508499ec --- /dev/null +++ b/man/S3_arithmetic.Rd @@ -0,0 +1,96 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/s3-methods-operator.R +\name{S3_arithmetic} +\alias{S3_arithmetic} +\alias{+.RPolarsExpr} +\alias{-.RPolarsExpr} +\alias{*.RPolarsExpr} +\alias{/.RPolarsExpr} +\alias{^.RPolarsExpr} +\alias{\%\%.RPolarsExpr} +\alias{\%/\%.RPolarsExpr} +\alias{+.RPolarsSeries} +\alias{-.RPolarsSeries} +\alias{*.RPolarsSeries} +\alias{/.RPolarsSeries} +\alias{^.RPolarsSeries} +\alias{\%\%.RPolarsSeries} +\alias{\%/\%.RPolarsSeries} +\title{Arithmetic operators for RPolars objects} +\usage{ +\method{+}{RPolarsExpr}(x, y) + +\method{-}{RPolarsExpr}(x, y) + +\method{*}{RPolarsExpr}(x, y) + +\method{/}{RPolarsExpr}(x, y) + +\method{^}{RPolarsExpr}(x, y) + +\method{\%\%}{RPolarsExpr}(x, y) + +\method{\%/\%}{RPolarsExpr}(x, y) + +\method{+}{RPolarsSeries}(x, y) + +\method{-}{RPolarsSeries}(x, y) + +\method{*}{RPolarsSeries}(x, y) + +\method{/}{RPolarsSeries}(x, y) + +\method{^}{RPolarsSeries}(x, y) + +\method{\%\%}{RPolarsSeries}(x, y) + +\method{\%/\%}{RPolarsSeries}(x, y) +} +\arguments{ +\item{x, y}{numeric type of RPolars objects or objects that can be coerced such. +Only \code{+} can take strings.} +} +\value{ +A Polars object the same type as the input. +} +\description{ +Arithmetic operators for RPolars objects +} +\examples{ +pl$lit(5) + 10 +5 + pl$lit(10) +pl$lit(5) + pl$lit(10) ++pl$lit(1) + +# This will not raise an error as it is not actually evaluated. +expr = pl$lit(5) + "10" +expr + +# Will raise an error as it is evaluated. 
+tryCatch( + expr$to_series(), + error = function(e) e +) + +pl$Series(5) + 10 ++pl$Series(5) +-pl$Series(5) +} +\seealso{ +\itemize{ +\item \code{\link[=Expr_add]{$add()}} +\item \code{\link[=Expr_sub]{$sub()}} +\item \code{\link[=Expr_mul]{$mul()}} +\item \code{\link[=Expr_div]{$div()}} +\item \code{\link[=Expr_pow]{$pow()}} +\item \code{\link[=Expr_mod]{$mod()}} +\item \code{\link[=Expr_floor_div]{$floor_div()}} +\item \code{\link[=Series_add]{$add()}} +\item \code{\link[=Series_sub]{$sub()}} +\item \code{\link[=Series_mul]{$mul()}} +\item \code{\link[=Series_div]{$div()}} +\item \code{\link[=Series_pow]{$pow()}} +\item \code{\link[=Series_mod]{$mod()}} +\item \code{\link[=Series_floor_div]{$floor_div()}} +} +} diff --git a/man/S3_as.character.Rd b/man/S3_as.character.Rd index df3eb5002..cecee4577 100644 --- a/man/S3_as.character.Rd +++ b/man/S3_as.character.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit documentation in R/s3-methods.R \name{as.character.RPolarsSeries} \alias{as.character.RPolarsSeries} \title{Convert to a character vector} diff --git a/man/S3_as.data.frame.Rd b/man/S3_as.data.frame.Rd index 450a9e1be..c89e13cc2 100644 --- a/man/S3_as.data.frame.Rd +++ b/man/S3_as.data.frame.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit documentation in R/s3-methods.R \name{as.data.frame.RPolarsDataFrame} \alias{as.data.frame.RPolarsDataFrame} \alias{as.data.frame.RPolarsLazyFrame} @@ -78,6 +78,50 @@ into the resulting DataFrame. Useful in interactive mode to not lock R session.} \description{ Equivalent to \code{as_polars_df(x, ...)$to_data_frame(...)}. } +\section{Conversion to R data types considerations}{ + +When converting Polars objects, such as \link[=DataFrame_class]{DataFrames} +to R objects, for example via the \code{\link[=as.data.frame.RPolarsDataFrame]{as.data.frame()}} generic function, +each type in the Polars object is converted to an R type. +In some cases, an error may occur because the conversion is not appropriate. +In particular, there is a high possibility of an error when converting +a \link[=DataType_Datetime]{Datetime} type without a time zone. +A \link[=DataType_Datetime]{Datetime} type without a time zone in Polars is converted +to the \link{POSIXct} type in R, which takes into account the time zone in which +the R session is running (which can be checked with the \code{\link[=Sys.timezone]{Sys.timezone()}} +function). In this case, if ambiguous times are included, a conversion error +will occur. In such cases, change the session time zone using +\code{\link[base:Sys.setenv]{Sys.setenv(TZ = "UTC")}} and then perform the conversion, or use the +\code{\link[=ExprDT_replace_time_zone]{$dt$replace_time_zone()}} method on the Datetime type column to +explicitly specify the time zone before conversion. + +\if{html}{\out{
}}\preformatted{# Due to daylight savings, clocks were turned forward 1 hour on Sunday, March 8, 2020, 2:00:00 am +# so this particular date-time doesn't exist +non_existent_time = pl$Series("2020-03-08 02:00:00")$str$strptime(pl$Datetime(), "\%F \%T") + +withr::with_envvar( + new = c(TZ = "America/New_York"), + \{ + tryCatch( + # This causes an error due to the time zone (the `TZ` env var is affected). + as.vector(non_existent_time), + error = function(e) e + ) + \} +) +#> + +withr::with_envvar( + new = c(TZ = "America/New_York"), + \{ + # This is safe. + as.vector(non_existent_time$dt$replace_time_zone("UTC")) + \} +) +#> [1] "2020-03-08 02:00:00 UTC" +}\if{html}{\out{
}} +} + \seealso{ \itemize{ \item \code{\link[=as_polars_df]{as_polars_df()}} diff --git a/man/S3_as.matrix.Rd b/man/S3_as.matrix.Rd index 599ce79c4..f7912a426 100644 --- a/man/S3_as.matrix.Rd +++ b/man/S3_as.matrix.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit documentation in R/s3-methods.R \name{as.matrix.RPolarsDataFrame} \alias{as.matrix.RPolarsDataFrame} \alias{as.matrix.RPolarsLazyFrame} diff --git a/man/S3_as.vector.Rd b/man/S3_as.vector.Rd index 072c6250f..4ea9b8ac3 100644 --- a/man/S3_as.vector.Rd +++ b/man/S3_as.vector.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit documentation in R/s3-methods.R \name{as.vector.RPolarsSeries} \alias{as.vector.RPolarsSeries} \title{Convert to a vector} @@ -14,3 +14,47 @@ \description{ Convert to a vector } +\section{Conversion to R data types considerations}{ + +When converting Polars objects, such as \link[=DataFrame_class]{DataFrames} +to R objects, for example via the \code{\link[=as.data.frame.RPolarsDataFrame]{as.data.frame()}} generic function, +each type in the Polars object is converted to an R type. +In some cases, an error may occur because the conversion is not appropriate. +In particular, there is a high possibility of an error when converting +a \link[=DataType_Datetime]{Datetime} type without a time zone. +A \link[=DataType_Datetime]{Datetime} type without a time zone in Polars is converted +to the \link{POSIXct} type in R, which takes into account the time zone in which +the R session is running (which can be checked with the \code{\link[=Sys.timezone]{Sys.timezone()}} +function). In this case, if ambiguous times are included, a conversion error +will occur. In such cases, change the session time zone using +\code{\link[base:Sys.setenv]{Sys.setenv(TZ = "UTC")}} and then perform the conversion, or use the +\code{\link[=ExprDT_replace_time_zone]{$dt$replace_time_zone()}} method on the Datetime type column to +explicitly specify the time zone before conversion. + +\if{html}{\out{
}}\preformatted{# Due to daylight savings, clocks were turned forward 1 hour on Sunday, March 8, 2020, 2:00:00 am +# so this particular date-time doesn't exist +non_existent_time = pl$Series("2020-03-08 02:00:00")$str$strptime(pl$Datetime(), "\%F \%T") + +withr::with_envvar( + new = c(TZ = "America/New_York"), + \{ + tryCatch( + # This causes an error due to the time zone (the `TZ` env var is affected). + as.vector(non_existent_time), + error = function(e) e + ) + \} +) +#> + +withr::with_envvar( + new = c(TZ = "America/New_York"), + \{ + # This is safe. + as.vector(non_existent_time$dt$replace_time_zone("UTC")) + \} +) +#> [1] "2020-03-08 02:00:00 UTC" +}\if{html}{\out{
}} +} + diff --git a/man/S3_c.Rd b/man/S3_c.Rd index a89af53d0..8cc6a5d4d 100644 --- a/man/S3_c.Rd +++ b/man/S3_c.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit documentation in R/s3-methods.R \name{c.RPolarsSeries} \alias{c.RPolarsSeries} \title{Combine to a Series} diff --git a/man/S3_dim.Rd b/man/S3_dim.Rd index 2e36a44b7..4492db4c4 100644 --- a/man/S3_dim.Rd +++ b/man/S3_dim.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit documentation in R/s3-methods.R \name{dim.RPolarsDataFrame} \alias{dim.RPolarsDataFrame} \alias{dim.RPolarsLazyFrame} diff --git a/man/S3_dimnames.Rd b/man/S3_dimnames.Rd index 7b4d6f57a..497750c4e 100644 --- a/man/S3_dimnames.Rd +++ b/man/S3_dimnames.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit documentation in R/s3-methods.R \name{dimnames.RPolarsDataFrame} \alias{dimnames.RPolarsDataFrame} \alias{dimnames.RPolarsLazyFrame} diff --git a/man/S3_extract.Rd b/man/S3_extract.Rd index 3a072626f..741132c92 100644 --- a/man/S3_extract.Rd +++ b/man/S3_extract.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit documentation in R/s3-methods.R \name{[.RPolarsDataFrame} \alias{[.RPolarsDataFrame} \alias{[.RPolarsLazyFrame} diff --git a/man/S3_head.Rd b/man/S3_head.Rd index 1310815f2..ff265de60 100644 --- a/man/S3_head.Rd +++ b/man/S3_head.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit documentation in R/s3-methods.R \name{head.RPolarsDataFrame} \alias{head.RPolarsDataFrame} \alias{head.RPolarsLazyFrame} diff --git a/man/S3_length.Rd b/man/S3_length.Rd index 05b014283..4938a1e18 100644 --- a/man/S3_length.Rd +++ b/man/S3_length.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit documentation in R/s3-methods.R \name{length.RPolarsDataFrame} \alias{length.RPolarsDataFrame} \alias{length.RPolarsLazyFrame} diff --git a/man/S3_max.Rd b/man/S3_max.Rd index 592c6e4eb..3a4487da6 100644 --- a/man/S3_max.Rd +++ b/man/S3_max.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit documentation in R/s3-methods.R \name{max.RPolarsDataFrame} \alias{max.RPolarsDataFrame} \alias{max.RPolarsLazyFrame} diff --git a/man/S3_mean.Rd b/man/S3_mean.Rd index 9ea9bf8ec..a7115c64f 100644 --- a/man/S3_mean.Rd +++ b/man/S3_mean.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit documentation in R/s3-methods.R \name{mean.RPolarsDataFrame} \alias{mean.RPolarsDataFrame} \alias{mean.RPolarsLazyFrame} diff --git a/man/S3_median.Rd b/man/S3_median.Rd index 801f56467..417b661fd 100644 --- a/man/S3_median.Rd +++ b/man/S3_median.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit documentation in R/s3-methods.R \name{median.RPolarsDataFrame} \alias{median.RPolarsDataFrame} \alias{median.RPolarsLazyFrame} diff --git a/man/S3_min.Rd b/man/S3_min.Rd index 7b18af496..1d0572956 100644 --- a/man/S3_min.Rd +++ b/man/S3_min.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit 
documentation in R/s3-methods.R \name{min.RPolarsDataFrame} \alias{min.RPolarsDataFrame} \alias{min.RPolarsLazyFrame} diff --git a/man/S3_na.omit.Rd b/man/S3_na.omit.Rd index 3babd0602..13ad768ed 100644 --- a/man/S3_na.omit.Rd +++ b/man/S3_na.omit.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit documentation in R/s3-methods.R \name{na.omit.RPolarsLazyFrame} \alias{na.omit.RPolarsLazyFrame} \alias{na.omit.RPolarsDataFrame} diff --git a/man/S3_names.Rd b/man/S3_names.Rd index 8f14307fe..2808de816 100644 --- a/man/S3_names.Rd +++ b/man/S3_names.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit documentation in R/s3-methods.R \name{names.RPolarsDataFrame} \alias{names.RPolarsDataFrame} \alias{names.RPolarsLazyFrame} diff --git a/man/S3_print.Rd b/man/S3_print.Rd index 748aa7a31..337eb511f 100644 --- a/man/S3_print.Rd +++ b/man/S3_print.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit documentation in R/s3-methods.R \name{print.RPolarsSeries} \alias{print.RPolarsSeries} \title{Print values} diff --git a/man/S3_rownames.Rd b/man/S3_rownames.Rd index adcd18e9e..65388da7e 100644 --- a/man/S3_rownames.Rd +++ b/man/S3_rownames.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit documentation in R/s3-methods.R \name{row.names.RPolarsDataFrame} \alias{row.names.RPolarsDataFrame} \title{Get the row names} diff --git a/man/S3_sum.Rd b/man/S3_sum.Rd index 2a6495566..6aef47768 100644 --- a/man/S3_sum.Rd +++ b/man/S3_sum.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit documentation in R/s3-methods.R \name{sum.RPolarsDataFrame} \alias{sum.RPolarsDataFrame} \alias{sum.RPolarsLazyFrame} diff --git a/man/S3_unique.Rd b/man/S3_unique.Rd index c3b9651e9..9b2505e7c 100644 --- a/man/S3_unique.Rd +++ b/man/S3_unique.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_methods.R +% Please edit documentation in R/s3-methods.R \name{unique.RPolarsDataFrame} \alias{unique.RPolarsDataFrame} \alias{unique.RPolarsLazyFrame} diff --git a/man/Series_add.Rd b/man/Series_add.Rd index c460100b6..ec237f4b8 100644 --- a/man/Series_add.Rd +++ b/man/Series_add.Rd @@ -2,32 +2,29 @@ % Please edit documentation in R/series__series.R \name{Series_add} \alias{Series_add} -\alias{add} -\alias{+.RPolarsSeries} -\title{add Series} +\title{Add Series} \usage{ Series_add(other) - -\method{+}{RPolarsSeries}(s1, s2) } \arguments{ -\item{other}{Series or into Series} - -\item{s1}{lhs Series} - -\item{s2}{rhs Series or any into Series} +\item{other}{\link[=Series_class]{Series} like object of numeric or string values. +Converted to \link[=Series_class]{Series} by \code{\link[=as_polars_series]{as_polars_series()}} in this method.} } \value{ -Series +\link[=Series_class]{Series} } \description{ -Series arithmetics +Method equivalent of addition operator \code{series + other}. 
} \examples{ -pl$Series(1:3)$add(11:13) pl$Series(1:3)$add(pl$Series(11:13)) +pl$Series(1:3)$add(11:13) pl$Series(1:3)$add(1L) -1L + pl$Series(1:3) -pl$Series(1:3) + 1L + +pl$Series("a")$add("-z") +} +\seealso{ +\itemize{ +\item \link[=S3_arithmetic]{Arithmetic operators} +} } -\keyword{Series} diff --git a/man/Series_class.Rd b/man/Series_class.Rd index ecf0dcc16..592459474 100644 --- a/man/Series_class.Rd +++ b/man/Series_class.Rd @@ -106,6 +106,50 @@ Some of these are stored in sub-namespaces. } } +\section{Conversion to R data types considerations}{ + +When converting Polars objects, such as \link[=DataFrame_class]{DataFrames} +to R objects, for example via the \code{\link[=as.data.frame.RPolarsDataFrame]{as.data.frame()}} generic function, +each type in the Polars object is converted to an R type. +In some cases, an error may occur because the conversion is not appropriate. +In particular, there is a high possibility of an error when converting +a \link[=DataType_Datetime]{Datetime} type without a time zone. +A \link[=DataType_Datetime]{Datetime} type without a time zone in Polars is converted +to the \link{POSIXct} type in R, which takes into account the time zone in which +the R session is running (which can be checked with the \code{\link[=Sys.timezone]{Sys.timezone()}} +function). In this case, if ambiguous times are included, a conversion error +will occur. In such cases, change the session time zone using +\code{\link[base:Sys.setenv]{Sys.setenv(TZ = "UTC")}} and then perform the conversion, or use the +\code{\link[=ExprDT_replace_time_zone]{$dt$replace_time_zone()}} method on the Datetime type column to +explicitly specify the time zone before conversion. + +\if{html}{\out{
}}\preformatted{# Due to daylight savings, clocks were turned forward 1 hour on Sunday, March 8, 2020, 2:00:00 am +# so this particular date-time doesn't exist +non_existent_time = pl$Series("2020-03-08 02:00:00")$str$strptime(pl$Datetime(), "\%F \%T") + +withr::with_envvar( + new = c(TZ = "America/New_York"), + \{ + tryCatch( + # This causes an error due to the time zone (the `TZ` env var is affected). + as.vector(non_existent_time), + error = function(e) e + ) + \} +) +#> + +withr::with_envvar( + new = c(TZ = "America/New_York"), + \{ + # This is safe. + as.vector(non_existent_time$dt$replace_time_zone("UTC")) + \} +) +#> [1] "2020-03-08 02:00:00 UTC" +}\if{html}{\out{
}} +} + \examples{ # make a Series s = pl$Series(c(1:3, 1L)) diff --git a/man/Series_div.Rd b/man/Series_div.Rd index 275b8ac09..3fc155297 100644 --- a/man/Series_div.Rd +++ b/man/Series_div.Rd @@ -2,32 +2,27 @@ % Please edit documentation in R/series__series.R \name{Series_div} \alias{Series_div} -\alias{div} -\alias{/.RPolarsSeries} -\title{div Series} +\title{Divide Series} \usage{ Series_div(other) - -\method{/}{RPolarsSeries}(s1, s2) } \arguments{ -\item{other}{Series or into Series} - -\item{s1}{lhs Series} - -\item{s2}{rhs Series or any into Series} +\item{other}{\link[=Series_class]{Series} like object of numeric. +Converted to \link[=Series_class]{Series} by \code{\link[=as_polars_series]{as_polars_series()}} in this method.} } \value{ -Series +\link[=Series_class]{Series} } \description{ -Series arithmetics +Method equivalent of division operator \code{series / other}. } \examples{ pl$Series(1:3)$div(11:13) pl$Series(1:3)$div(pl$Series(11:13)) pl$Series(1:3)$div(1L) -2L / pl$Series(1:3) -pl$Series(1:3) / 2L } -\keyword{Series} +\seealso{ +\itemize{ +\item \link[=S3_arithmetic]{Arithmetic operators} +} +} diff --git a/man/Series_floor_div.Rd b/man/Series_floor_div.Rd new file mode 100644 index 000000000..ad6064434 --- /dev/null +++ b/man/Series_floor_div.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/series__series.R +\name{Series_floor_div} +\alias{Series_floor_div} +\title{Floor Divide Series} +\usage{ +Series_floor_div(other) +} +\arguments{ +\item{other}{\link[=Series_class]{Series} like object of numeric. +Converted to \link[=Series_class]{Series} by \code{\link[=as_polars_series]{as_polars_series()}} in this method.} +} +\value{ +\link[=Series_class]{Series} +} +\description{ +Method equivalent of floor division operator \code{series \%/\% other}. +} +\examples{ +pl$Series(1:3)$floor_div(11:13) +pl$Series(1:3)$floor_div(pl$Series(11:13)) +pl$Series(1:3)$floor_div(1L) +} +\seealso{ +\itemize{ +\item \link[=S3_arithmetic]{Arithmetic operators} +} +} diff --git a/man/Series_mod.Rd b/man/Series_mod.Rd new file mode 100644 index 000000000..70c96a612 --- /dev/null +++ b/man/Series_mod.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/series__series.R +\name{Series_mod} +\alias{Series_mod} +\title{Modulo Series} +\usage{ +Series_mod(other) +} +\arguments{ +\item{other}{\link[=Series_class]{Series} like object of numeric. +Converted to \link[=Series_class]{Series} by \code{\link[=as_polars_series]{as_polars_series()}} in this method.} +} +\value{ +\link[=Series_class]{Series} +} +\description{ +Method equivalent of modulo operator \code{series \%\% other}. +} +\examples{ +pl$Series(1:4)$mod(2L) +pl$Series(1:3)$mod(pl$Series(11:13)) +pl$Series(1:3)$mod(1L) +} +\seealso{ +\itemize{ +\item \link[=S3_arithmetic]{Arithmetic operators} +} +} diff --git a/man/Series_mul.Rd b/man/Series_mul.Rd index 562db9d9a..b33b55d29 100644 --- a/man/Series_mul.Rd +++ b/man/Series_mul.Rd @@ -2,32 +2,27 @@ % Please edit documentation in R/series__series.R \name{Series_mul} \alias{Series_mul} -\alias{mul} -\alias{*.RPolarsSeries} -\title{mul Series} +\title{Multiply Series} \usage{ Series_mul(other) - -\method{*}{RPolarsSeries}(s1, s2) } \arguments{ -\item{other}{Series or into Series} - -\item{s1}{lhs Series} - -\item{s2}{rhs Series or any into Series} +\item{other}{\link[=Series_class]{Series} like object of numeric. 
+Converted to \link[=Series_class]{Series} by \code{\link[=as_polars_series]{as_polars_series()}} in this method.} } \value{ -Series +\link[=Series_class]{Series} } \description{ -Series arithmetics +Method equivalent of multiplication operator \code{series * other}. } \examples{ pl$Series(1:3)$mul(11:13) pl$Series(1:3)$mul(pl$Series(11:13)) pl$Series(1:3)$mul(1L) -2L * pl$Series(1:3) -pl$Series(1:3) * 2L } -\keyword{Series} +\seealso{ +\itemize{ +\item \link[=S3_arithmetic]{Arithmetic operators} +} +} diff --git a/man/Series_pow.Rd b/man/Series_pow.Rd new file mode 100644 index 000000000..173720b85 --- /dev/null +++ b/man/Series_pow.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/series__series.R +\name{Series_pow} +\alias{Series_pow} +\title{Power Series} +\usage{ +Series_pow(exponent) +} +\arguments{ +\item{exponent}{\link[=Series_class]{Series} like object of numeric. +Converted to \link[=Series_class]{Series} by \code{\link[=as_polars_series]{as_polars_series()}} in this method.} +} +\value{ +\link[=Series_class]{Series} +} +\description{ +Method equivalent of power operator \code{series ^ other}. +} +\examples{ +s = as_polars_series(1:4, name = "foo") + +s$pow(3L) +} +\seealso{ +\itemize{ +\item \link[=S3_arithmetic]{Arithmetic operators} +} +} diff --git a/man/Series_rem.Rd b/man/Series_rem.Rd deleted file mode 100644 index df9e7ef2e..000000000 --- a/man/Series_rem.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/series__series.R -\name{Series_rem} -\alias{Series_rem} -\alias{rem} -\title{rem Series} -\usage{ -Series_rem(other) -} -\arguments{ -\item{other}{Series or into Series} -} -\value{ -Series -} -\description{ -Series arithmetics, remainder -} -\examples{ -pl$Series(1:4)$rem(2L) -pl$Series(1:3)$rem(pl$Series(11:13)) -pl$Series(1:3)$rem(1L) -} -\keyword{Series} diff --git a/man/Series_sub.Rd b/man/Series_sub.Rd index d8e634cba..fd25ecae2 100644 --- a/man/Series_sub.Rd +++ b/man/Series_sub.Rd @@ -2,26 +2,19 @@ % Please edit documentation in R/series__series.R \name{Series_sub} \alias{Series_sub} -\alias{sub} -\alias{-.RPolarsSeries} -\title{sub Series} +\title{Subtract Series} \usage{ Series_sub(other) - -\method{-}{RPolarsSeries}(s1, s2) } \arguments{ -\item{other}{Series or into Series} - -\item{s1}{lhs Series} - -\item{s2}{rhs Series or any into Series} +\item{other}{\link[=Series_class]{Series} like object of numeric. +Converted to \link[=Series_class]{Series} by \code{\link[=as_polars_series]{as_polars_series()}} in this method.} } \value{ -Series +\link[=Series_class]{Series} } \description{ -Series arithmetics +Method equivalent of subtraction operator \code{series - other}. } \examples{ pl$Series(1:3)$sub(11:13) @@ -30,4 +23,8 @@ pl$Series(1:3)$sub(1L) 1L - pl$Series(1:3) pl$Series(1:3) - 1L } -\keyword{Series} +\seealso{ +\itemize{ +\item \link[=S3_arithmetic]{Arithmetic operators} +} +} diff --git a/man/Series_to_r.Rd b/man/Series_to_r.Rd index 77628ca23..41b472bad 100644 --- a/man/Series_to_r.Rd +++ b/man/Series_to_r.Rd @@ -40,6 +40,50 @@ return R list (implicit as.list) Fun fact: Nested polars Series list must have same inner type, e.g. List(List(Int32)) Thus every leaf(non list type) will be placed on the same depth of the tree, and be the same type. 
} +\section{Conversion to R data types considerations}{ + +When converting Polars objects, such as \link[=DataFrame_class]{DataFrames} +to R objects, for example via the \code{\link[=as.data.frame.RPolarsDataFrame]{as.data.frame()}} generic function, +each type in the Polars object is converted to an R type. +In some cases, an error may occur because the conversion is not appropriate. +In particular, there is a high possibility of an error when converting +a \link[=DataType_Datetime]{Datetime} type without a time zone. +A \link[=DataType_Datetime]{Datetime} type without a time zone in Polars is converted +to the \link{POSIXct} type in R, which takes into account the time zone in which +the R session is running (which can be checked with the \code{\link[=Sys.timezone]{Sys.timezone()}} +function). In this case, if ambiguous times are included, a conversion error +will occur. In such cases, change the session time zone using +\code{\link[base:Sys.setenv]{Sys.setenv(TZ = "UTC")}} and then perform the conversion, or use the +\code{\link[=ExprDT_replace_time_zone]{$dt$replace_time_zone()}} method on the Datetime type column to +explicitly specify the time zone before conversion. + +\if{html}{\out{
}}\preformatted{# Due to daylight savings, clocks were turned forward 1 hour on Sunday, March 8, 2020, 2:00:00 am +# so this particular date-time doesn't exist +non_existent_time = pl$Series("2020-03-08 02:00:00")$str$strptime(pl$Datetime(), "\%F \%T") + +withr::with_envvar( + new = c(TZ = "America/New_York"), + \{ + tryCatch( + # This causes an error due to the time zone (the `TZ` env var is affected). + as.vector(non_existent_time), + error = function(e) e + ) + \} +) +#> + +withr::with_envvar( + new = c(TZ = "America/New_York"), + \{ + # This is safe. + as.vector(non_existent_time$dt$replace_time_zone("UTC")) + \} +) +#> [1] "2020-03-08 02:00:00 UTC" +}\if{html}{\out{
}} +} + \examples{ series_vec = pl$Series(letters[1:3]) diff --git a/man/as_polars_df.Rd b/man/as_polars_df.Rd index b23e72f30..2bea4d325 100644 --- a/man/as_polars_df.Rd +++ b/man/as_polars_df.Rd @@ -12,6 +12,7 @@ \alias{as_polars_df.RPolarsLazyFrame} \alias{as_polars_df.RPolarsLazyGroupBy} \alias{as_polars_df.ArrowTabular} +\alias{as_polars_df.nanoarrow_array} \alias{as_polars_df.nanoarrow_array_stream} \title{To polars DataFrame} \usage{ @@ -59,6 +60,8 @@ as_polars_df(x, ...) \method{as_polars_df}{ArrowTabular}(x, ..., rechunk = TRUE, schema = NULL, schema_overrides = NULL) +\method{as_polars_df}{nanoarrow_array}(x, ...) + \method{as_polars_df}{nanoarrow_array_stream}(x, ...) } \arguments{ @@ -78,8 +81,8 @@ with unique names. If \code{FALSE} and there are duplicated column names, an error is thrown.} \item{schema}{named list of DataTypes, or character vector of column names. -Should be the same length as the number of columns of \code{x}. -If schema names or types do not match \code{x}, the columns will be renamed/recast. +Should match the number of columns in \code{x} and correspond to each column in \code{x} by position. +If a column in \code{x} does not match the name or type at the same position, it will be renamed/recast. If \code{NULL} (default), convert columns as is.} \item{schema_overrides}{named list of DataTypes. Cast some columns to the DataType.} @@ -154,10 +157,10 @@ as_polars_df( # Convert an arrow Table, with renaming and casting all columns as_polars_df( at, - schema = list(a = pl$Int64, b = pl$String) + schema = list(b = pl$Int64, a = pl$String) ) -# Convert an arrow Table, with renaming and casting some columns +# Convert an arrow Table, with casting some columns as_polars_df( at, schema_overrides = list(y = pl$String) # cast some columns diff --git a/man/pl_Series.Rd b/man/pl_Series.Rd index f40dc6dbd..33ad40614 100644 --- a/man/pl_Series.Rd +++ b/man/pl_Series.Rd @@ -5,20 +5,44 @@ \alias{Series} \title{Create new Series} \usage{ -pl_Series(x, name = NULL) +pl_Series(x, name = NULL, dtype = NULL, ..., nan_to_null = FALSE) } \arguments{ \item{x}{any vector} -\item{name}{string} +\item{name}{Name of the Series. If \code{NULL}, an empty string is used.} + +\item{dtype}{One of \link[=pl_dtypes]{polars data type} or \code{NULL}. +If not \code{NULL}, that data type is used to \link[=Expr_cast]{cast} the Series created from the vector +to a specific data type internally.} + +\item{...}{Ignored.} + +\item{nan_to_null}{If \code{TRUE}, \code{NaN} values contained in the Series are replaced to \code{null}. +Using the \code{\link[=Expr_fill_nan]{$fill_nan()}} method internally.} } \value{ -Series +\link[=Series_class]{Series} } \description{ -found in api as pl$Series named Series_constructor internally +This function is a simple way to convert basic types of vectors provided by base R to +\link[=Series_class]{the Series class object}. +For converting more types properly, use the generic function \code{\link[=as_polars_series]{as_polars_series()}}. 
} \examples{ -pl$Series(1:4) +# Constructing a Series by specifying name and values positionally: +s = pl$Series(1:3, "a") +s + +# Notice that the dtype is automatically inferred as a polars Int32: +s$dtype + +# Constructing a Series with a specific dtype: +s2 = pl$Series(1:3, "a", dtype = pl$Float32) +s2 +} +\seealso{ +\itemize{ +\item \code{\link[=as_polars_series]{as_polars_series()}} +} } -\keyword{Series_new} diff --git a/man/pl_date_range.Rd b/man/pl_date_range.Rd index d863538c6..6f46ec030 100644 --- a/man/pl_date_range.Rd +++ b/man/pl_date_range.Rd @@ -8,7 +8,6 @@ pl_date_range( start, end, interval, - eager = FALSE, closed = "both", time_unit = "us", time_zone = NULL, @@ -24,9 +23,6 @@ pl_date_range( \item{interval}{String, a Polars \code{duration} or R \code{\link[=difftime]{difftime()}}. Can be missing if \code{end} is missing also.} -\item{eager}{If \code{FALSE} (default), return an \code{Expr}. Otherwise, returns a -\code{Series}.} - \item{closed}{One of \code{"both"} (default), \code{"left"}, \code{"none"} or \code{"right"}.} \item{time_unit}{String (\code{"ns"}, \code{"us"}, \code{"ms"}) or integer.} @@ -53,6 +49,10 @@ timezone, R and polars. In R/r-polars it is perfectly fine to mix timezones of params \code{time_zone}, \code{start} and \code{end}. + +Compared to the Python implementation, \code{pl$date_range()} doesn't have the +argument \code{eager} and always returns an Expr. Use \verb{$to_series()} to return a +Series. } \examples{ # All in GMT, straight forward, no mental confusion diff --git a/man/pl_dtypes.Rd b/man/pl_dtypes.Rd index 9eff6173c..5a800efca 100644 --- a/man/pl_dtypes.Rd +++ b/man/pl_dtypes.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/datatype.R \name{pl_dtypes} \alias{pl_dtypes} +\alias{RPolarsDataType} \title{DataTypes (RPolarsDataType)} \value{ not applicable diff --git a/man/pl_struct.Rd b/man/pl_struct.Rd index b66f1fc54..6c072dc07 100644 --- a/man/pl_struct.Rd +++ b/man/pl_struct.Rd @@ -2,37 +2,31 @@ % Please edit documentation in R/functions__lazy.R \name{pl_struct} \alias{pl_struct} -\alias{struct} -\title{struct} +\title{Collect columns into a struct column} \usage{ -pl_struct(exprs, eager = FALSE, schema = NULL) +pl_struct(exprs, schema = NULL) } \arguments{ \item{exprs}{Columns/Expressions to collect into a Struct.} -\item{eager}{Evaluate immediately.} - -\item{schema}{Optional schema named list that explicitly defines the struct field dtypes. -Each name must match a column name wrapped in the struct. Can only be used to cast some or all -dtypes, not to change the names. NULL means to include keep columns into the struct by their -current DataType. If a column is not included in the schema it is removed from the final struct.} +\item{schema}{Optional schema named list that explicitly defines the struct +field dtypes. Each name must match a column name wrapped in the struct. Can +only be used to cast some or all dtypes, not to change the names. If \code{NULL} +(default), columns datatype are not modified. Columns that do not exist are +silently ignored and not included in the final struct.} } \value{ -Eager=FALSE: Expr of Series with dtype Struct | Eager=TRUE: Series with dtype Struct +Expr with dtype Struct } \description{ -Collect several columns into a Series of dtype Struct. +Collect columns into a struct column } \details{ -pl$struct creates Expr or Series of DataType Struct() -pl$Struct creates the DataType Struct() -In polars a schema is a named list of DataTypes. #' A schema describes e.g. a DataFrame. 
-More formally schemas consist of Fields. -A Field is an object describing the name and DataType of a column/Series, but same same. -A struct is a DataFrame wrapped into a Series, the DataType is Struct, and each -sub-datatype within are Fields. -In a dynamic language schema and a Struct (the DataType) are quite the same, except -schemas describe DataFrame and Struct's describe some Series. +\code{pl$struct()} creates an Expr of DataType \code{\link[=DataType_Struct]{Struct()}}. + +Compared to the Python implementation, \code{pl$struct()} doesn't have the +argument \code{eager} and always returns an Expr. Use \verb{$to_series()} to return a +Series. } \examples{ # isolated expression to wrap all columns in a struct aliased 'my_struct' diff --git a/src/rust/Cargo.lock b/src/rust/Cargo.lock index a96c879b9..07459948a 100644 --- a/src/rust/Cargo.lock +++ b/src/rust/Cargo.lock @@ -83,9 +83,9 @@ dependencies = [ [[package]] name = "argminmax" -version = "0.6.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "202108b46429b765ef483f8a24d5c46f48c14acfdacc086dd4ab6dddf6bcdbd2" +checksum = "52424b59d69d69d5056d508b260553afd91c57e21849579cd1f50ee8b8b88eaa" dependencies = [ "num-traits", ] @@ -1092,9 +1092,9 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.10" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "log", @@ -1354,8 +1354,8 @@ dependencies = [ [[package]] name = "polars" -version = "0.38.1" -source = "git+https://github.com/pola-rs/polars.git?rev=a199ed57898343dc033e52132e8dfbc6f5ab09e0#a199ed57898343dc033e52132e8dfbc6f5ab09e0" +version = "0.38.2" +source = "git+https://github.com/pola-rs/polars.git?rev=946fad7a7b56a360e7ec04867aa19f212fcdf5d6#946fad7a7b56a360e7ec04867aa19f212fcdf5d6" dependencies = [ "getrandom", "polars-arrow", @@ -1374,8 +1374,8 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.38.1" -source = "git+https://github.com/pola-rs/polars.git?rev=a199ed57898343dc033e52132e8dfbc6f5ab09e0#a199ed57898343dc033e52132e8dfbc6f5ab09e0" +version = "0.38.2" +source = "git+https://github.com/pola-rs/polars.git?rev=946fad7a7b56a360e7ec04867aa19f212fcdf5d6#946fad7a7b56a360e7ec04867aa19f212fcdf5d6" dependencies = [ "ahash", "atoi", @@ -1421,8 +1421,8 @@ dependencies = [ [[package]] name = "polars-compute" -version = "0.38.1" -source = "git+https://github.com/pola-rs/polars.git?rev=a199ed57898343dc033e52132e8dfbc6f5ab09e0#a199ed57898343dc033e52132e8dfbc6f5ab09e0" +version = "0.38.2" +source = "git+https://github.com/pola-rs/polars.git?rev=946fad7a7b56a360e7ec04867aa19f212fcdf5d6#946fad7a7b56a360e7ec04867aa19f212fcdf5d6" dependencies = [ "bytemuck", "either", @@ -1436,8 +1436,8 @@ dependencies = [ [[package]] name = "polars-core" -version = "0.38.1" -source = "git+https://github.com/pola-rs/polars.git?rev=a199ed57898343dc033e52132e8dfbc6f5ab09e0#a199ed57898343dc033e52132e8dfbc6f5ab09e0" +version = "0.38.2" +source = "git+https://github.com/pola-rs/polars.git?rev=946fad7a7b56a360e7ec04867aa19f212fcdf5d6#946fad7a7b56a360e7ec04867aa19f212fcdf5d6" dependencies = [ "ahash", "bitflags 2.4.2", @@ -1470,8 +1470,8 @@ dependencies = [ [[package]] name = "polars-error" -version = "0.38.1" -source = 
"git+https://github.com/pola-rs/polars.git?rev=a199ed57898343dc033e52132e8dfbc6f5ab09e0#a199ed57898343dc033e52132e8dfbc6f5ab09e0" +version = "0.38.2" +source = "git+https://github.com/pola-rs/polars.git?rev=946fad7a7b56a360e7ec04867aa19f212fcdf5d6#946fad7a7b56a360e7ec04867aa19f212fcdf5d6" dependencies = [ "avro-schema", "polars-arrow-format", @@ -1482,8 +1482,8 @@ dependencies = [ [[package]] name = "polars-io" -version = "0.38.1" -source = "git+https://github.com/pola-rs/polars.git?rev=a199ed57898343dc033e52132e8dfbc6f5ab09e0#a199ed57898343dc033e52132e8dfbc6f5ab09e0" +version = "0.38.2" +source = "git+https://github.com/pola-rs/polars.git?rev=946fad7a7b56a360e7ec04867aa19f212fcdf5d6#946fad7a7b56a360e7ec04867aa19f212fcdf5d6" dependencies = [ "ahash", "async-trait", @@ -1523,8 +1523,8 @@ dependencies = [ [[package]] name = "polars-json" -version = "0.38.1" -source = "git+https://github.com/pola-rs/polars.git?rev=a199ed57898343dc033e52132e8dfbc6f5ab09e0#a199ed57898343dc033e52132e8dfbc6f5ab09e0" +version = "0.38.2" +source = "git+https://github.com/pola-rs/polars.git?rev=946fad7a7b56a360e7ec04867aa19f212fcdf5d6#946fad7a7b56a360e7ec04867aa19f212fcdf5d6" dependencies = [ "ahash", "chrono", @@ -1543,8 +1543,8 @@ dependencies = [ [[package]] name = "polars-lazy" -version = "0.38.1" -source = "git+https://github.com/pola-rs/polars.git?rev=a199ed57898343dc033e52132e8dfbc6f5ab09e0#a199ed57898343dc033e52132e8dfbc6f5ab09e0" +version = "0.38.2" +source = "git+https://github.com/pola-rs/polars.git?rev=946fad7a7b56a360e7ec04867aa19f212fcdf5d6#946fad7a7b56a360e7ec04867aa19f212fcdf5d6" dependencies = [ "ahash", "bitflags 2.4.2", @@ -1566,8 +1566,8 @@ dependencies = [ [[package]] name = "polars-ops" -version = "0.38.1" -source = "git+https://github.com/pola-rs/polars.git?rev=a199ed57898343dc033e52132e8dfbc6f5ab09e0#a199ed57898343dc033e52132e8dfbc6f5ab09e0" +version = "0.38.2" +source = "git+https://github.com/pola-rs/polars.git?rev=946fad7a7b56a360e7ec04867aa19f212fcdf5d6#946fad7a7b56a360e7ec04867aa19f212fcdf5d6" dependencies = [ "ahash", "aho-corasick", @@ -1602,8 +1602,8 @@ dependencies = [ [[package]] name = "polars-parquet" -version = "0.38.1" -source = "git+https://github.com/pola-rs/polars.git?rev=a199ed57898343dc033e52132e8dfbc6f5ab09e0#a199ed57898343dc033e52132e8dfbc6f5ab09e0" +version = "0.38.2" +source = "git+https://github.com/pola-rs/polars.git?rev=946fad7a7b56a360e7ec04867aa19f212fcdf5d6#946fad7a7b56a360e7ec04867aa19f212fcdf5d6" dependencies = [ "ahash", "async-stream", @@ -1627,8 +1627,8 @@ dependencies = [ [[package]] name = "polars-pipe" -version = "0.38.1" -source = "git+https://github.com/pola-rs/polars.git?rev=a199ed57898343dc033e52132e8dfbc6f5ab09e0#a199ed57898343dc033e52132e8dfbc6f5ab09e0" +version = "0.38.2" +source = "git+https://github.com/pola-rs/polars.git?rev=946fad7a7b56a360e7ec04867aa19f212fcdf5d6#946fad7a7b56a360e7ec04867aa19f212fcdf5d6" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -1651,8 +1651,8 @@ dependencies = [ [[package]] name = "polars-plan" -version = "0.38.1" -source = "git+https://github.com/pola-rs/polars.git?rev=a199ed57898343dc033e52132e8dfbc6f5ab09e0#a199ed57898343dc033e52132e8dfbc6f5ab09e0" +version = "0.38.2" +source = "git+https://github.com/pola-rs/polars.git?rev=946fad7a7b56a360e7ec04867aa19f212fcdf5d6#946fad7a7b56a360e7ec04867aa19f212fcdf5d6" dependencies = [ "ahash", "bytemuck", @@ -1678,8 +1678,8 @@ dependencies = [ [[package]] name = "polars-row" -version = "0.38.1" -source = 
"git+https://github.com/pola-rs/polars.git?rev=a199ed57898343dc033e52132e8dfbc6f5ab09e0#a199ed57898343dc033e52132e8dfbc6f5ab09e0" +version = "0.38.2" +source = "git+https://github.com/pola-rs/polars.git?rev=946fad7a7b56a360e7ec04867aa19f212fcdf5d6#946fad7a7b56a360e7ec04867aa19f212fcdf5d6" dependencies = [ "bytemuck", "polars-arrow", @@ -1689,8 +1689,8 @@ dependencies = [ [[package]] name = "polars-sql" -version = "0.38.1" -source = "git+https://github.com/pola-rs/polars.git?rev=a199ed57898343dc033e52132e8dfbc6f5ab09e0#a199ed57898343dc033e52132e8dfbc6f5ab09e0" +version = "0.38.2" +source = "git+https://github.com/pola-rs/polars.git?rev=946fad7a7b56a360e7ec04867aa19f212fcdf5d6#946fad7a7b56a360e7ec04867aa19f212fcdf5d6" dependencies = [ "hex", "polars-arrow", @@ -1706,8 +1706,8 @@ dependencies = [ [[package]] name = "polars-time" -version = "0.38.1" -source = "git+https://github.com/pola-rs/polars.git?rev=a199ed57898343dc033e52132e8dfbc6f5ab09e0#a199ed57898343dc033e52132e8dfbc6f5ab09e0" +version = "0.38.2" +source = "git+https://github.com/pola-rs/polars.git?rev=946fad7a7b56a360e7ec04867aa19f212fcdf5d6#946fad7a7b56a360e7ec04867aa19f212fcdf5d6" dependencies = [ "atoi", "chrono", @@ -1726,8 +1726,8 @@ dependencies = [ [[package]] name = "polars-utils" -version = "0.38.1" -source = "git+https://github.com/pola-rs/polars.git?rev=a199ed57898343dc033e52132e8dfbc6f5ab09e0#a199ed57898343dc033e52132e8dfbc6f5ab09e0" +version = "0.38.2" +source = "git+https://github.com/pola-rs/polars.git?rev=946fad7a7b56a360e7ec04867aa19f212fcdf5d6#946fad7a7b56a360e7ec04867aa19f212fcdf5d6" dependencies = [ "ahash", "bytemuck", @@ -1769,7 +1769,7 @@ dependencies = [ [[package]] name = "r-polars" -version = "0.38.0" +version = "0.38.1" dependencies = [ "either", "extendr-api", diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml index a78c7a8f5..e2db246a1 100644 --- a/src/rust/Cargo.toml +++ b/src/rust/Cargo.toml @@ -1,8 +1,8 @@ [package] name = "r-polars" -version = "0.38.0" +version = "0.38.1" edition = "2021" -rust-version = "1.74.1" +rust-version = "1.76.0" publish = false [lib] @@ -52,8 +52,8 @@ serde_json = "*" smartstring = "1.0.1" state = "0.6.0" thiserror = "1.0.57" -polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "a199ed57898343dc033e52132e8dfbc6f5ab09e0", default-features = false } -polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "a199ed57898343dc033e52132e8dfbc6f5ab09e0", default-features = false } +polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "946fad7a7b56a360e7ec04867aa19f212fcdf5d6", default-features = false } +polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "946fad7a7b56a360e7ec04867aa19f212fcdf5d6", default-features = false } either = "1" #features copied from node-polars @@ -153,4 +153,4 @@ features = [ "zip_with", ] git = "https://github.com/pola-rs/polars.git" -rev = "a199ed57898343dc033e52132e8dfbc6f5ab09e0" +rev = "946fad7a7b56a360e7ec04867aa19f212fcdf5d6" diff --git a/src/rust/src/construct_expr.rs b/src/rust/src/construct_expr.rs deleted file mode 100644 index 6460d3efa..000000000 --- a/src/rust/src/construct_expr.rs +++ /dev/null @@ -1,25 +0,0 @@ -use crate::lazy::dsl::Expr; -use crate::robj_to; -use crate::rpolarserr::*; -use extendr_api::prelude::*; - -#[extendr] -pub fn internal_wrap_e(robj: Robj, str_to_lit: Robj) -> RResult { - if robj_to!(bool, str_to_lit)? 
{ - robj_to!(Expr, robj) - } else { - robj_to!(ExprCol, robj) - } -} - -#[extendr] -pub fn robj_to_col(name: Robj) -> RResult { - let vs: Vec = robj_to!(Vec, String, name)?; - Ok(Expr::cols(vs)) -} - -extendr_module! { - mod construct_expr; - fn internal_wrap_e; - fn robj_to_col; -} diff --git a/src/rust/src/conversion_r_to_s.rs b/src/rust/src/conversion_r_to_s.rs index d5aeb31fe..39504308b 100644 --- a/src/rust/src/conversion_r_to_s.rs +++ b/src/rust/src/conversion_r_to_s.rs @@ -1,3 +1,4 @@ +use crate::robj_to; use crate::series::RPolarsSeries; use crate::utils::collect_hinted_result; use extendr_api::prelude::*; @@ -6,6 +7,8 @@ use extendr_api::prelude::*; use polars::prelude as pl; use polars::prelude::IntoSeries; use polars::prelude::NamedFrom; +use polars_lazy::dsl::col; +use polars_lazy::frame::IntoLazy; // Internal tree structure to contain Series of fully parsed nested Robject. // It is easier to resolve concatenated datatype after all elements have been parsed // because empty lists have no type in R, but the corrosponding polars type must be known before @@ -210,10 +213,42 @@ fn recursive_robjname2series_tree(x: &Robj, name: &str) -> pl::PolarsResult { + Ok(SeriesTree::Series( + (s * 1_000f64).cast(&pl::DataType::Int64)?.cast( + &pl::DataType::Datetime(pl::TimeUnit::Milliseconds, Some(tz)), + )?, + )) + } + // sys time + None => { + let sys_tz_robj = R!("Sys.timezone()") + .map_err(|err| pl::PolarsError::ComputeError(err.to_string().into()))?; + let sys_tz = robj_to!(String, sys_tz_robj) + .map_err(|err| pl::PolarsError::ComputeError(err.to_string().into()))?; + let s_name = s.name(); + let utc_s = (s.clone() * 1_000f64).cast(&pl::DataType::Int64)?.cast( + &pl::DataType::Datetime( + pl::TimeUnit::Milliseconds, + Some("UTC".to_string()), + ), + )?; + Ok(SeriesTree::Series( + pl::DataFrame::new(vec![utc_s.clone()])? + .lazy() + .select([col(s_name) + .dt() + .convert_time_zone(sys_tz) + .dt() + .replace_time_zone(None, pl::lit("raise"))]) + .collect()? + .column(s_name)? + .clone(), + )) + } + } } Ok(SeriesTree::Series(s)) if x.inherits("Date") => { Ok(SeriesTree::Series(s.cast(&pl::DataType::Date)?)) diff --git a/src/rust/src/conversion_s_to_r.rs b/src/rust/src/conversion_s_to_r.rs index d99e482b9..848ef2264 100644 --- a/src/rust/src/conversion_s_to_r.rs +++ b/src/rust/src/conversion_s_to_r.rs @@ -1,8 +1,9 @@ -use crate::rdataframe::RPolarsDataFrame; +use crate::{rdataframe::RPolarsDataFrame, robj_to}; use extendr_api::prelude::*; use pl::PolarsError as pl_error; use polars::prelude::{self as pl}; use polars_core::datatypes::DataType; +use polars_lazy::{dsl::col, frame::IntoLazy}; //TODO throw a warning if i32 contains a lowerbound value which is the NA in R. pub fn pl_series_to_list( @@ -212,9 +213,31 @@ pub fn pl_series_to_list( pl::TimeUnit::Milliseconds => 1_000.0, }; - //resolve timezone - let tz = opt_tz.as_ref().map(|s| s.as_str()).unwrap_or(""); - s.cast(&Float64)? + let zoned_s: pl::Series = match opt_tz { + Some(_tz) => { + // zoned time + s.clone() + } + None => { + // naive time + let sys_tz_robj = R!("Sys.timezone()") + .map_err(|err| pl::PolarsError::ComputeError(err.to_string().into()))?; + let sys_tz = robj_to!(String, sys_tz_robj) + .map_err(|err| pl::PolarsError::ComputeError(err.to_string().into()))?; + let s_name = s.name(); + pl::DataFrame::new(vec![s.clone()])? + .lazy() + .select([col(s_name) + .dt() + .replace_time_zone(Some(sys_tz), pl::lit("raise"))]) + .collect()? + .column(s_name)? + .clone() + } + }; + + zoned_s + .cast(&Float64)? 
.f64() .map(|ca| { ca.into_iter() @@ -226,7 +249,9 @@ pub fn pl_series_to_list( robj.set_class(&["POSIXct", "POSIXt"]) .expect("internal error: class POSIXct label failed") }) - .map(|mut robj| robj.set_attrib("tzone", tz)) + .map(|mut robj| { + robj.set_attrib("tzone", opt_tz.as_ref().map(|s| s.as_str()).unwrap_or("")) + }) .expect("internal error: attr tzone failed") .map_err(|err| { pl_error::ComputeError( diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs index 7cd3d1ec8..dc668c993 100644 --- a/src/rust/src/lazy/dataframe.rs +++ b/src/rust/src/lazy/dataframe.rs @@ -564,6 +564,7 @@ impl RPolarsLazyFrame { comm_subexpr_elim, streaming, fast_projection: _, + row_estimate: _, eager, } = self.0.get_current_optimizations(); list!( diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index 79e8e0d45..003113520 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -2698,84 +2698,6 @@ pub fn make_rolling_options( }) } -// #[derive(Clone, Debug)] -// pub struct When { -// predicate: Expr, -// } - -// #[derive(Clone, Debug)] -// pub struct Then { -// predicate: Expr, -// then: Expr, -// } - -// #[derive(Clone)] -// pub struct ChainWhen(dsl::ChainWhen); - -// #[extendr] -// impl WhenThenThen { -// pub fn when(&self, predicate: &RPolarsExpr) -> WhenThenThen { -// Self(self.0.clone().when(predicate.0.clone())) -// } -// pub fn then(&self, expr: &RPolarsExpr) -> WhenThenThen { -// Self(self.0.clone().then(expr.0.clone())) -// } -// pub fn otherwise(&self, expr: &RPolarsExpr) -> RPolarsExpr { -// self.0.clone().otherwise(expr.0.clone()).into() -// } - -// pub fn print(&self) { -// rprintln!("Polars WhenThenThen"); -// } -// } - -// #[derive(Clone)] -// pub struct ChainThen(dsl::ChainThen); - -// #[extendr] -// impl Then { -// pub fn when(&self, predicate: &RPolarsExpr) -> WhenThenThen { -// let e = dsl::when(self.predicate.0.clone()) -// .then(self.then.0.clone()) -// .when(predicate.0.clone()); -// WhenThenThen(e) -// } - -// pub fn otherwise(&self, expr: &RPolarsExpr) -> RPolarsExpr { -// dsl::ternary_expr( -// self.predicate.0.clone(), -// self.then.0.clone(), -// expr.0.clone(), -// ) -// .into() -// } - -// pub fn print(&self) { -// rprintln!("{:?}", self); -// } -// } - -// #[extendr] -// impl When { -// #[allow(clippy::self_named_constructors)] -// pub fn when(predicate: &RPolarsExpr) -> When { -// When { -// predicate: predicate.clone(), -// } -// } - -// pub fn then(&self, expr: &RPolarsExpr) -> WhenThen { -// WhenThen { -// predicate: self.predicate.clone(), -// then: expr.clone(), -// } -// } - -// pub fn print(&self) { -// rprintln!("{:?}", self); -// } -// } - #[extendr] pub fn internal_wrap_e(robj: Robj, str_to_lit: Robj) -> RResult { if robj_to!(bool, str_to_lit)? 
{ diff --git a/src/rust/src/lazy/meta.rs b/src/rust/src/lazy/meta.rs deleted file mode 100644 index 581e77d64..000000000 --- a/src/rust/src/lazy/meta.rs +++ /dev/null @@ -1,2 +0,0 @@ -// extendr does not support muli-block impl -// These methods have been moved to extendr-impl in dsl.rs \ No newline at end of file diff --git a/src/rust/src/lazy/mod.rs b/src/rust/src/lazy/mod.rs index 7396877d8..2f114a819 100644 --- a/src/rust/src/lazy/mod.rs +++ b/src/rust/src/lazy/mod.rs @@ -1,42 +1,7 @@ -//mod apply; -//pub mod dataframe; use extendr_api::*; pub mod dataframe; pub mod dsl; pub mod whenthen; -//#[cfg(feature = "meta")] -//mod meta; -//pub mod utils; - -//pub use apply::*; -//use dsl::*; -//use polars_lazy::prelude::*; - -// pub(crate) trait ToExprs { -// fn to_exprs(self) -> Vec; -// } - -// impl ToExprs for Vec { -// fn to_exprs(self) -> Vec { -// // Safety -// // repr is transparent -// // and has only got one inner field` -// unsafe { std::mem::transmute(self) } -// } -// } - -// pub(crate) trait ToPyExprs { -// fn to_pyexprs(self) -> Vec; -// } - -// impl ToPyExprs for Vec { -// fn to_pyexprs(self) -> Vec { -// // Safety -// // repr is transparent -// // and has only got one inner field` -// unsafe { std::mem::transmute(self) } -// } -// } extendr_module! { mod lazy; diff --git a/src/rust/src/rdataframe/mod.rs b/src/rust/src/rdataframe/mod.rs index 501626fa6..7ac01120b 100644 --- a/src/rust/src/rdataframe/mod.rs +++ b/src/rust/src/rdataframe/mod.rs @@ -328,6 +328,21 @@ impl RPolarsDataFrame { self.lazy().unnest(names)?.collect() } + pub fn partition_by(&self, by: Robj, maintain_order: Robj, include_key: Robj) -> RResult { + let by = robj_to!(Vec, String, by)?; + let maintain_order = robj_to!(bool, maintain_order)?; + let include_key = robj_to!(bool, include_key)?; + let out = if maintain_order { + self.0.clone().partition_by_stable(by, include_key) + } else { + self.0.partition_by(by, include_key) + } + .map_err(polars_to_rpolars_err)?; + + let vec = unsafe { std::mem::transmute::, Vec>(out) }; + Ok(List::from_values(vec)) + } + pub fn export_stream(&self, stream_ptr: &str) { let schema = self.0.schema().to_arrow(false); let data_type = ArrowDataType::Struct(schema.fields); diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md index 16ce05341..9ac105d87 100644 --- a/tests/testthat/_snaps/after-wrappers.md +++ b/tests/testthat/_snaps/after-wrappers.md @@ -79,16 +79,16 @@ [21] "group_by_dynamic" "head" "height" "join" [25] "join_asof" "last" "lazy" "limit" [29] "max" "mean" "median" "melt" - [33] "min" "n_chunks" "null_count" "pivot" - [37] "print" "quantile" "rechunk" "rename" - [41] "reverse" "rolling" "sample" "schema" - [45] "select" "shape" "shift" "shift_and_fill" - [49] "slice" "sort" "std" "sum" - [53] "tail" "to_data_frame" "to_list" "to_series" - [57] "to_struct" "transpose" "unique" "unnest" - [61] "var" "width" "with_columns" "with_row_count" - [65] "with_row_index" "write_csv" "write_json" "write_ndjson" - [69] "write_parquet" + [33] "min" "n_chunks" "null_count" "partition_by" + [37] "pivot" "print" "quantile" "rechunk" + [41] "rename" "reverse" "rolling" "sample" + [45] "schema" "select" "shape" "shift" + [49] "shift_and_fill" "slice" "sort" "std" + [53] "sum" "tail" "to_data_frame" "to_list" + [57] "to_series" "to_struct" "transpose" "unique" + [61] "unnest" "var" "width" "with_columns" + [65] "with_row_count" "with_row_index" "write_csv" "write_json" + [69] "write_ndjson" "write_parquet" --- @@ -104,18 +104,19 @@ [13] 
"get_columns" "lazy" [15] "melt" "n_chunks" [17] "new_with_capacity" "null_count" - [19] "pivot_expr" "print" - [21] "rechunk" "sample_frac" - [23] "sample_n" "schema" - [25] "select" "select_at_idx" - [27] "set_column_from_robj" "set_column_from_series" - [29] "set_column_names_mut" "shape" - [31] "to_list" "to_list_tag_structs" - [33] "to_list_unwind" "to_struct" - [35] "transpose" "unnest" - [37] "with_columns" "with_row_index" - [39] "write_csv" "write_json" - [41] "write_ndjson" "write_parquet" + [19] "partition_by" "pivot_expr" + [21] "print" "rechunk" + [23] "sample_frac" "sample_n" + [25] "schema" "select" + [27] "select_at_idx" "set_column_from_robj" + [29] "set_column_from_series" "set_column_names_mut" + [31] "shape" "to_list" + [33] "to_list_tag_structs" "to_list_unwind" + [35] "to_struct" "transpose" + [37] "unnest" "with_columns" + [39] "with_row_index" "write_csv" + [41] "write_json" "write_ndjson" + [43] "write_parquet" # public and private methods of each class GroupBy @@ -660,27 +661,26 @@ [121] "pct_change" "peak_max" "peak_min" [124] "pow" "print" "product" [127] "quantile" "rank" "rechunk" - [130] "reinterpret" "rem" "rename" - [133] "rep" "rep_extend" "repeat_by" - [136] "replace" "reshape" "reverse" - [139] "rle" "rle_id" "rolling" - [142] "rolling_max" "rolling_mean" "rolling_median" - [145] "rolling_min" "rolling_quantile" "rolling_skew" - [148] "rolling_std" "rolling_sum" "rolling_var" - [151] "round" "sample" "search_sorted" - [154] "set_sorted" "shape" "shift" - [157] "shift_and_fill" "shrink_dtype" "shuffle" - [160] "sign" "sin" "sinh" - [163] "skew" "slice" "sort" - [166] "sort_by" "sqrt" "std" - [169] "str" "struct" "sub" - [172] "sum" "tail" "tan" - [175] "tanh" "to_frame" "to_lit" - [178] "to_physical" "to_r" "to_r_list" - [181] "to_r_vector" "to_struct" "to_vector" - [184] "top_k" "unique" "unique_counts" - [187] "upper_bound" "value_counts" "var" - [190] "xor" + [130] "reinterpret" "rename" "rep" + [133] "rep_extend" "repeat_by" "replace" + [136] "reshape" "reverse" "rle" + [139] "rle_id" "rolling" "rolling_max" + [142] "rolling_mean" "rolling_median" "rolling_min" + [145] "rolling_quantile" "rolling_skew" "rolling_std" + [148] "rolling_sum" "rolling_var" "round" + [151] "sample" "search_sorted" "set_sorted" + [154] "shape" "shift" "shift_and_fill" + [157] "shrink_dtype" "shuffle" "sign" + [160] "sin" "sinh" "skew" + [163] "slice" "sort" "sort_by" + [166] "sqrt" "std" "str" + [169] "struct" "sub" "sum" + [172] "tail" "tan" "tanh" + [175] "to_frame" "to_lit" "to_physical" + [178] "to_r" "to_r_list" "to_r_vector" + [181] "to_struct" "to_vector" "top_k" + [184] "unique" "unique_counts" "upper_bound" + [187] "value_counts" "var" "xor" --- diff --git a/tests/testthat/_snaps/s3_methods.md b/tests/testthat/_snaps/s3-methods.md similarity index 100% rename from tests/testthat/_snaps/s3_methods.md rename to tests/testthat/_snaps/s3-methods.md diff --git a/tests/testthat/test-as_polars.R b/tests/testthat/test-as_polars.R index 31288484c..b5dedf11c 100644 --- a/tests/testthat/test-as_polars.R +++ b/tests/testthat/test-as_polars.R @@ -19,6 +19,7 @@ if (requireNamespace("arrow", quietly = TRUE) && requireNamespace("nanoarrow", q "polars_lazy_group_by_dynamic", pl$LazyFrame(test_df)$group_by_dynamic("col_int", every = "1i"), "arrow Table", arrow::as_arrow_table(test_df), "arrow RecordBatch", arrow::as_record_batch(test_df), + "nanoarrow_array", nanoarrow::as_nanoarrow_array(test_df), "nanoarrow_array_stream", nanoarrow::as_nanoarrow_array_stream(test_df), ) } @@ 
-101,13 +102,13 @@ test_that("as_polars_df throws error when make_names_unique = FALSE and there ar test_that("schema option and schema_overrides for as_polars_df.data.frame", { df = data.frame(a = 1:3, b = 4:6) - pl_df_1 = as_polars_df(df, schema = list(a = pl$String, b = pl$Int32)) + pl_df_1 = as_polars_df(df, schema = list(b = pl$String, y = pl$Int32)) pl_df_2 = as_polars_df(df, schema = c("x", "y")) pl_df_3 = as_polars_df(df, schema_overrides = list(a = pl$String)) expect_equal( pl_df_1$to_data_frame(), - data.frame(a = as.character(1:3), b = 4L:6L) + data.frame(b = as.character(1:3), y = 4L:6L) ) expect_equal( pl_df_2$to_data_frame(), @@ -401,6 +402,20 @@ patrick::with_parameters_test_that("clock package class support", as.POSIXct(as.vector(pl_sys_time), tz = "Asia/Kolkata"), as.POSIXct(clock_sys_time, tz = "Asia/Kolkata") ) + + # Test on other time zone + withr::with_envvar( + new = c(TZ = "Europe/Paris"), + { + expect_equal(as.POSIXct(as.vector(pl_naive_time)), as.POSIXct(clock_naive_time)) + expect_equal(as.POSIXct(as.vector(pl_zoned_time_1)), as.POSIXct(clock_zoned_time_1)) + expect_equal(as.POSIXct(as.vector(pl_sys_time)), as.POSIXct(clock_sys_time, tz = "UTC")) + expect_equal( + as.POSIXct(as.vector(pl_sys_time), tz = "Asia/Kolkata"), + as.POSIXct(clock_sys_time, tz = "Asia/Kolkata") + ) + } + ) }, precision = c("nanosecond", "microsecond", "millisecond", "second", "minute", "hour", "day"), .test_name = precision diff --git a/tests/testthat/test-csv-write.R b/tests/testthat/test-csv-write.R index 7ebfa3919..41e8abb81 100644 --- a/tests/testthat/test-csv-write.R +++ b/tests/testthat/test-csv-write.R @@ -70,8 +70,7 @@ test_that("write_csv: date_format works", { date = pl$date_range( as.Date("2020-01-01"), as.Date("2023-01-02"), - interval = "1y", - eager = TRUE + interval = "1y" ) ) dat$write_csv(temp_out, date_format = "%Y") @@ -85,8 +84,7 @@ test_that("write_csv: datetime_format works", { date = pl$date_range( as.Date("2020-01-01"), as.Date("2020-01-02"), - interval = "6h", - eager = TRUE + interval = "6h" ) ) dat$write_csv(temp_out, datetime_format = "%Hh%Mm - %d/%m/%Y") @@ -98,8 +96,7 @@ test_that("write_csv: time_format works", { date = pl$date_range( as.Date("2020-10-17"), as.Date("2020-10-18"), - "8h", - eager = TRUE + "8h" ) )$with_columns(pl$col("date")$dt$time()) dat$write_csv(temp_out, time_format = "%Hh%Mm%Ss") diff --git a/tests/testthat/test-dataframe.R b/tests/testthat/test-dataframe.R index d66f24a77..f15c7f3b2 100644 --- a/tests/testthat/test-dataframe.R +++ b/tests/testthat/test-dataframe.R @@ -172,7 +172,23 @@ test_that("DataFrame, custom schema", { ) }) +test_that("construct an empty DataFrame with schema only", { + df = pl$select( + int = 1L, + string = pl$lit("a"), + list = list(1), + struct = data.frame(a = 1L, b = "a"), + datetime = as.POSIXct("2021-01-01 00:00:00", tz = "UTC") + ) + expected_dtypes = df$dtypes + df_out = pl$DataFrame( + schema = df$schema + ) + + expect_identical(df_out$shape, c(0, 5)) + expect_true(mapply(`==`, df_out$dtypes, expected_dtypes) |> all()) +}) test_that("DataFrame, select sum over", { df = pl$DataFrame(iris)$select( @@ -1308,3 +1324,84 @@ test_that("flags work", { ) ) }) + +test_that("partition_by", { + df = pl$DataFrame( + col1 = 1:5, + col2 = c("a", "a", "b", "b", "b"), + col3 = c(rep_len("c", 3), rep_len("d", 2)) + ) + + # Test `maintain_order = TRUE` + expect_true( + df$equals(pl$concat(df$partition_by("col2"))) + ) + expect_true( + df$equals(pl$concat(df$partition_by("col2", "col3"))) + ) + expect_true( + 
df$drop("col3")$equals(pl$concat(df$partition_by("col3", include_key = FALSE))) + ) + + # Test `maintain_order = FALSE` + df_sorted = df$sort(pl$all()) + expect_true( + df_sorted$equals(pl$concat(df$partition_by("col2", maintain_order = FALSE))$sort(pl$all())) + ) + expect_true( + df_sorted$equals(pl$concat(df$partition_by("col2", "col3", maintain_order = FALSE))$sort(pl$all())) + ) + expect_true( + df$drop("col3")$sort(pl$all())$equals( + pl$concat(df$partition_by("col3", include_key = FALSE, maintain_order = FALSE))$sort(pl$all()) + ) + ) + + # Test selecting columns by data type + expect_true( + mapply( + df$partition_by("col2", "col3"), + df$partition_by(pl$String), + FUN = \(x, y) x$equals(y) + ) |> + all() + ) + + # Test errors + expect_error(df$partition_by("foo"), "not found: foo") + expect_error(df$partition_by(pl$Int8), "There is no column to partition by") + + # Test `as_nested_list = TRUE` + expect_true( + mapply( + df$partition_by("col2", "col3"), + df$partition_by("col2", "col3", as_nested_list = TRUE), + FUN = \(x, y) x$equals(y$data) + ) |> + all() + ) + expect_true( + mapply( + df$partition_by("col2", "col3", include_key = FALSE), + df$partition_by("col2", "col3", as_nested_list = TRUE, include_key = FALSE), + FUN = \(x, y) x$equals(y$data) + ) |> + all() + ) + expect_true( + df$partition_by("col2", "col3", as_nested_list = TRUE, include_key = FALSE) |> + lapply(\(x) x$data$with_columns(col2 = pl$lit(x$key$col2), col3 = pl$lit(x$key$col3))) |> + pl$concat() |> + df$equals() + ) + expect_true( + df$partition_by("col2", "col3", as_nested_list = TRUE, maintain_order = FALSE) |> + lapply(\(x) x$data) |> + pl$concat() |> + (\(x) df$equals(x$sort(pl$all())))() + ) + + expect_warning( + df$partition_by("col2", maintain_order = FALSE, include_key = FALSE, as_nested_list = TRUE) + ) +}) diff --git a/tests/testthat/test-datatype.R b/tests/testthat/test-datatype.R index 44664cd92..27d6d810e 100644 --- a/tests/testthat/test-datatype.R +++ b/tests/testthat/test-datatype.R @@ -39,13 +39,45 @@ test_that("POSIXct data conversion", { ) expect_identical( - pl$lit(as.POSIXct("2022-01-01", tz = "GMT"))$to_r(), - as.POSIXct("2022-01-01", tz = "GMT") + pl$lit("2022-01-01")$str$strptime(pl$Datetime(), "%F")$to_r(), + as.POSIXct("2022-01-01") + ) + # TODO: infer timezone from string, change the arugment name from `tz` + expect_true( + pl$Series("2022-01-01 UTC")$str$strptime(pl$Datetime(time_zone = "UTC"), "%F %Z")$eq( + pl$Series(as.POSIXct("2022-01-01", tz = "UTC")) + )$to_r() ) - expect_identical( - pl$lit(as.POSIXct("2022-01-01", tz = "HST"))$to_r(), - as.POSIXct("2022-01-01", tz = "HST") + withr::with_envvar( + new = c(TZ = "America/New_York"), + { + expect_identical( + pl$lit("2022-01-01")$str$strptime(pl$Datetime(), "%F")$to_r(), + as.POSIXct("2022-01-01") + ) + # TODO: infer timezone from string, change the arugment name from `tz` + expect_true( + pl$Series("2022-01-01 UTC")$str$strptime(pl$Datetime(time_zone = "UTC"), "%F %Z")$eq( + pl$Series(as.POSIXct("2022-01-01", tz = "UTC")) + )$to_r() + ) + + non_exsitent_time_chr = "2020-03-08 02:00:00" + ambiguous_time_chr = "2020-11-01 01:00:00" + expect_identical( + pl$lit(as.POSIXct(non_exsitent_time_chr))$to_r(), + as.POSIXct(non_exsitent_time_chr) + ) + expect_error( + pl$lit(non_exsitent_time_chr)$str$strptime(pl$Datetime(), "%F %T")$to_r(), + "non-existent" + ) + expect_error( + pl$lit(ambiguous_time_chr)$str$strptime(pl$Datetime(), "%F %T")$to_r(), + "ambiguous" + ) + } ) expect_identical( @@ -53,16 +85,22 @@ test_that("POSIXct data 
conversion", { as.POSIXct("2022-01-01", tz = "GMT") ) - - x = as.POSIXct( - c( - "2020-01-01 13:45:48.343", - "2020-01-01 13:45:48.343999" - ), - tz = "UTC" + expect_identical( + pl$lit(as.POSIXct("2022-01-01", tz = "HST"))$to_r(), + as.POSIXct("2022-01-01", tz = "HST") ) + # POSIXct is converted to datetime[ms], so sub-ms precision is lost - expect_identical(pl$lit(x)$to_r(), as.POSIXct(c("2020-01-01 13:45:48.343", "2020-01-01 13:45:48.343"), tz = "UTC")) + expect_identical( + pl$lit(as.POSIXct( + c( + "2020-01-01 13:45:48.343", + "2020-01-01 13:45:48.343999" + ), + tz = "UTC" + ))$to_r(), + as.POSIXct(c("2020-01-01 13:45:48.343", "2020-01-01 13:45:48.343"), tz = "UTC") + ) }) test_that("String and Utf8 are identical", { @@ -84,3 +122,13 @@ test_that("Categorical", { pl$Series(c("z", "z", "k", "a"))$cast(pl$Categorical("foobar")) ) }) + + +test_that("allow '*' for time_zone", { + df = pl$DataFrame( + naive_time = as.POSIXct("1900-01-01"), + zoned_time = as.POSIXct("1900-01-01", "UTC") + ) + + expect_identical(df$select(pl$col(pl$Datetime("ms", "*")))$width, 1) +}) diff --git a/tests/testthat/test-expr_datetime.R b/tests/testthat/test-expr_datetime.R index 6652b7a6e..0d687b5f1 100644 --- a/tests/testthat/test-expr_datetime.R +++ b/tests/testthat/test-expr_datetime.R @@ -12,7 +12,11 @@ test_that("pl$date_range", { ) expect_identical( pl$date_range(start = t1, end = t2, interval = "6h", time_zone = "GMT")$to_r(), - seq(t1, t2, by = as.difftime(6, units = "hours")) |> "attr<-"("tzone", "GMT") + seq( + as.POSIXct("2022-01-01", tz = "GMT"), + as.POSIXct("2022-01-02", tz = "GMT"), + by = as.difftime(6, units = "hours") + ) ) expect_identical( pl$date_range(start = t1, end = t2, interval = "3h", time_unit = "ms")$to_r(), @@ -83,7 +87,7 @@ test_that("dt$truncate", { # make a datetime t1 = as.POSIXct("3040-01-01", tz = "GMT") t2 = t1 + as.difftime(25, units = "secs") - s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms", eager = TRUE) + s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms") # use a dt namespace function df = pl$DataFrame(datetime = s)$with_columns( @@ -108,58 +112,36 @@ test_that("dt$truncate", { }) -test_that("pl$date_range lazy ", { +test_that("pl$date_range", { t1 = ISOdate(2022, 1, 1, 0) t2 = ISOdate(2022, 1, 2, 0) - expect_identical( - pl$date_range(start = t1, end = t2, interval = "6h", time_zone = "GMT")$to_r(), - pl$date_range(start = t1, end = t2, interval = "6h", time_zone = "GMT", eager = FALSE)$to_r() - ) - - # check variations of lazy input gives same result df = pl$DataFrame( t1 = t1, t2 = t2 )$select( pl$date_range("t1", "t2", "6h")$alias("s1"), - pl$date_range("t1", "t2", "6h", eager = FALSE)$alias("s2"), - pl$date_range(pl$col("t1"), pl$col("t2"), "6h", eager = FALSE)$alias("s3"), - pl$date_range(t1, t2, "6h")$alias("s4") + pl$date_range(pl$col("t1"), pl$col("t2"), "6h")$alias("s2"), + pl$date_range(t1, t2, "6h")$alias("s3") ) l = df$to_list() expect_identical(l$s1, l$s2) expect_identical(l$s1, l$s3) - expect_identical(l$s1, l$s4) }) -test_that("pl$date_range Date lazy/eager", { - r_vers = paste(unlist(R.version[c("major", "minor")]), collapse = ".") - if (r_vers >= "4.3.0") { - d1 = as.Date("2022-01-01") - s_d = pl$Series(d1, name = "Date") - s_dt = pl$Series(as.POSIXct(d1), name = "Date") # since R4.3 this becomes UTC timezone - df = pl$DataFrame(Date = d1)$to_series() - dr_e = pl$date_range(d1, d1 + 1, interval = "6h") - dr_l = pl$date_range(d1, d1 + 1, interval = "6h", eager = FALSE) - expect_identical(as.POSIXct(s_d$to_r()) |> "attr<-"("tzone", 
"UTC"), s_dt$to_r()) - expect_identical(d1, s_d$to_r()) - expect_identical(d1, df$to_r()) - expect_identical(s_dt$to_r(), dr_e$to_r()[1] |> "attr<-"("tzone", "UTC")) - expect_identical(s_dt$to_r(), dr_l$to_r()[1] |> "attr<-"("tzone", "UTC")) - } else { - d1 = as.Date("2022-01-01") - s_d = pl$Series(d1, name = "Date") - s_dt = pl$Series(as.POSIXct(d1), name = "Date") - df = pl$DataFrame(Date = d1)$to_series() - dr_e = pl$date_range(d1, d1 + 1, interval = "6h") - dr_l = pl$date_range(d1, d1 + 1, interval = "6h", eager = FALSE) - expect_identical(as.POSIXct(s_d$to_r()) |> "attr<-"("tzone", ""), s_dt$to_r()) - expect_identical(d1, s_d$to_r()) - expect_identical(d1, df$to_r()) - expect_identical(s_dt$to_r(), dr_e$to_r()[1]) - expect_identical(s_dt$to_r(), dr_l$to_r()[1]) - } +test_that("pl$date_range Date", { + d_chr = "2022-01-01" + d_plus1_chr = "2022-01-02" + d_date = as.Date(d_chr) + s_d = pl$Series(d_date) + s_dt = pl$Series(as.POSIXct(d_chr)) + df = pl$DataFrame(Date = d_date)$to_series() + + dr_e = pl$date_range(d_date, d_date + 1, interval = "6h") + + expect_identical(dr_e$to_r()[1], s_dt$to_r()) + expect_identical(rev(dr_e$to_r())[1], as.POSIXct(d_plus1_chr)) + expect_identical(dr_e$to_series()$len(), 5) }) @@ -167,7 +149,7 @@ test_that("dt$round", { # make a datetime t1 = as.POSIXct("3040-01-01", tz = "GMT") t2 = t1 + as.difftime(24, units = "secs") - s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms", eager = TRUE) + s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms") # use a dt namespace function ## TODO contribute POLARS, offset makes little sense, it should be implemented @@ -208,7 +190,7 @@ test_that("dt$combine", { ( pl$lit(as.Date("2021-01-01")) $dt$combine(pl$PTime("02:34:12")) - $cast(pl$Datetime(tu = "us", tz = "GMT")) + $cast(pl$Datetime("us", "GMT")) $to_r() ), as.POSIXct("2021-01-01 02:34:12", tz = "GMT") @@ -218,7 +200,7 @@ test_that("dt$combine", { ( pl$lit(as.Date("2021-01-01")) $dt$combine(pl$PTime(3600 * 1.5E3, tu = "ms")) - $cast(pl$Datetime(tu = "us", tz = "GMT")) + $cast(pl$Datetime("us", "GMT")) $to_r() ), as.POSIXct("2021-01-01 01:30:00", tz = "GMT") @@ -228,7 +210,7 @@ test_that("dt$combine", { ( pl$lit(as.Date("2021-01-01")) $dt$combine(3600 * 1.5E9, tu = "ns") - $cast(pl$Datetime(tu = "us", tz = "GMT")) + $cast(pl$Datetime("us", "GMT")) $to_r() ), as.POSIXct("2021-01-01 01:30:00", tz = "GMT") @@ -238,7 +220,7 @@ test_that("dt$combine", { ( pl$lit(as.Date("2021-01-01")) $dt$combine(-3600 * 1.5E9, tu = "ns") - $cast(pl$Datetime(tu = "us", tz = "GMT")) + $cast(pl$Datetime("us", "GMT")) $to_r() ), as.POSIXct("2020-12-31 22:30:00", tz = "GMT") @@ -264,8 +246,7 @@ test_that("dt$year iso_year", { as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", - time_zone = "GMT", - eager = TRUE + time_zone = "GMT" ) )$with_columns( pl$col("date")$dt$year()$alias("year"), @@ -294,8 +275,7 @@ test_that("dt$quarter, month, day", { as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", - time_zone = "GMT", - eager = TRUE + time_zone = "GMT" ) )$with_columns( pl$col("date")$dt$quarter()$alias("quarter"), @@ -328,8 +308,7 @@ test_that("hour minute", { as.Date("2020-12-25"), as.Date("2021-05-05"), interval = "1d2h3m4s", - time_zone = "GMT", - eager = TRUE + time_zone = "GMT" ) )$with_columns( pl$col("date")$dt$hour()$alias("hour"), @@ -428,7 +407,7 @@ test_that("offset_by", { df = pl$DataFrame( dates = pl$date_range( as.Date("2000-1-1"), as.Date("2005-1-1"), "1y", - time_zone = "GMT", eager = TRUE + time_zone = "GMT" ) ) l_actual = df$with_columns( 
@@ -503,11 +482,11 @@ test_that("offset_by", { test_that("dt$epoch", { df = pl$select( - pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$epoch("ns")$alias("e_ns"), - pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$epoch("us")$alias("e_us"), - pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$epoch("ms")$alias("e_ms"), - pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$epoch("s")$alias("e_s"), - pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$epoch("d")$alias("e_d") + pl$date_range(as.Date("2022-1-1"))$dt$epoch("ns")$alias("e_ns"), + pl$date_range(as.Date("2022-1-1"))$dt$epoch("us")$alias("e_us"), + pl$date_range(as.Date("2022-1-1"))$dt$epoch("ms")$alias("e_ms"), + pl$date_range(as.Date("2022-1-1"))$dt$epoch("s")$alias("e_s"), + pl$date_range(as.Date("2022-1-1"))$dt$epoch("d")$alias("e_d") ) l_act = df$to_list() @@ -521,11 +500,11 @@ test_that("dt$epoch", { expect_identical(l_act$e_d, base_r_d_epochs) expect_grepl_error( - pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$epoch("bob"), + pl$date_range(as.Date("2022-1-1"))$dt$epoch("bob"), "epoch: tu must be one of 'ns', 'us', 'ms', 's', 'd'" ) expect_grepl_error( - pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$epoch(42), + pl$date_range(as.Date("2022-1-1"))$dt$epoch(42), "epoch: tu must be a string" ) }) @@ -534,7 +513,7 @@ test_that("dt$epoch", { test_that("dt$timestamp", { df = pl$DataFrame( date = pl$date_range( - start = as.Date("2001-1-1"), end = as.Date("2001-1-3"), interval = "1d", eager = TRUE + start = as.Date("2001-1-1"), end = as.Date("2001-1-3"), interval = "1d" ) ) l_exp = df$select( @@ -556,11 +535,11 @@ test_that("dt$timestamp", { expect_identical(suppressWarnings(as.numeric(l_exp$timestamp_ns)), base_r_s_timestamp * 1E9) expect_error( - pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$timestamp("bob") + pl$date_range(as.Date("2022-1-1"))$dt$timestamp("bob") ) expect_error( - pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$timestamp(42) + pl$date_range(as.Date("2022-1-1"))$dt$timestamp(42) ) }) @@ -568,8 +547,7 @@ test_that("dt$timestamp", { test_that("dt$with_time_unit cast_time_unit", { df_time = pl$DataFrame( date = pl$date_range( - start = as.POSIXct("2001-1-1"), end = as.POSIXct("2001-1-3"), interval = "1d", time_unit = "us", - eager = TRUE + start = as.POSIXct("2001-1-1"), end = as.POSIXct("2001-1-3"), interval = "1d", time_unit = "us" ) )$select( pl$col("date"), @@ -608,22 +586,22 @@ test_that("dt$with_time_unit cast_time_unit", { expect_grepl_error( - pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$cast_time_unit("bob"), + pl$date_range(as.Date("2022-1-1"))$dt$cast_time_unit("bob"), r"{The argument \[tu\] caused an error}" ) expect_grepl_error( - pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$cast_time_unit(42), + pl$date_range(as.Date("2022-1-1"))$dt$cast_time_unit(42), r"{Expected a value of type \[\&str\]}" ) # with wrong inputs expect_grepl_error( - pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$with_time_unit("bob"), + pl$date_range(as.Date("2022-1-1"))$dt$with_time_unit("bob"), r"{The argument \[tu\] caused an error}" ) expect_grepl_error( - pl$date_range(as.Date("2022-1-1"), eager = FALSE)$dt$with_time_unit(42), + pl$date_range(as.Date("2022-1-1"))$dt$with_time_unit(42), r"{Expected a value of type \[\&str\]}" ) }) @@ -683,7 +661,7 @@ test_that("dt$replace_time_zone", { df = pl$DataFrame( london_timezone = pl$date_range( start = as.POSIXct("2001-3-1"), end = as.POSIXct("2001-7-1"), - interval = "1mo", time_zone = "Europe/London", eager = TRUE + interval 
= "1mo", time_zone = "Europe/London" ) ) @@ -738,7 +716,7 @@ test_that("dt$days, dt$hours, dt$mminutes, dt$seconds, + ms, us, ns", { diffy = \(x, units) as.numeric(diff(x), units = units) # days df = pl$DataFrame(date = pl$date_range( - start = as.Date("2020-3-1"), end = as.Date("2020-5-1"), interval = "1mo", eager = TRUE + start = as.Date("2020-3-1"), end = as.Date("2020-5-1"), interval = "1mo" ))$with_columns( pl$col("date")$diff()$dt$total_days()$alias("diff") )$to_list() @@ -746,7 +724,7 @@ test_that("dt$days, dt$hours, dt$mminutes, dt$seconds, + ms, us, ns", { # hours df = pl$DataFrame(date = pl$date_range( - start = as.Date("2020-1-1"), end = as.Date("2020-1-4"), interval = "1d", eager = TRUE + start = as.Date("2020-1-1"), end = as.Date("2020-1-4"), interval = "1d" ))$with_columns( pl$col("date")$diff()$dt$total_hours()$alias("diff") )$to_list() @@ -754,7 +732,7 @@ test_that("dt$days, dt$hours, dt$mminutes, dt$seconds, + ms, us, ns", { # minutes df = pl$DataFrame(date = pl$date_range( - start = as.Date("2020-1-1"), end = as.Date("2020-1-4"), interval = "1d", eager = TRUE + start = as.Date("2020-1-1"), end = as.Date("2020-1-4"), interval = "1d" ))$with_columns( pl$col("date")$diff()$dt$total_minutes()$alias("diff") )$to_list() @@ -763,7 +741,7 @@ test_that("dt$days, dt$hours, dt$mminutes, dt$seconds, + ms, us, ns", { # seconds df = pl$DataFrame(date = pl$date_range( start = as.Date("2020-1-1"), end = as.POSIXct("2020-1-1 00:04:00", tz = "GMT"), - interval = "1m", eager = TRUE + interval = "1m" ))$with_columns( pl$col("date")$diff()$dt$total_seconds()$alias("diff") )$to_list() @@ -773,7 +751,7 @@ test_that("dt$days, dt$hours, dt$mminutes, dt$seconds, + ms, us, ns", { # milliseconds df = pl$DataFrame(date = pl$date_range( start = as.Date("2020-1-1"), end = as.POSIXct("2020-1-1 00:04:00", tz = "GMT"), - interval = "1m", eager = TRUE + interval = "1m" ))$with_columns( pl$col("date")$diff()$dt$total_milliseconds()$alias("diff") )$to_list() @@ -782,7 +760,7 @@ test_that("dt$days, dt$hours, dt$mminutes, dt$seconds, + ms, us, ns", { # microseconds df = pl$DataFrame(date = pl$date_range( start = as.Date("2020-1-1"), end = as.POSIXct("2020-1-1 00:04:00", tz = "GMT"), - interval = "1m", eager = TRUE + interval = "1m" ))$with_columns( pl$col("date")$diff()$dt$total_microseconds()$alias("diff") )$to_list() @@ -791,7 +769,7 @@ test_that("dt$days, dt$hours, dt$mminutes, dt$seconds, + ms, us, ns", { # nanoseconds df = pl$DataFrame(date = pl$date_range( start = as.Date("2020-1-1"), end = as.POSIXct("2020-1-1 00:04:00", tz = "GMT"), - interval = "1m", eager = TRUE + interval = "1m" ))$with_columns( pl$col("date")$diff()$dt$total_nanoseconds()$alias("diff") )$to_list() @@ -803,8 +781,7 @@ test_that("$dt$time()", { dates = pl$date_range( as.Date("2000-1-1"), as.Date("2000-1-2"), - "6h", - eager = TRUE + "6h" ) ) expect_identical( diff --git a/tests/testthat/test-expr_expr.R b/tests/testthat/test-expr_expr.R index d950a0671..9380a9dfa 100644 --- a/tests/testthat/test-expr_expr.R +++ b/tests/testthat/test-expr_expr.R @@ -35,41 +35,15 @@ test_that("expression boolean operators", { expect_equal(names(fails), character()) }) -test_that("expression Arithmetics", { - check_list = pl$DataFrame(list())$with_columns( - (pl$lit(1) / 2 == (1 / 2))$alias("1 / 2 == (1/2)"), - (pl$lit(1) + 2 == (1 + 2))$alias("1 + 2 == (1+2)"), - (pl$lit(1) * 2 == (1 * 2))$alias("1 * 2 == (1*2)"), - (pl$lit(1) - 2 == (1 - 2))$alias("1 - 2 == (1-2)"), - (pl$lit(1)$div(pl$lit(2)) == (1 / 2))$alias("1$div(2) == (1/2)"), - 
(pl$lit(1)$floor_div(pl$lit(2)) == (1 %/% 2))$alias("1$floor_div(2) == (1%/%2)"), - (pl$lit(1)$mod(pl$lit(2)) == (1 %% 2))$alias("1$mod(2) == (1%%2)"), - (pl$lit(1)$mod(pl$lit(-2)) == (1 %% -2))$alias("1$mod(2) != (1%%-2)"), - (pl$lit(1)$add(pl$lit(2)) == (1 + 2))$alias("1$add(2) == (1+2)"), - (pl$lit(1)$mul(pl$lit(2)) == (1 * 2))$alias("1$mul(2) == (1*2)"), - (pl$lit(1)$sub(pl$lit(2)) == (1 - 2))$alias("1$sub(2) == (1-2)") - )$to_data_frame(check.names = FALSE) - - results = unlist(check_list) - expect_true(all(results)) -}) - make_cases = function() { tibble::tribble( ~.test_name, ~fn, - "mul", "*", - "add", "+", - "sub", "-", - "div", "/", - "floor_div", "%/%", - "mod", "%%", "gt", ">", "gte", ">=", "lt", "<", "lte", "<=", "eq", "==", "neq", "!=", - "pow", "^", ) } @@ -2486,7 +2460,7 @@ test_that("rolling, basic", { df = pl$DataFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$ with_columns( - pl$col("dt")$str$strptime(pl$Datetime(tu = "us"), format = "%Y-%m-%d %H:%M:%S")$set_sorted() + pl$col("dt")$str$strptime(pl$Datetime("us"), format = "%Y-%m-%d %H:%M:%S")$set_sorted() ) out = df$with_columns( @@ -2515,7 +2489,7 @@ test_that("rolling, arg closed", { df = pl$DataFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$ with_columns( - pl$col("dt")$str$strptime(pl$Datetime(tu = "us"), format = "%Y-%m-%d %H:%M:%S")$set_sorted() + pl$col("dt")$str$strptime(pl$Datetime("us"), format = "%Y-%m-%d %H:%M:%S")$set_sorted() ) out = df$with_columns( @@ -2544,7 +2518,7 @@ test_that("rolling, arg offset", { df = pl$DataFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$ with_columns( - pl$col("dt")$str$strptime(pl$Datetime(tu = "us"), format = "%Y-%m-%d %H:%M:%S")$set_sorted() + pl$col("dt")$str$strptime(pl$Datetime("us"), format = "%Y-%m-%d %H:%M:%S")$set_sorted() ) # with offset = "1d", we start the window at one or two days after the value @@ -2571,7 +2545,7 @@ test_that("rolling, arg check_sorted", { df = pl$DataFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$ with_columns( - pl$col("dt")$str$strptime(pl$Datetime(tu = "us"), format = "%Y-%m-%d %H:%M:%S") + pl$col("dt")$str$strptime(pl$Datetime("us"), format = "%Y-%m-%d %H:%M:%S") ) expect_error( diff --git a/tests/testthat/test-expr_string.R b/tests/testthat/test-expr_string.R index ff6bbdc94..b36e3e527 100644 --- a/tests/testthat/test-expr_string.R +++ b/tests/testthat/test-expr_string.R @@ -769,3 +769,29 @@ test_that("str$replace_many()", { "same amount of patterns as replacement" ) }) + + +make_datetime_format_cases = function() { + tibble::tribble( + ~.test_name, ~time_str, ~datatype, ~type_expected, + "utc-example", "2020-01-01 01:00Z", pl$Datetime(), pl$Datetime("us", "UTC"), + "iso8602_1", "2020-01-01T01:00:00", pl$Datetime(), pl$Datetime("us"), + "iso8602_2", "2020-01-01T01:00", pl$Datetime(), pl$Datetime("us"), + "iso8602_3", "2020-01-01T01:00:00.000000001Z", pl$Datetime("ns"), pl$Datetime("ns", "UTC"), + "iso8602_4", "2020-01-01T01:00:00+09:00", pl$Datetime(), pl$Datetime("us", "UTC"), + "date_1", "2020-01-01", pl$Date, pl$Date, + "date_2", "2020/01/01", pl$Date, pl$Date, + "time_1", "01:00:00", pl$Time, pl$Time, + "time_2", "1:00:00", pl$Time, pl$Time, + "time_3", "13:00:00", pl$Time, pl$Time, + ) +} + +patrick::with_parameters_test_that( + "parse time without format specified", + { + s = pl$Series(time_str)$str$strptime(datatype) + expect_true(s$dtype == type_expected) + }, + .cases = make_datetime_format_cases() +) diff --git a/tests/testthat/test-groupby.R b/tests/testthat/test-groupby.R index be6b82bb7..eda89d200 100644 --- a/tests/testthat/test-groupby.R +++ 
b/tests/testthat/test-groupby.R @@ -336,7 +336,7 @@ test_that("group_by_dynamic for LazyFrame: arg 'start_by' works", { ), n = 0:6 )$with_columns( - pl$col("dt")$str$strptime(pl$Datetime("ms", tz = "UTC"), format = NULL)$set_sorted() + pl$col("dt")$str$strptime(pl$Datetime("ms", "UTC"), format = NULL)$set_sorted() ) # TODO: any weekday should return the same since it is ignored when there's no diff --git a/tests/testthat/test-s3-methods-operator.R b/tests/testthat/test-s3-methods-operator.R new file mode 100644 index 000000000..e9c2a7f98 --- /dev/null +++ b/tests/testthat/test-s3-methods-operator.R @@ -0,0 +1,50 @@ +make_cases = function() { + tibble::tribble( + ~.test_name, ~fn, + "add", `+`, + "sub", `-`, + "div", `/`, + "floor_div", `%/%`, + "mul", `*`, + "mod", `%%`, + "pow", `^`, + ) +} + + +patrick::with_parameters_test_that( + "s3-arithmetic", + { + vec = -5:5 + e = pl$lit(vec) + s = as_polars_series(vec) + + expect_equal(fn(e, 2)$to_series()$to_r(), fn(vec, 2)) + expect_equal(fn(2, e)$to_series()$to_r(), fn(2, vec)) + expect_equal(fn(s, 2)$to_r(), fn(vec, 2)) + expect_equal(fn(2, s)$to_r(), fn(2, vec)) + }, + .cases = make_cases() +) + + +test_that("`+` and `-` works without y", { + vec = -5:5 + e = pl$lit(vec) + s = as_polars_series(vec) + + expect_equal((+e)$to_series()$to_r(), +vec) + expect_equal((-e)$to_series()$to_r(), -vec) + expect_equal((+s)$to_r(), +vec) + expect_equal((-s)$to_r(), -vec) +}) + + +test_that("`+` works for strings", { + chr_vec = c("a", "b", "c") + + expect_equal((pl$lit(chr_vec) + "d")$to_series()$to_r(), paste0(chr_vec, "d")) + expect_equal(("d" + pl$lit(chr_vec))$to_series()$to_r(), paste0("d", chr_vec)) + expect_equal((as_polars_series(chr_vec) + "d")$to_r(), paste0(chr_vec, "d")) + expect_equal(("d" + as_polars_series(chr_vec))$to_r(), paste0("d", chr_vec)) +}) diff --git a/tests/testthat/test-s3_methods.R b/tests/testthat/test-s3-methods.R similarity index 100% rename from tests/testthat/test-s3_methods.R rename to tests/testthat/test-s3-methods.R diff --git a/tests/testthat/test-series.R b/tests/testthat/test-series.R index fc075b7a3..0d88b63ce 100644 --- a/tests/testthat/test-series.R +++ b/tests/testthat/test-series.R @@ -577,3 +577,12 @@ test_that("method from Expr", { test_that("cum_sum", { expect_equal(pl$Series(c(1, 2, NA, 3))$cum_sum()$to_r(), c(1, 3, NA, 6)) }) + +test_that("the dtype argument of pl$Series", { + expect_identical(pl$Series(1, dtype = pl$String)$to_r(), "1.0") + expect_error(pl$Series("foo", dtype = pl$Int32), "conversion from `str` to `i32`") +}) + +test_that("the nan_to_null argument of pl$Series", { + expect_identical(pl$Series(c(1, 2, NA, NaN), nan_to_null = TRUE)$to_r(), c(1, 2, NA, NA)) +}) diff --git a/tests/testthat/test-sink_stream.R b/tests/testthat/test-sink_stream.R index d85a73dce..502076717 100644 --- a/tests/testthat/test-sink_stream.R +++ b/tests/testthat/test-sink_stream.R @@ -158,8 +158,7 @@ test_that("sink_csv: date_format works", { date = pl$date_range( as.Date("2020-01-01"), as.Date("2023-01-02"), - interval = "1y", - eager = TRUE + interval = "1y" ) ) dat$sink_csv(temp_out, date_format = "%Y") @@ -182,8 +181,7 @@ test_that("sink_csv: datetime_format works", { date = pl$date_range( as.Date("2020-01-01"), as.Date("2020-01-02"), - interval = "6h", - eager = TRUE + interval = "6h" ) ) dat$sink_csv(temp_out, datetime_format = "%Hh%Mm - %d/%m/%Y") @@ -202,8 +200,7 @@ test_that("sink_csv: time_format works", { date = pl$date_range( as.Date("2020-10-17"), as.Date("2020-10-18"), - "8h", - eager = TRUE + "8h" ) 
)$with_columns(pl$col("date")$dt$time()) dat$sink_csv(temp_out, time_format = "%Hh%Mm%Ss") diff --git a/tools/lib-sums.tsv b/tools/lib-sums.tsv new file mode 100644 index 000000000..d2b965c37 --- /dev/null +++ b/tools/lib-sums.tsv @@ -0,0 +1,6 @@ +url sha256sum +https://github.com/pola-rs/r-polars/releases/download/lib-v0.38.1/libr_polars-0.38.1-aarch64-apple-darwin.tar.gz 74496332ba599d9829f8f418c8f1cfd1d445282c6173b661a6a8afb86b617498 +https://github.com/pola-rs/r-polars/releases/download/lib-v0.38.1/libr_polars-0.38.1-aarch64-unknown-linux-gnu.tar.gz 298ca83bf0b27883ec7c10d23dbfbe8e044032110a8b0a4f144ae55ab160f371 +https://github.com/pola-rs/r-polars/releases/download/lib-v0.38.1/libr_polars-0.38.1-x86_64-apple-darwin.tar.gz 2b8ad42c90dbc281149afae03e874ec47ccc8fcfa3ee064e7a364aa296bb1132 +https://github.com/pola-rs/r-polars/releases/download/lib-v0.38.1/libr_polars-0.38.1-x86_64-pc-windows-gnu.tar.gz 88a72fd6ace7574b30c2dd542ded040c7a291130048772ee4bdf006daef8b7a4 +https://github.com/pola-rs/r-polars/releases/download/lib-v0.38.1/libr_polars-0.38.1-x86_64-unknown-linux-gnu.tar.gz 8ba938024659256b9478af4b65ad9f7568b73581b3e9c99eb8edabb8159f5e27 diff --git a/vignettes/performance.Rmd b/vignettes/performance.Rmd index 9c7e7fddb..281aa1401 100644 --- a/vignettes/performance.Rmd +++ b/vignettes/performance.Rmd @@ -17,7 +17,7 @@ options(rmarkdown.html_vignette.check_title = FALSE) As highlighted by the [DuckDB benchmarks](https://duckdblabs.github.io/db-benchmark/), -`polars` is very efficient to deal with large datasets. Still, one can make `polars` +`polars` is very efficient to deal with large datasets. Still, one can make `polars` even faster by following some good practices. @@ -100,7 +100,7 @@ will internally check whether it can be optimized, for example by reordering some operations. Let's re-use the example above but this time with `polars` syntax and 10M -observations. For the purpose of this vignette, we can create a `LazyFrame` +observations. For the purpose of this vignette, we can create a `LazyFrame` directly in our session, but if the data was stored in a CSV file for instance, we would have to scan it first with `pl$scan_csv()`: @@ -140,7 +140,7 @@ lazy_query = lf_test$ lazy_query ``` -However, this doesn't do anything to the data until we call `collect()` at the +However, this doesn't do anything to the data until we call `collect()` at the end. We can now compare the two approaches (in the `lazy` timing, calling `collect()` both reads the data and process it, so we include the data loading part in the `eager` timing as well): @@ -165,11 +165,11 @@ bench::mark( On this very simple query, using lazy execution instead of eager execution lead -to a 1.7-2.2x decrease in time. +to a 1.7-2.2x decrease in time. So what happened? Under the hood, `polars` reorganized the query so that it -filters rows while reading the csv into memory, and then sorts the remaining -data. This can be seen by comparing the original query (`describe_plan()`) and +filters rows while reading the csv into memory, and then sorts the remaining +data. 
This can be seen by comparing the original query (`describe_plan()`) and the optimized query (`describe_optimized_plan()`): ```{r} @@ -179,7 +179,7 @@ lazy_query$describe_optimized_plan() ``` -Note that the queries must be read from bottom to top, i.e the optimized query +Note that the queries must be read from bottom to top, i.e the optimized query is "select the dataset where the column 'country' matches these values, then sort the data by the values of 'country'". @@ -188,13 +188,13 @@ the data by the values of 'country'". `polars` comes with a large number of built-in, optimized, basic functions that should cover most aspects of data wrangling. These functions are designed to be -very memory efficient. Therefore, using R functions or converting data back and +very memory efficient. Therefore, using R functions or converting data back and forth between `polars` and R is discouraged as it can lead to a large decrease in efficiency. Let's use the test data from the previous section and let's say that we only want to check whether each country contains "na". This can be done in (at least) two -ways: with the built-in function `contains()` and with the base R function +ways: with the built-in function `contains()` and with the base R function `grepl()`. However, using the built-in function is much faster: ```r @@ -207,7 +207,7 @@ bench::mark( grepl("na", s) }) ), - grepl_nv = df_test$limit(1e6)$with_columns( + grepl_nv = df_test$limit(1e6)$with_columns( pl$col("country")$apply(\(str) { grepl("na", str) }, return_type = pl$Boolean) @@ -221,12 +221,12 @@ bench::mark( #> # A tibble: 3 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 contains 387.02ms 432.12ms 2.27 401.86KB 0 +#> 1 contains 387.02ms 432.12ms 2.27 401.86KB 0 #> 2 grepl 2.06s 2.11s 0.466 114.79MB 0.512 #> 3 grepl_nv 6.42s 6.52s 0.153 7.65MB 10.3 ``` -Using custom R functions can be useful, but when possible, you should use the +Using custom R functions can be useful, but when possible, you should use the functions provided by `polars`. See the Reference tab for a complete list of functions. @@ -236,7 +236,7 @@ functions. Finally, quoting [Polars User Guide](https://pola-rs.github.io/polars-book/user-guide/concepts/streaming/): > One additional benefit of the lazy API is that it allows queries to be executed -> in a streaming manner. Instead of processing the data all-at-once Polars can +> in a streaming manner. Instead of processing the data all-at-once Polars can > execute the query in batches allowing you to process datasets that are > larger-than-memory. diff --git a/vignettes/polars.Rmd b/vignettes/polars.Rmd index 9cebc4f9f..627538aae 100644 --- a/vignettes/polars.Rmd +++ b/vignettes/polars.Rmd @@ -319,7 +319,7 @@ column. See the section below for more details on data types. ## Reshape Polars supports data reshaping, going from both long to wide (a.k.a. "pivotting", -or `pivot_wider()` in `tidyr`), and from wide to long (a.k.a. "unpivotting", +or `pivot_wider()` in `tidyr`), and from wide to long (a.k.a. "unpivotting", "melting", or `pivot_longer()` in `tidyr`). Let's switch to the `Indometh` dataset to demonstrate some basic examples. Note that the data are currently in long format.
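
The `conversion_r_to_s.rs` / `conversion_s_to_r.rs` hunks above route zone-less datetimes through `Sys.timezone()`. A minimal R sketch of the round-trip behaviour the new tests appear to check — illustrative values only, not part of the patch:

```r
library(polars)

# A naive (zone-less) POSIXct is carried through as a zone-less Datetime and
# re-anchored to Sys.timezone() on the way back, so the value round-trips.
naive = as.POSIXct("2022-01-01 12:00:00")
pl$lit(naive)$to_r()

# A zoned POSIXct keeps its "tzone" attribute through the round trip.
zoned = as.POSIXct("2022-01-01 12:00:00", tz = "UTC")
pl$lit(zoned)$to_r()

# Parsing a string into a Datetime without a time zone behaves the same way.
pl$lit("2022-01-01")$str$strptime(pl$Datetime(), "%F")$to_r()
```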
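The new `$partition_by()` method added in `rdataframe/mod.rs` and exercised in `test-dataframe.R` splits a `DataFrame` into a list of frames by key columns. A usage sketch based on the options the tests cover; the data and column names here are made up:

```r
library(polars)

df = pl$DataFrame(
  col1 = 1:5,
  col2 = c("a", "a", "b", "b", "b")
)

# One DataFrame per group, preserving the input row order
parts = df$partition_by("col2", maintain_order = TRUE)

# Drop the key column from each output frame
parts_no_key = df$partition_by("col2", include_key = FALSE)

# Return list(key = ..., data = ...) pairs instead of bare DataFrames
nested = df$partition_by("col2", as_nested_list = TRUE)
nested[[1]]$key$col2
nested[[1]]$data

# Key columns can also be selected by data type, e.g. all String columns
by_type = df$partition_by(pl$String)
```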
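`test-dataframe.R` also adds construction of an empty, typed `DataFrame` from a schema alone. A condensed sketch (column names illustrative):

```r
library(polars)

# Zero rows, but the columns carry the requested dtypes
df = pl$DataFrame(schema = list(int = pl$Int32, string = pl$String))
df$shape
df$dtypes
```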
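`test-datatype.R` adds `"*"` as a wildcard time zone for `pl$Datetime()` in column selection. A sketch mirroring that test:

```r
library(polars)

df = pl$DataFrame(
  naive_time = as.POSIXct("1900-01-01"),
  zoned_time = as.POSIXct("1900-01-01", "UTC")
)

# pl$Datetime("ms", "*") matches datetime[ms] columns with any (non-null)
# time zone, so only `zoned_time` is selected here.
df$select(pl$col(pl$Datetime("ms", "*")))$width
```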
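Many of the test changes simply drop `eager = TRUE` / `eager = FALSE` from `pl$date_range()`; the updated tests call it the same way in both frame-construction and expression contexts. A condensed sketch of those two usages, with illustrative dates:

```r
library(polars)

# Inside a DataFrame constructor (previously needed `eager = TRUE`)
df = pl$DataFrame(
  date = pl$date_range(as.Date("2020-01-01"), as.Date("2020-01-02"), interval = "6h")
)

# Materialized directly to an R vector (previously written with `eager = FALSE`)
pl$date_range(as.Date("2020-01-01"), as.Date("2020-01-02"), interval = "6h")$to_r()
```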
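The new `test-expr_string.R` cases exercise `$str$strptime()` with no `format` argument. A condensed sketch of the inference behaviour they check:

```r
library(polars)

# With no format supplied, the format is inferred from the string; the
# resulting dtype depends on the target type and any zone/offset present.
pl$Series("2020-01-01T01:00:00")$str$strptime(pl$Datetime())$dtype  # Datetime("us")
pl$Series("2020-01-01 01:00Z")$str$strptime(pl$Datetime())$dtype    # Datetime("us", "UTC")
pl$Series("2020/01/01")$str$strptime(pl$Date)$dtype                 # Date
pl$Series("13:00:00")$str$strptime(pl$Time)$dtype                   # Time
```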
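The new `test-s3-methods-operator.R` covers base-R operators dispatching on `Expr` and `Series`. A short sketch of the behaviour it tests, with illustrative values:

```r
library(polars)

vec = -5:5
e = pl$lit(vec)
s = as_polars_series(vec)

(e + 2)$to_series()$to_r()   # same result as vec + 2
(2 / e)$to_series()$to_r()   # the R value can sit on either side
(s %% 3)$to_r()              # operators also dispatch on Series
(-s)$to_r()                  # unary +/- are supported as well

# `+` concatenates strings
(pl$lit(c("a", "b")) + "d")$to_series()$to_r()
("d" + as_polars_series(c("a", "b")))$to_r()
```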
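Finally, `test-series.R` adds the `dtype` and `nan_to_null` arguments of `pl$Series()`. A minimal sketch:

```r
library(polars)

pl$Series(1, dtype = pl$String)$to_r()                  # "1.0": cast at construction
pl$Series(c(1, 2, NA, NaN), nan_to_null = TRUE)$to_r()  # NaN is converted to NA
```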