-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_arrow.R
38 lines (32 loc) · 1.43 KB
/
create_arrow.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
library(tidyverse) # data science in R
library(arrow) # fast data types
library(tictoc) # timing code
library(haven) # package for reading in files from other softwares
library(purrr) # for maps or 'loops'
# Inputs ----
data_rows <- 9789093 # total number of rows in the SAS data set
chunk_size <- 250000 # how many rows to read/write at a time

# Row offsets (number of rows to skip) at which each chunk starts.
# Derived from the chunk count rather than seq(0, data_rows, by = chunk_size)
# so that a row count that is an exact multiple of chunk_size does not yield
# a trailing offset equal to data_rows, i.e. an empty final chunk.
first <- (seq_len(ceiling(data_rows / chunk_size)) - 1) * chunk_size
# first: 0, 250000, 500000, ..., 9750000 (40 offsets)
#' Convert one chunk of a SAS file to Feather and Parquet
#'
#' Reads up to `max_rows` rows of `sas_file` after skipping the first `first`
#' rows, normalises the `marathon` column (lower case, spaces replaced by
#' underscores, the byte `\x97` removed) and writes the chunk to
#' `<folder_path>/feather/file_<first/1000>k.feather` and
#' `<folder_path>/parquet/file_<first/1000>k.parquet`.
#'
#' The `feather` and `parquet` output folders are created under
#' `folder_path` if they do not already exist.
#'
#' @param first Number of rows to skip before reading, i.e. the zero-based
#'   row offset at which this chunk starts.
#' @param max_rows Maximum number of rows to read. Defaults to the
#'   script-level `chunk_size`.
#' @param sas_file Path of the `.sas7bdat` file to read.
#' @param folder_path Directory that holds the `feather/` and `parquet/`
#'   output folders.
#' @return `first / 1000`, invisibly; the value is also printed as a
#'   progress marker.
sas_arrow <- function(first, max_rows = chunk_size,
                      sas_file = "data/marathon.sas7bdat",
                      folder_path = "arrow") {
  dat <- haven::read_sas(sas_file, skip = first, n_max = max_rows)

  dat <- dat %>%
    mutate(
      marathon = marathon %>%
        stringr::str_to_lower() %>%
        stringr::str_replace_all(" ", "_") %>%
        # NOTE(review): "\x97" is presumably a Windows-1252 em dash left in
        # the source data -- confirm the file's encoding.
        stringr::str_replace_all("\x97", "")
    )

  # Chunk label is the offset in thousands of rows. "%.0f" rounds, so an
  # offset that is not a multiple of 1000 is only approximated in the name.
  chunk_label <- sprintf("%.0f", first / 1000)
  feather_fname <- file.path(
    folder_path, "feather", paste0("file_", chunk_label, "k.feather")
  )
  parquet_fname <- file.path(
    folder_path, "parquet", paste0("file_", chunk_label, "k.parquet")
  )

  # The writers fail when the target folder is missing, so create both
  # output folders up front; existing folders are left untouched.
  for (out_file in c(feather_fname, parquet_fname)) {
    dir.create(dirname(out_file), recursive = TRUE, showWarnings = FALSE)
  }

  # Original author's note: no compression codecs were installed when this
  # script was run.
  arrow::write_feather(dat, feather_fname)
  arrow::write_parquet(dat, parquet_fname)

  # Progress marker: offset of the chunk just written, in thousands of rows.
  print(first / 1000)
}
# Convert every chunk and time the whole run.
tic()
# walk() rather than map(): sas_arrow() is called only for its side effects
# (files written, progress printed), so there is no result list worth
# building or auto-printing inside the timed region.
purrr::walk(first, sas_arrow)
toc()
# 122.029 sec elapsed (timing noted by the original author)