Skip to content

Commit

Permalink
bug introduced by update to cwf function. closes #47
Browse files Browse the repository at this point in the history
  • Loading branch information
floswald committed May 29, 2024
1 parent bfacc28 commit 55f53a4
Show file tree
Hide file tree
Showing 8 changed files with 130 additions and 27 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ cookies.txt$
\.Rhistory$
^.*\.Rproj$
^\.Rproj\.user$
.github
_pkgdown.yml
50 changes: 50 additions & 0 deletions .github/workflows/check-standard.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help

# Run on pushes and pull requests that target main or master.
on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]

name: R-CMD-check

permissions: read-all

jobs:
  R-CMD-check:
    runs-on: ${{ matrix.config.os }}

    name: ${{ matrix.config.os }} (${{ matrix.config.r }})

    strategy:
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
      matrix:
        config:
          - {os: macos-latest, r: 'release'}
          - {os: windows-latest, r: 'release'}
          - {os: ubuntu-latest, r: 'release'}

    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
      R_KEEP_PKG_SOURCE: yes

    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-pandoc@v2

      - uses: r-lib/actions/setup-r@v2
        with:
          r-version: ${{ matrix.config.r }}
          # No matrix entry above defines `http-user-agent`, so this expands to an empty value.
          http-user-agent: ${{ matrix.config.http-user-agent }}
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::rcmdcheck
          needs: check

      - uses: r-lib/actions/check-r-package@v2
        with:
          upload-snapshots: true
          build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")'
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata
9 changes: 0 additions & 9 deletions .travis.yml

This file was deleted.

6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: psidR
Type: Package
Title: Build Panel Data Sets from PSID Raw Data
Version: 2.1
Date: 2021-05-06
Version: 2.2
Date: 2024-05-29
Author: Florian Oswald
Maintainer: Florian Oswald <[email protected]>
Description: Makes it easy to build panel data in wide format from Panel Survey
Expand Down Expand Up @@ -31,4 +31,4 @@ Collate:
'psidR-package.r'
Suggests:
testthat
RoxygenNote: 7.1.1
RoxygenNote: 7.2.3
30 changes: 24 additions & 6 deletions R/build.panel.r
Original file line number Diff line number Diff line change
Expand Up @@ -336,13 +336,31 @@ build.panel <- function(datadir=NULL,fam.vars,ind.vars=NULL,heads.only=FALSE,cur
# convert fam.vars to data.table
stopifnot(is.data.frame(fam.vars))
fam.vars <- data.table(fam.vars)

if (!all(c("year") %in% names(fam.vars))){
stop("your fam.vars needs to contain column `year`")
}

nlist = grepl(".year|.variable", names(fam.vars))
if (any(nlist)){
flog.error("your `fam.vars` contains illegal names ", names(fam.vars)[nlist], capture = TRUE)
stop()
}
fam.vars <- copy(fam.vars[,lapply(.SD,make.char)])
setkey(fam.vars,year)

# convert ind.vars to data.table if not null
if (!is.null(ind.vars)){
stopifnot(is.data.frame(ind.vars))
if (!all(c("year") %in% names(ind.vars))){
stop("your ind.vars needs to contain columns `year`")
}
ind.vars <- data.table(ind.vars)
nlist = grepl(".year|.variable", names(ind.vars))
if (any(nlist)){
flog.error("your `ind.vars` contains illegal names ", names(ind.vars)[nlist], capture = TRUE)
stop()
}
ind.vars <- copy(ind.vars[,lapply(.SD,make.char)])
setkey(ind.vars,year)
}
Expand Down Expand Up @@ -515,7 +533,7 @@ build.panel <- function(datadir=NULL,fam.vars,ind.vars=NULL,heads.only=FALSE,cur
tmp <- data.table(tmp)

vs = ceiling(object.size(tmp)/1024^2)
flog.debug('loaded family file: ',fam.dat[iy])
flog.debug('loaded family file: %s',fam.dat[iy])
flog.debug('current memory load in MB: %d',vs)


Expand Down Expand Up @@ -668,8 +686,8 @@ medium.test.ind <- function(dd=NULL){
cwf = openxlsx::read.xlsx(system.file(package="psidR","psid-lists","psid.xlsx"))
head_age_var_name <- getNamesPSID("ER17013", cwf, years=c(2003,2005,2007))
educ = getNamesPSID("ER30323",cwf,years=c(2003,2005,2007))
famvars = data.frame(year=c(2003,2005,2007),age=head_age_var_name)
indvars = data.frame(year=c(2003,2005,2007),educ=educ)
famvars = data.frame(year=c(2003,2005,2007),age=head_age_var_name$variable)
indvars = data.frame(year=c(2003,2005,2007),educ=educ$variable)
build.panel(fam.vars=famvars,ind.vars=indvars,datadir=dd)
}

Expand All @@ -680,7 +698,7 @@ medium.test.ind <- function(dd=NULL){
medium.test.noind <- function(dd=NULL){
# Builds a family-variables-only panel (no ind.vars) for 2003, 2005 and 2007;
# `dd` is forwarded unchanged to build.panel() as `datadir`.
# The lookup table is read from the psid.xlsx file installed with the psidR package.
cwf = openxlsx::read.xlsx(system.file(package="psidR","psid-lists","psid.xlsx"))
# Look up "ER17013" for each of the three requested years.
head_age_var_name <- getNamesPSID("ER17013", cwf, years=c(2003,2005,2007))
# NOTE(review): this assignment is overwritten by the very next line; it looks
# like the pre-change side of the diff -- confirm before relying on it.
famvars = data.frame(year=c(2003,2005,2007),age=head_age_var_name)
# Only the `variable` column of the lookup result is used as the `age` column.
famvars = data.frame(year=c(2003,2005,2007),age=head_age_var_name$variable)
build.panel(fam.vars=famvars,datadir=dd)
}

Expand All @@ -694,8 +712,8 @@ medium.test.ind.NA <- function(dd=NULL){
head_age_var_name <- getNamesPSID("ER17013", cwf, years=c(2003,2005,2007))
educ = getNamesPSID("ER30323",cwf,years=c(2003,2005,2007))
educ[2] = NA
famvars = data.frame(year=c(2003,2005,2007),age=head_age_var_name)
indvars = data.frame(year=c(2003,2005,2007),educ=educ)
famvars = data.frame(year=c(2003,2005,2007),age=head_age_var_name$variable)
indvars = data.frame(year=c(2003,2005,2007),educ=educ$variable)
build.panel(fam.vars=famvars,ind.vars=indvars,datadir=dd,loglevel = DEBUG)
}

Expand Down
44 changes: 35 additions & 9 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,38 @@ The [Panel Study of Income Dynamics](http://psidonline.isr.umich.edu/) is a publ
This package attempts to help the task of building a panel dataset. The user directly downloads ASCII data from the PSID server into `R`, **without the need** for any other software like stata or sas. To build the panel, the user must then specify the variable names in each wave of the questionnaire in a data.frame `fam.vars`, as well as the variables from the individual index in `ind.vars`. The helper function `getNamesPSID` is helpful in finding different variable names across waves - see examples below.


### Quick Start
### Quick Start and `API`

1. You must supply at least one data.frame with variables to read from the family file. Most of the time you will also supply a data.frame with variables from the individual files to read.
2. Those data.frames **must** be in the following format: column `year` is an integer indicating the calendar year, and the names of the other columns are the _variable names which will appear in your panel_.

```R
> head(i) # individual file example
year age educ empstat weight
1: 1968 ER30004 ER30010 <NA> ER30019
2: 1969 ER30023 <NA> <NA> ER30042 # NOTICE THE NA for educ HERE!!
3: 1970 ER30046 ER30052 <NA> ER30066
4: 1971 ER30070 ER30076 <NA> ER30090
5: 1972 ER30094 ER30100 <NA> ER30116
6: 1973 ER30120 ER30126 <NA> ER30137

> head(f) # family file example
year age_youngest_child debt empstat_ faminc hours hvalue ...
1: 1968 V120 <NA> V196 V81 V47 V5 ...
2: 1969 V1013 <NA> V639 V529 V465 V449 ...
3: 1970 V1243 <NA> V1278 V1514 V1138 V1122 ...
4: 1971 V1946 <NA> V1983 V2226 V1839 V1823 ...
5: 1972 V2546 <NA> V2581 V2852 V2439 V2423 ...
6: 1973 V3099 <NA> V3114 V3256 V3027 V3021 ...
```

Example usage:


```R
> library(psidR)

> build.psid(datadr = "~/data/PSID", small = TRUE) # directory `datadr` must exist!
> build.psid(datadir = "~/data/PSID", small = TRUE) # directory `datadir` must exist!
INFO [2021-07-13 10:34:26] Will download missing datasets now
INFO [2021-07-13 10:34:26] will download family files: 2013, 2015
INFO [2021-07-13 10:34:26] will download latest individual index: IND2019ER
Expand Down Expand Up @@ -133,7 +159,7 @@ i = fread(file.path(r,"psid-lists","indvars.txt"))
613: PSID Family-level 2017 ER66163 A52 LIKELIHOOD OF MOVING likelihood_move

# alternatively, use `getNamesPSID`:
# cwf <- read.xlsx("http://psidonline.isr.umich.edu/help/xyr/psid.xlsx")
# cwf <- openxlsx::read.xlsx("http://psidonline.isr.umich.edu/help/xyr/psid.xlsx")
# Suppose you know the name of the variable in a certain year, and it is
# "ER17013". Then get the corresponding name in another year with
# getNamesPSID("ER17013", cwf, years = 2001) # 2001 only
Expand Down Expand Up @@ -182,7 +208,7 @@ Here are some tests:
cwf = openxlsx::read.xlsx(system.file(package="psidR","psid-lists","psid.xlsx"))
head_age_var_name <- getNamesPSID("ER17013", cwf, years=c(2003))
# create family vars data.frame
famvars = data.frame(year=c(2003),age=head_age_var_name)
famvars = data.frame(year=c(2003),variable=head_age_var_name$variable)
# call function
build.panel(fam.vars=famvars,datadir=dd)

Expand All @@ -192,8 +218,8 @@ build.panel(fam.vars=famvars,datadir=dd)
cwf = openxlsx::read.xlsx(system.file(package="psidR","psid-lists","psid.xlsx"))
head_age_var_name <- getNamesPSID("ER17013", cwf, years=c(2003))
educ = getNamesPSID("ER30323",cwf,years=2003)
famvars = data.frame(year=c(2003),age=head_age_var_name)
indvars = data.frame(year=c(2003),educ=educ)
famvars = data.frame(year=c(2003),variable=head_age_var_name$variable)
indvars = data.frame(year=c(2003),variable=educ$variable)
build.panel(fam.vars=famvars,ind.vars=indvars,datadir=dd)


Expand All @@ -202,9 +228,9 @@ build.panel(fam.vars=famvars,ind.vars=indvars,datadir=dd)

cwf = openxlsx::read.xlsx(system.file(package="psidR","psid-lists","psid.xlsx"))
head_age_var_name <- getNamesPSID("ER17013", cwf, years=c(2003,2005,2007))
educ = getNamesPSID("ER30323",cwf,years=c(2003,2005,2007))
famvars = data.frame(year=c(2003,2005,2007),age=head_age_var_name)
indvars = data.frame(year=c(2003,2005,2007),educ=educ)
educ = getNamesPSID("ER30323",cwf,years=c(2003,2005,2007))
famvars = data.frame(year=c(2003,2005,2007),variable=head_age_var_name$variable)
indvars = data.frame(year=c(2003,2005,2007),variable=educ$variable)
build.panel(fam.vars=famvars,ind.vars=indvars,datadir=dd)

# etc for
Expand Down
12 changes: 12 additions & 0 deletions tests/testthat/test_buildpanel.R
Original file line number Diff line number Diff line change
Expand Up @@ -114,4 +114,16 @@ test_that("check subsetting to core/immigrant/latino", {

} )

test_that("wrong famvars and indvars raises an error", {
  # Spreadsheet downloaded from the PSID site; it is handed to getNamesPSID()
  # as its second argument.
  crosswalk <- openxlsx::read.xlsx("http://psidonline.isr.umich.edu/help/xyr/psid.xlsx")

  # Case 1: pass the return value of getNamesPSID() straight in as fam.vars.
  raw_names <- getNamesPSID("ER17013", crosswalk, years = c(2005, 2007, 2009))
  expect_error(build.panel(datadir = my.dir, fam.vars = raw_names))

  # Case 2: use the whole getNamesPSID() result (looked up for 2003 only) as
  # the `age` column of a data.frame whose years are 2005, 2007 and 2009.
  age_2003 <- getNamesPSID("ER17013", crosswalk, years = c(2003))
  nested_famvars <- data.frame(year = c(2005, 2007, 2009), age = age_2003)
  expect_error(build.panel(datadir = my.dir, fam.vars = nested_famvars))
})


0 comments on commit 55f53a4

Please sign in to comment.