Skip to content

Commit

Permalink
Merge pull request #2828 from martinholmer/tmd-weights-scaling
Browse files Browse the repository at this point in the history
Add weights_scale attribute to the Records and Data classes
  • Loading branch information
martinholmer authored Oct 27, 2024
2 parents cfca1d5 + 163d040 commit a559002
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 19 deletions.
7 changes: 7 additions & 0 deletions taxcalc.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@ Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: setuptools
Requires-Dist: numpy
Requires-Dist: pandas
Requires-Dist: bokeh
Requires-Dist: numba
Requires-Dist: requests
Requires-Dist: paramtools>=0.18.3

| | |
| --- | --- |
Expand Down
2 changes: 1 addition & 1 deletion taxcalc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@
from taxcalc.utils import *
from taxcalc.cli import *

__version__ = '4.3.0'
__version__ = '4.3.0e'
__min_python3_version__ = 10
__max_python3_version__ = 12
18 changes: 13 additions & 5 deletions taxcalc/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ class Data():
NOTE: when using custom weights, set this argument to a DataFrame.
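NOTE: weights file contents are multiplied by weights_scale (documented below) to produce s006; they are no longer assumed to be integers equal to 100 times the real weights.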
weights_scale: float
specifies the weights scaling factor used to convert contents
of weights file into the s006 variable. PUF and CPS input data
generated in the taxdata repository use a weights_scale of 0.01,
while TMD input data generated in the tax-microdata repository
use a 1.0 weights_scale value.
Raises
------
ValueError:
Expand All @@ -66,7 +73,8 @@ class instance: Data
VARINFO_FILE_NAME = None
VARINFO_FILE_PATH = None

def __init__(self, data, start_year, gfactors=None, weights=None):
def __init__(self, data, start_year, gfactors=None,
weights=None, weights_scale=0.01):
# initialize data variable info sets and read variable information
self.INTEGER_READ_VARS = set()
self.MUST_READ_VARS = set()
Expand Down Expand Up @@ -97,6 +105,7 @@ def __init__(self, data, start_year, gfactors=None, weights=None):
self.gfactors = gfactors
# read sample weights
self.WT = None
self.weights_scale = weights_scale
if self.__aging_data:
self._read_weights(weights)
# ... weights must be same size as data
Expand All @@ -114,7 +123,7 @@ def __init__(self, data, start_year, gfactors=None, weights=None):
assert wt_colname in self.WT.columns, (
f'no weights for start year {self.current_year}'
)
self.s006 = self.WT[wt_colname] * 0.01
self.s006 = self.WT[wt_colname] * self.weights_scale

@property
def data_year(self):
Expand Down Expand Up @@ -152,7 +161,7 @@ def increment_year(self):
assert wt_colname in self.WT.columns, (
f'no weights for new year {self.current_year}'
)
self.s006 = self.WT[wt_colname] * 0.01
self.s006 = self.WT[wt_colname] * self.weights_scale

# ----- begin private methods of Data class -----

Expand Down Expand Up @@ -260,7 +269,6 @@ def _read_weights(self, weights):
Read sample weights from file or
use specified DataFrame as weights or
return immediately, reading no weights, if None.
NOTE: assumes weights are integers equal to 100 times the real weight.
"""
if weights is None:
return
Expand All @@ -276,7 +284,7 @@ def _read_weights(self, weights):
msg = 'weights is not None or a string or a Pandas DataFrame'
raise ValueError(msg)
assert isinstance(WT, pd.DataFrame)
setattr(self, 'WT', WT.astype(np.int32))
setattr(self, 'WT', WT.astype(np.float64))
del WT

def _extrapolate(self, year):
Expand Down
17 changes: 13 additions & 4 deletions taxcalc/records.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class Records(Data):
None creates empty sample-weights DataFrame;
default value is filename of the PUF weights.
NOTE: when using custom weights, set this argument to a DataFrame.
NOTE: assumes weights are integers that are 100 times the real weights.
NOTE: see weights_scale documentation below.
adjust_ratios: string or Pandas DataFrame or None
string describes CSV file in which adjustment ratios reside;
Expand All @@ -69,6 +69,13 @@ class Records(Data):
any smoothing of stair-step provisions in income tax law;
default value is false.
weights_scale: float
specifies the weights scaling factor used to convert contents
of weights file into the s006 variable. PUF and CPS input data
generated in the taxdata repository use a weights_scale of 0.01,
while TMD input data generated in the tax-microdata repository
use a 1.0 weights_scale value.
Raises
------
ValueError:
Expand Down Expand Up @@ -127,11 +134,12 @@ def __init__(self,
gfactors=GrowFactors(),
weights=PUF_WEIGHTS_FILENAME,
adjust_ratios=PUF_RATIOS_FILENAME,
exact_calculations=False):
exact_calculations=False,
weights_scale=0.01):
# pylint: disable=no-member,too-many-branches
if isinstance(weights, str):
weights = os.path.join(Records.CODE_PATH, weights)
super().__init__(data, start_year, gfactors, weights)
super().__init__(data, start_year, gfactors, weights, weights_scale)
if data is None:
return # because there are no data
# read adjustment ratios
Expand Down Expand Up @@ -228,7 +236,7 @@ def tmd_constructor(
data_path: Path,
weights_path: Path,
growfactors_path: Path,
exact_calculations=False
exact_calculations=False,
): # pragma: no cover
"""
Static method returns a Records object instantiated with TMD
Expand All @@ -250,6 +258,7 @@ def tmd_constructor(
gfactors=GrowFactors(growfactors_filename=str(growfactors_path)),
adjust_ratios=None,
exact_calculations=exact_calculations,
weights_scale=1.0,
)

def increment_year(self):
Expand Down
23 changes: 17 additions & 6 deletions taxcalc/taxcalcio.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,15 +355,17 @@ def init(self, input_data, tax_year, baseline, reform, assump,
weights=wghts,
gfactors=gfactors_ref,
adjust_ratios=None,
exact_calculations=exact_calculations
exact_calculations=exact_calculations,
weights_scale=1.0,
)
recs_base = Records(
data=pd.read_csv(input_data),
start_year=Records.TMDCSV_YEAR,
weights=wghts,
gfactors=gfactors_base,
adjust_ratios=None,
exact_calculations=exact_calculations
exact_calculations=exact_calculations,
weights_scale=1.0,
)
else: # if not {cps|tmd}_input_data but aging_input_data: puf
recs = Records(
Expand Down Expand Up @@ -548,8 +550,17 @@ def write_output_file(self, output_dump, dump_varset,
outdf = self.minimal_output()
column_order = outdf.columns
assert len(outdf.index) == self.calc.array_len
outdf.to_csv(self._output_filename, columns=column_order,
index=False, float_format='%.2f')
if self.tmd_input_data: # pragma: no cover
if "s006" in outdf:
weights = outdf["s006"].round(5)
outdf = outdf.round(2)
if "s006" in outdf:
outdf["s006"] = weights
outdf.to_csv(self._output_filename, columns=column_order,
index=False)
else:
outdf.to_csv(self._output_filename, columns=column_order,
index=False, float_format='%.2f')
del outdf
gc.collect()

Expand Down Expand Up @@ -786,8 +797,8 @@ def dump_output(self, calcx, dump_varset, mtr_inctax, mtr_paytax):
vardata = calcx.array(varname)
if varname in recs_vinfo.INTEGER_VARS:
odf[varname] = vardata
else:
odf[varname] = vardata.round(2) # rounded to nearest cent
else: # specify precision that can handle small TMD area weights
odf[varname] = vardata.round(5)
odf = odf.copy()
# specify mtr values in percentage terms
if 'mtr_inctax' in varset:
Expand Down
6 changes: 3 additions & 3 deletions taxcalc/tests/test_benefits.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,9 @@ def test_benefits(tests_path, cps_fullsample):
if diffs:
msg = 'CPS BENEFITS RESULTS DIFFER\n'
msg += '-------------------------------------------------\n'
msg += '--- NEW RESULTS IN benefits_actual.txt FILE ---\n'
msg += '--- if new OK, copy benefits_actual.txt to ---\n'
msg += '--- benefits_expect.txt ---\n'
msg += '--- NEW RESULTS IN benefits_actual.csv FILE ---\n'
msg += '--- if new OK, copy benefits_actual.csv to ---\n'
msg += '--- benefits_expect.csv ---\n'
msg += '--- and rerun test. ---\n'
msg += '-------------------------------------------------\n'
raise ValueError(msg)
Expand Down

0 comments on commit a559002

Please sign in to comment.