Merge pull request #2828 from martinholmer/tmd-weights-scaling

Add weights_scale attribute to the Records and Data classes
PSLmodels · Oct 27, 2024 · a559002 · a559002
2 parents cfca1d5 + 163d040
commit a559002
Show file tree

Hide file tree

Showing 6 changed files with 54 additions and 19 deletions.
diff --git a/taxcalc.egg-info/PKG-INFO b/taxcalc.egg-info/PKG-INFO
@@ -18,6 +18,13 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: setuptools
+Requires-Dist: numpy
+Requires-Dist: pandas
+Requires-Dist: bokeh
+Requires-Dist: numba
+Requires-Dist: requests
+Requires-Dist: paramtools>=0.18.3
 
 | | |
 | --- | --- |

diff --git a/taxcalc/__init__.py b/taxcalc/__init__.py
@@ -14,6 +14,6 @@
 from taxcalc.utils import *
 from taxcalc.cli import *
 
-__version__ = '4.3.0'
+__version__ = '4.3.0e'
 __min_python3_version__ = 10
 __max_python3_version__ = 12
diff --git a/taxcalc/data.py b/taxcalc/data.py
@@ -42,6 +42,13 @@ class Data():
         NOTE: when using custom weights, set this argument to a DataFrame.
         NOTE: see weights_scale documentation below.
 
+    weights_scale: float
+        specifies the weights scaling factor used to convert contents
+        of weights file into the s006 variable.  PUF and CPS input data
+        generated in the taxdata repository use a weights_scale of 0.01,
+        while TMD input data generated in the tax-microdata repository
+        use a 1.0 weights_scale value.
+
     Raises
     ------
     ValueError:
@@ -66,7 +73,8 @@ class instance: Data
     VARINFO_FILE_NAME = None
     VARINFO_FILE_PATH = None
 
-    def __init__(self, data, start_year, gfactors=None, weights=None):
+    def __init__(self, data, start_year, gfactors=None,
+                 weights=None, weights_scale=0.01):
         # initialize data variable info sets and read variable information
         self.INTEGER_READ_VARS = set()
         self.MUST_READ_VARS = set()
@@ -97,6 +105,7 @@ def __init__(self, data, start_year, gfactors=None, weights=None):
             self.gfactors = gfactors
             # read sample weights
             self.WT = None
+            self.weights_scale = weights_scale
             if self.__aging_data:
                 self._read_weights(weights)
                 # ... weights must be same size as data
@@ -114,7 +123,7 @@ def __init__(self, data, start_year, gfactors=None, weights=None):
                 assert wt_colname in self.WT.columns, (
                     f'no weights for start year {self.current_year}'
                 )
-                self.s006 = self.WT[wt_colname] * 0.01
+                self.s006 = self.WT[wt_colname] * self.weights_scale
 
     @property
     def data_year(self):
@@ -152,7 +161,7 @@ def increment_year(self):
             assert wt_colname in self.WT.columns, (
                 f'no weights for new year {self.current_year}'
             )
-            self.s006 = self.WT[wt_colname] * 0.01
+            self.s006 = self.WT[wt_colname] * self.weights_scale
 
     # ----- begin private methods of Data class -----
 
@@ -260,7 +269,6 @@ def _read_weights(self, weights):
         Read sample weights from file or
         use specified DataFrame as weights or
         create empty DataFrame if None.
-        NOTE: assumes weights are integers equal to 100 times the real weight.
         """
         if weights is None:
             return
@@ -276,7 +284,7 @@ def _read_weights(self, weights):
             msg = 'weights is not None or a string or a Pandas DataFrame'
             raise ValueError(msg)
         assert isinstance(WT, pd.DataFrame)
-        setattr(self, 'WT', WT.astype(np.int32))
+        setattr(self, 'WT', WT.astype(np.float64))
         del WT
 
     def _extrapolate(self, year):

diff --git a/taxcalc/records.py b/taxcalc/records.py
@@ -53,7 +53,7 @@ class Records(Data):
         None creates empty sample-weights DataFrame;
         default value is filename of the PUF weights.
         NOTE: when using custom weights, set this argument to a DataFrame.
-        NOTE: assumes weights are integers that are 100 times the real weights.
+        NOTE: see weights_scale documentation below.
 
     adjust_ratios: string or Pandas DataFrame or None
         string describes CSV file in which adjustment ratios reside;
@@ -69,6 +69,13 @@ class Records(Data):
         any smoothing of stair-step provisions in income tax law;
         default value is false.
 
+    weights_scale: float
+        specifies the weights scaling factor used to convert contents
+        of weights file into the s006 variable.  PUF and CPS input data
+        generated in the taxdata repository use a weights_scale of 0.01,
+        while TMD input data generated in the tax-microdata repository
+        use a 1.0 weights_scale value.
+
     Raises
     ------
     ValueError:
@@ -127,11 +134,12 @@ def __init__(self,
                  gfactors=GrowFactors(),
                  weights=PUF_WEIGHTS_FILENAME,
                  adjust_ratios=PUF_RATIOS_FILENAME,
-                 exact_calculations=False):
+                 exact_calculations=False,
+                 weights_scale=0.01):
         # pylint: disable=no-member,too-many-branches
         if isinstance(weights, str):
             weights = os.path.join(Records.CODE_PATH, weights)
-        super().__init__(data, start_year, gfactors, weights)
+        super().__init__(data, start_year, gfactors, weights, weights_scale)
         if data is None:
             return  # because there are no data
         # read adjustment ratios
@@ -228,7 +236,7 @@ def tmd_constructor(
             data_path: Path,
             weights_path: Path,
             growfactors_path: Path,
-            exact_calculations=False
+            exact_calculations=False,
     ):  # pragma: no cover
         """
         Static method returns a Records object instantiated with TMD
@@ -250,6 +258,7 @@ def tmd_constructor(
             gfactors=GrowFactors(growfactors_filename=str(growfactors_path)),
             adjust_ratios=None,
             exact_calculations=exact_calculations,
+            weights_scale=1.0,
         )
 
     def increment_year(self):

diff --git a/taxcalc/taxcalcio.py b/taxcalc/taxcalcio.py
@@ -355,15 +355,17 @@ def init(self, input_data, tax_year, baseline, reform, assump,
                     weights=wghts,
                     gfactors=gfactors_ref,
                     adjust_ratios=None,
-                    exact_calculations=exact_calculations
+                    exact_calculations=exact_calculations,
+                    weights_scale=1.0,
                 )
                 recs_base = Records(
                     data=pd.read_csv(input_data),
                     start_year=Records.TMDCSV_YEAR,
                     weights=wghts,
                     gfactors=gfactors_base,
                     adjust_ratios=None,
-                    exact_calculations=exact_calculations
+                    exact_calculations=exact_calculations,
+                    weights_scale=1.0,
                 )
             else:  # if not {cps|tmd}_input_data but aging_input_data: puf
                 recs = Records(
@@ -548,8 +550,17 @@ def write_output_file(self, output_dump, dump_varset,
             outdf = self.minimal_output()
             column_order = outdf.columns
         assert len(outdf.index) == self.calc.array_len
-        outdf.to_csv(self._output_filename, columns=column_order,
-                     index=False, float_format='%.2f')
+        if self.tmd_input_data:  # pragma: no cover
+            if "s006" in outdf:
+                weights = outdf["s006"].round(5)
+            outdf = outdf.round(2)
+            if "s006" in outdf:
+                outdf["s006"] = weights
+            outdf.to_csv(self._output_filename, columns=column_order,
+                         index=False)
+        else:
+            outdf.to_csv(self._output_filename, columns=column_order,
+                         index=False, float_format='%.2f')
         del outdf
         gc.collect()
 
@@ -786,8 +797,8 @@ def dump_output(self, calcx, dump_varset, mtr_inctax, mtr_paytax):
             vardata = calcx.array(varname)
             if varname in recs_vinfo.INTEGER_VARS:
                 odf[varname] = vardata
-            else:
-                odf[varname] = vardata.round(2)  # rounded to nearest cent
+            else:  # specify precision that can handle small TMD area weights
+                odf[varname] = vardata.round(5)
             odf = odf.copy()
         # specify mtr values in percentage terms
         if 'mtr_inctax' in varset:

diff --git a/taxcalc/tests/test_benefits.py b/taxcalc/tests/test_benefits.py
@@ -77,9 +77,9 @@ def test_benefits(tests_path, cps_fullsample):
     if diffs:
         msg = 'CPS BENEFITS RESULTS DIFFER\n'
         msg += '-------------------------------------------------\n'
-        msg += '--- NEW RESULTS IN benefits_actual.txt FILE   ---\n'
-        msg += '--- if new OK, copy benefits_actual.txt to    ---\n'
-        msg += '---                 benefits_expect.txt       ---\n'
+        msg += '--- NEW RESULTS IN benefits_actual.csv FILE   ---\n'
+        msg += '--- if new OK, copy benefits_actual.csv to    ---\n'
+        msg += '---                 benefits_expect.csv       ---\n'
         msg += '---            and rerun test.                ---\n'
         msg += '-------------------------------------------------\n'
         raise ValueError(msg)