Skip to content

Commit

Permalink
epsilon merge tool: lost column support
Browse files Browse the repository at this point in the history
when a column is lost between base and extension
(i.e. it appears in base, but not in extension),
forward-fill that column into the extension
data set, using the last value from the base.

this is to accommodate the loss of the column
for AGS 16056 in the RKI data set around
October 2021 (unclear what happened exactly).
  • Loading branch information
jgehrcke committed Oct 20, 2021
1 parent d37574c commit a49b2ac
Showing 1 changed file with 16 additions and 2 deletions.
18 changes: 16 additions & 2 deletions tools/csv-epsilon-merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,15 @@ def main():
# Using `base` as the base of this comparison means that `base` (old) data
# will appear as `self` in the output, and `ext` (new) data will appear as
# `other`.
log.info("df_overlap_base: %s", df_overlap_base)
log.info("df_overlap_ext: %s", df_overlap_ext)
log.info("df_overlap_base:\n%s", df_overlap_base)
log.info("df_overlap_ext:\n%s", df_overlap_ext)

log.info('sort columns in both overlap DFs by column name to make compare() work')
df_overlap_base = df_overlap_base.sort_index(axis=1)
df_overlap_ext = df_overlap_ext.sort_index(axis=1)

log.info("df_overlap_base:\n%s", df_overlap_base)
log.info("df_overlap_ext:\n%s", df_overlap_ext)

try:
df_diff = df_overlap_base.compare(df_overlap_ext)
Expand Down Expand Up @@ -203,6 +210,13 @@ def parse_files_and_check_sanity(args):
log.info("ext shape: %s", df_ext.shape)
log.info("df_ext:\n%s", df_ext)

log.info('look for base columns that are not part of extension columns')
for c in df_base.columns:
if c not in df_ext.columns:
lv = df_base[c].iloc[-1]
log.info(f'base column `{c}` not in extension column set, add (forward-fill last value: {lv})')
df_ext[c] = lv

columns_diff = set(df_base.columns) - set(df_ext.columns)
if columns_diff:
log.error("these columns do not appear in both: %s", columns_diff)
Expand Down

0 comments on commit a49b2ac

Please sign in to comment.