Skip to content

Commit

Permalink
dedupe full dup rows and only do date imputation once
Browse files: browse the repository at this point in the history
  • Loading branch information
Laurie Merrell committed Sep 29, 2023
1 parent c573b9e commit dfde9ec
Showing 1 changed file with 12 additions and 9 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -32,8 +32,15 @@ clean_columns_and_dedupe_files AS (
ELSE {{ extract_littlepay_filename_date() }}
END AS littlepay_export_date,
ts,
-- Hash every column that comes from the source (i.e., not generated by us) so that fully duplicated rows can be deduplicated.
-- Hashing at this step preserves the distinction between nulls and empty strings, in case that distinction is meaningful upstream.
{{ dbt_utils.generate_surrogate_key(['participant_id',
'aggregation_id', 'acquirer_id', 'request_type', 'transaction_amount', 'currency_code',
'retrieval_reference_number', 'littlepay_reference_number', 'external_reference_number',
'response_code', 'status', 'authorisation_date_time_utc']) }} AS content_hash,
FROM source
-- Remove duplicate instances of the same file (a file is defined as a date-level update from Littlepay).
-- Use DENSE_RANK instead of ROW_NUMBER because all rows from a given file must be kept (ties are allowed).
QUALIFY DENSE_RANK()
OVER (PARTITION BY littlepay_export_date ORDER BY littlepay_export_ts DESC, ts DESC) = 1
),
Expand All @@ -55,19 +62,15 @@ stg_littlepay__authorisations AS (
_line_number,
`instance`,
extract_filename,


littlepay_export_ts,

CASE
WHEN extract_filename = "24jan_datafeed.psv" THEN DATE '2023-01-24'
WHEN extract_filename = "25jan_datafeed.psv" THEN DATE '2023-01-25'
ELSE littlepay_export_date
END AS littlepay_export_date,
littlepay_export_date,
ts,
{{ dbt_utils.generate_surrogate_key(['littlepay_export_date', '_line_number', 'instance']) }} AS _key,
{{ dbt_utils.generate_surrogate_key(['aggregation_id', 'retrieval_reference_number', 'authorisation_date_time_utc']) }} AS _payments_key,
{{ dbt_utils.generate_surrogate_key(['aggregation_id', 'authorisation_date_time_utc']) }} AS _payments_key,
FROM clean_columns_and_dedupe_files
-- remove full duplicate rows
QUALIFY ROW_NUMBER()
OVER (PARTITION BY content_hash ORDER BY littlepay_export_ts DESC, _line_number ASC) = 1
)

SELECT * FROM stg_littlepay__authorisations

0 comments on commit dfde9ec

Please sign in to comment.