Skip to content

Commit

Permalink
feat(ingest): improve query fingerprinting (datahub-project#12104)
Browse files Browse the repository at this point in the history
  • Loading branch information
hsheth2 authored and sleeperdeep committed Dec 17, 2024
1 parent 50bdd16 commit a673c15
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1383,8 +1383,7 @@ def _query_urn(cls, query_id: QueryId) -> str:
return QueryUrn(query_id).urn()

@classmethod
def _composite_query_id(cls, composed_of_queries: Iterable[QueryId]) -> str:
composed_of_queries = list(composed_of_queries)
def _composite_query_id(cls, composed_of_queries: List[QueryId]) -> str:
combined = json.dumps(composed_of_queries)
return f"composite_{generate_hash(combined)}"

Expand Down
10 changes: 8 additions & 2 deletions metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def _expression_to_string(
# Remove /* */ comments.
re.compile(r"/\*.*?\*/", re.DOTALL): "",
# Remove -- comments.
re.compile(r"--.*$"): "",
re.compile(r"--.*$", re.MULTILINE): "",
# Replace all runs of whitespace with a single space.
re.compile(r"\s+"): " ",
# Remove leading and trailing whitespace and trailing semicolons.
Expand All @@ -131,10 +131,16 @@ def _expression_to_string(
# Replace anything that looks like a string with a placeholder.
re.compile(r"'[^']*'"): "?",
# Replace sequences of IN/VALUES with a single placeholder.
re.compile(r"\b(IN|VALUES)\s*\(\?(?:, \?)*\)", re.IGNORECASE): r"\1 (?)",
# The r" ?" makes it more robust to uneven spacing.
re.compile(r"\b(IN|VALUES)\s*\( ?\?(?:, ?\?)* ?\)", re.IGNORECASE): r"\1 (?)",
# Normalize parenthesis spacing.
re.compile(r"\( "): "(",
re.compile(r" \)"): ")",
# Fix up spaces before commas in column lists.
# e.g. "col1 , col2" -> "col1, col2"
# e.g. "col1,col2" -> "col1, col2"
re.compile(r"\b ,"): ",",
re.compile(r"\b,\b"): ", ",
}
_TABLE_NAME_NORMALIZATION_RULES = {
# Replace UUID-like strings with a placeholder (both - and _ variants).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD)",
"type": "TRANSFORMED",
"query": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c"
"query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
}
],
"fineGrainedLineages": [
Expand All @@ -147,7 +147,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)"
],
"confidenceScore": 1.0,
"query": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c"
"query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
},
{
"upstreamType": "FIELD_SET",
Expand All @@ -159,7 +159,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)"
],
"confidenceScore": 1.0,
"query": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c"
"query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
},
{
"upstreamType": "FIELD_SET",
Expand All @@ -171,15 +171,15 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),c)"
],
"confidenceScore": 1.0,
"query": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c"
"query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
}
]
}
}
},
{
"entityType": "query",
"entityUrn": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c",
"entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
Expand All @@ -202,7 +202,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c",
"entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
Expand All @@ -229,7 +229,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c",
"entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_swap,PROD)",
"type": "TRANSFORMED",
"query": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500"
"query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405"
}
],
"fineGrainedLineages": [
Expand All @@ -147,7 +147,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),a)"
],
"confidenceScore": 1.0,
"query": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500"
"query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405"
},
{
"upstreamType": "FIELD_SET",
Expand All @@ -159,7 +159,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),b)"
],
"confidenceScore": 1.0,
"query": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500"
"query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405"
},
{
"upstreamType": "FIELD_SET",
Expand All @@ -171,15 +171,15 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),c)"
],
"confidenceScore": 1.0,
"query": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500"
"query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405"
}
]
}
}
},
{
"entityType": "query",
"entityUrn": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500",
"entityUrn": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
Expand All @@ -202,7 +202,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500",
"entityUrn": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
Expand All @@ -229,7 +229,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500",
"entityUrn": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
Expand Down Expand Up @@ -411,7 +411,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)",
"type": "TRANSFORMED",
"query": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68"
"query": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904"
},
{
"auditStamp": {
Expand All @@ -432,7 +432,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68",
"entityUrn": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
Expand All @@ -455,7 +455,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68",
"entityUrn": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
Expand All @@ -473,7 +473,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68",
"entityUrn": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)",
"type": "TRANSFORMED",
"query": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce"
"query": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3"
},
{
"auditStamp": {
Expand All @@ -146,7 +146,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_dep,PROD)",
"type": "TRANSFORMED",
"query": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce"
"query": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3"
}
],
"fineGrainedLineages": [
Expand All @@ -161,15 +161,15 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),a)"
],
"confidenceScore": 1.0,
"query": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce"
"query": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3"
}
]
}
}
},
{
"entityType": "query",
"entityUrn": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce",
"entityUrn": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
Expand All @@ -192,7 +192,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce",
"entityUrn": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
Expand All @@ -219,7 +219,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce",
"entityUrn": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
Expand Down
21 changes: 21 additions & 0 deletions metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ class QueryGeneralizationTestMode(Enum):
"SELECT * FROM foo",
QueryGeneralizationTestMode.BOTH,
),
(
"SELECT a\n -- comment--\n,b --another comment\n FROM books",
"redshift",
"SELECT a, b FROM books",
QueryGeneralizationTestMode.BOTH,
),
# Parameter normalization.
(
"UPDATE \"books\" SET page_count = page_count + 1, author_count = author_count + 1 WHERE book_title = 'My New Book'",
Expand Down Expand Up @@ -105,6 +111,21 @@ class QueryGeneralizationTestMode(Enum):
"INSERT INTO MyTable (Column1, Column2, Column3) VALUES (?)",
QueryGeneralizationTestMode.BOTH,
),
(
# Uneven spacing within the IN clause.
"SELECT * FROM books WHERE zip_code IN (123,345, 423 )",
"redshift",
"SELECT * FROM books WHERE zip_code IN (?)",
QueryGeneralizationTestMode.BOTH,
),
# Uneven spacing in the column list.
# This isn't perfect e.g. we still have issues with function calls inside selects.
(
"SELECT a\n ,b FROM books",
"redshift",
"SELECT a, b FROM books",
QueryGeneralizationTestMode.BOTH,
),
(
textwrap.dedent(
"""\
Expand Down

0 comments on commit a673c15

Please sign in to comment.