Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(ingest): consistent fingerprint for sql parsing aggregator #12239

Merged
merged 3 commits into from
Jan 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
ColumnRef,
DownstreamColumnRef,
)
from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList
from datahub.utilities.perf_timer import PerfTimer

Expand Down Expand Up @@ -475,10 +476,11 @@ def _parse_audit_log_row(

entry = PreparsedQuery(
# Despite having Snowflake's fingerprints available, our own fingerprinting logic does a better
# job at eliminating redundant / repetitive queries. As such, we don't include the fingerprint
# here so that the aggregator auto-generates one.
# query_id=res["query_fingerprint"],
query_id=None,
# job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
# here
query_id=get_query_fingerprint(
res["query_text"], self.identifiers.platform, fast=True
),
query_text=res["query_text"],
upstreams=upstreams,
downstream=downstream,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,20 @@ def default_user_urn_builder(email: str) -> str:
return builder.make_user_urn(email.split("@")[0])


def extract_user_email(user: str) -> Optional[str]:
"""Extracts user email from user input

>>> extract_user_email('urn:li:corpuser:[email protected]')
'[email protected]'
>>> extract_user_email('urn:li:corpuser:abc')
>>> extract_user_email('[email protected]')
'[email protected]'
"""
if user.startswith(("urn:li:corpuser:", "urn:li:corpGroup:")):
user = user.split(":")[-1]
return user if "@" in user else None


def make_usage_workunit(
bucket_start_time: datetime,
resource: ResourceType,
Expand Down Expand Up @@ -104,7 +118,7 @@ def make_usage_workunit(
DatasetUserUsageCountsClass(
user=user_urn_builder(user),
count=count,
userEmail=user if "@" in user else None,
userEmail=extract_user_email(user),
)
for user, count in user_freq
],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def id(self) -> str:

@dataclasses.dataclass
class PreparsedQuery:
# If not provided, we will generate one using the fast fingerprint generator.
# If not provided, we will generate one using the fingerprint generator.
query_id: Optional[QueryId]

query_text: str
Expand Down Expand Up @@ -622,7 +622,6 @@ def add_known_query_lineage(
query_fingerprint = get_query_fingerprint(
known_query_lineage.query_text,
platform=self.platform.platform_name,
fast=True,
)
formatted_query = self._maybe_format_query(known_query_lineage.query_text)

Expand Down Expand Up @@ -848,7 +847,6 @@ def add_preparsed_query(
query_fingerprint = get_query_fingerprint(
parsed.query_text,
platform=self.platform.platform_name,
fast=True,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we'll need to revisit this - we want fast fingerprinting for snowflake, and slow parsing-based for all other platforms

)

# Format the query.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)",
"type": "TRANSFORMED",
"query": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f"
"query": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae"
}
],
"fineGrainedLineages": [
Expand All @@ -32,7 +32,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)"
],
"confidenceScore": 1.0,
"query": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f"
"query": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae"
},
{
"upstreamType": "FIELD_SET",
Expand All @@ -44,7 +44,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)"
],
"confidenceScore": 1.0,
"query": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f"
"query": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae"
},
{
"upstreamType": "FIELD_SET",
Expand All @@ -56,15 +56,15 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),c)"
],
"confidenceScore": 1.0,
"query": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f"
"query": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae"
}
]
}
}
},
{
"entityType": "query",
"entityUrn": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f",
"entityUrn": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
Expand All @@ -87,7 +87,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f",
"entityUrn": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
Expand All @@ -114,7 +114,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f",
"entityUrn": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
Expand All @@ -137,7 +137,7 @@
},
"operationType": "INSERT",
"customProperties": {
"query_urn": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f"
"query_urn": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae"
},
"lastUpdatedTimestamp": 20000
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD)",
"type": "TRANSFORMED",
"query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
"query": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b"
}
],
"fineGrainedLineages": [
Expand All @@ -147,7 +147,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)"
],
"confidenceScore": 1.0,
"query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
"query": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b"
},
{
"upstreamType": "FIELD_SET",
Expand All @@ -159,7 +159,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)"
],
"confidenceScore": 1.0,
"query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
"query": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b"
},
{
"upstreamType": "FIELD_SET",
Expand All @@ -171,15 +171,15 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),c)"
],
"confidenceScore": 1.0,
"query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
"query": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b"
}
]
}
}
},
{
"entityType": "query",
"entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4",
"entityUrn": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
Expand All @@ -202,7 +202,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4",
"entityUrn": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
Expand All @@ -229,7 +229,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4",
"entityUrn": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_dep,PROD)",
"type": "TRANSFORMED",
"query": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332"
"query": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e"
}
],
"fineGrainedLineages": [
Expand All @@ -147,7 +147,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)"
],
"confidenceScore": 0.2,
"query": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332"
"query": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e"
},
{
"upstreamType": "FIELD_SET",
Expand All @@ -159,15 +159,15 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)"
],
"confidenceScore": 0.2,
"query": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332"
"query": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e"
}
]
}
}
},
{
"entityType": "query",
"entityUrn": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332",
"entityUrn": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
Expand All @@ -190,7 +190,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332",
"entityUrn": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
Expand All @@ -217,7 +217,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332",
"entityUrn": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
Expand Down
Loading
Loading