From 8c724dbf47dd76a4aefec0a93267e08ddeda7e58 Mon Sep 17 00:00:00 2001
From: david-leifker <114954101+david-leifker@users.noreply.github.com>
Date: Wed, 18 Dec 2024 12:45:38 -0600
Subject: [PATCH 01/35] feat(api): authorization extended for soft-delete and
 suspend (#12158)

---
 datahub-frontend/app/auth/AuthModule.java | 2 +
 .../upgrade/config/SystemUpdateConfig.java | 2 +
 .../restorebackup/RestoreStorageStep.java | 2 +-
 .../upgrade/system/AbstractMCLStep.java | 3 +-
 .../bootstrapmcps/BootstrapMCPUtil.java | 4 +-
 ...ateSchemaFieldsFromSchemaMetadataStep.java | 10 +-
 ...chemaFieldsFromSchemaMetadataStepTest.java | 3 +-
 .../aspect/CachingAspectRetriever.java | 36 +++-
 .../metadata/aspect/GraphRetriever.java | 23 +++
 .../metadata/entity/SearchRetriever.java | 19 ++
 .../metadata/aspect/MockAspectRetriever.java | 4 +-
 .../java/com/linkedin/metadata/Constants.java | 2 +
 .../ebean/batch/AspectsBatchImplTest.java | 8 +-
 .../aspect/utils/DefaultAspectsUtil.java | 2 +-
 .../client/EntityClientAspectRetriever.java | 7 +-
 .../metadata/client/JavaEntityClient.java | 21 ++-
 .../client/SystemJavaEntityClient.java | 2 +-
 .../entity/EntityServiceAspectRetriever.java | 10 +-
 .../metadata/entity/EntityServiceImpl.java | 67 +++----
 .../linkedin/metadata/entity/EntityUtils.java | 2 +-
 .../cassandra/CassandraRetentionService.java | 2 +-
 .../entity/ebean/EbeanRetentionService.java | 2 +-
 .../query/filter/BaseQueryFilterRewriter.java | 2 +-
 .../SearchDocumentTransformer.java | 2 -
 .../BusinessAttributeUpdateHookService.java | 4 +-
 .../service/UpdateGraphIndicesService.java | 3 +-
 .../service/UpdateIndicesService.java | 5 +-
 .../metadata/AspectIngestionUtils.java | 12 +-
 .../hooks/IgnoreUnknownMutatorTest.java | 12 +-
 .../aspect/utils/DefaultAspectsUtilTest.java | 3 +-
 .../DataProductUnsetSideEffectTest.java | 8 +-
 .../entity/EbeanEntityServiceTest.java | 36 ++--
 .../metadata/entity/EntityServiceTest.java | 118 ++++++------
 .../cassandra/CassandraEntityServiceTest.java | 11 +-
 .../ebean/batch/ChangeItemImplTest.java | 4 +-
 .../RecommendationsServiceTest.java | 3 +-
 .../SchemaFieldSideEffectTest.java | 12 +-
 .../ContainerExpansionRewriterTest.java | 5 +-
 .../filter/DomainExpansionRewriterTest.java | 9 +-
 .../request/AggregationQueryBuilderTest.java | 9 +-
 .../request/SearchRequestHandlerTest.java | 1 +
 .../SearchDocumentTransformerTest.java | 12 ++
 ...ropertyDefinitionDeleteSideEffectTest.java | 12 +-
 .../ShowPropertyAsBadgeValidatorTest.java | 2 +-
 .../io/datahubproject/test/DataGenerator.java | 5 +-
 .../MCLSpringCommonTestConfiguration.java | 3 +-
 .../hook/BusinessAttributeUpdateHookTest.java | 16 +-
 .../metadata/context/ActorContext.java | 48 +++++
 .../metadata/context/OperationContext.java | 123 ++++++++-----
 .../metadata/context/RetrieverContext.java | 29 +++
 .../exception/ActorAccessException.java | 7 +
 .../exception/OperationContextException.java | 9 +
 .../context/TestOperationContexts.java | 139 ++++++--------
 .../context/OperationContextTest.java | 3 +-
 .../token/StatefulTokenService.java | 2 +-
 .../src/main/resources/application.yaml | 6 +-
 .../SystemOperationContextFactory.java | 14 +-
 .../IngestDataPlatformInstancesStep.java | 4 +-
 .../boot/steps/IngestPoliciesStep.java | 2 +-
 .../GlobalControllerExceptionHandler.java | 14 +-
 .../controller/GenericEntitiesController.java | 8 +-
 .../openapi/operations/test/IdController.java | 54 ++++++
 .../openapi/util/MappingUtil.java | 2 +-
 .../v2/controller/EntityController.java | 4 +-
 .../v3/controller/EntityController.java | 4 +-
 ...m.linkedin.entity.entitiesV2.restspec.json | 8 +
 ...m.linkedin.entity.entitiesV2.snapshot.json | 8 +
 .../linkedin/entity/client/EntityClient.java | 71 ++++++-
 .../entity/client/RestliEntityClient.java | 13 +-
 .../client/SystemRestliEntityClient.java | 2 +-
 .../resources/entity/AspectResource.java | 2 +-
 .../resources/entity/EntityV2Resource.java | 10 +-
 .../resources/restli/RestliConstants.java | 3 +
 .../resources/restli/RestliUtils.java | 8 +
 .../resources/entity/AspectResourceTest.java | 2 +-
 .../tokens/revokable_access_token_test.py | 44 +----
 .../tests/tokens/session_access_token_test.py | 173 ++++++++++++++++++
 smoke-test/tests/tokens/token_utils.py | 53 ++++++
 78 files changed, 980 insertions(+), 431 deletions(-)
 create mode 100644 metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java
 create mode 100644 metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java
 rename metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/{ => config}/GlobalControllerExceptionHandler.java (81%)
 create mode 100644 metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/test/IdController.java
 create mode 100644 smoke-test/tests/tokens/session_access_token_test.py
 create mode 100644 smoke-test/tests/tokens/token_utils.py

diff --git a/datahub-frontend/app/auth/AuthModule.java b/datahub-frontend/app/auth/AuthModule.java
index 7fa99ab3cb2621..b95515684f01fc 100644
--- a/datahub-frontend/app/auth/AuthModule.java
+++ b/datahub-frontend/app/auth/AuthModule.java
@@ -27,6 +27,7 @@
 import io.datahubproject.metadata.context.EntityRegistryContext;
 import io.datahubproject.metadata.context.OperationContext;
 import io.datahubproject.metadata.context.OperationContextConfig;
+import io.datahubproject.metadata.context.RetrieverContext;
 import io.datahubproject.metadata.context.SearchContext;
 import io.datahubproject.metadata.context.ValidationContext;
 import java.nio.charset.StandardCharsets;
@@ -195,6 +196,7 @@ protected OperationContext provideOperationContext(
         .searchContext(SearchContext.EMPTY)
         .entityRegistryContext(EntityRegistryContext.builder().build(EmptyEntityRegistry.EMPTY))
         .validationContext(ValidationContext.builder().alternateValidation(false).build())
+        .retrieverContext(RetrieverContext.EMPTY)
         .build(systemAuthentication);
   }

diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java
index 661717c6309cfc..fdd84da6044f73 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java
@@ -13,6 +13,7 @@
 import com.linkedin.gms.factory.kafka.common.TopicConventionFactory;
 import com.linkedin.gms.factory.kafka.schemaregistry.InternalSchemaRegistryFactory;
 import com.linkedin.gms.factory.search.BaseElasticSearchComponentsFactory;
+import com.linkedin.metadata.aspect.CachingAspectRetriever;
 import com.linkedin.metadata.config.kafka.KafkaConfiguration;
 import com.linkedin.metadata.dao.producer.KafkaEventProducer;
 import com.linkedin.metadata.dao.producer.KafkaHealthChecker;
@@ -186,6 +187,7 @@ protected OperationContext javaSystemOperationContext(
         components.getIndexConvention(),
         RetrieverContext.builder()
             .aspectRetriever(entityServiceAspectRetriever)
+
.cachingAspectRetriever(CachingAspectRetriever.EMPTY) .graphRetriever(systemGraphRetriever) .searchRetriever(searchServiceSearchRetriever) .build(), diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java index 4d53b603c1eaff..1e5cd6cdb24174 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java @@ -180,7 +180,7 @@ private void readerExecutable(ReaderWrapper reader, UpgradeContext context) { try { aspectRecord = EntityUtils.toSystemAspect( - context.opContext().getRetrieverContext().get(), aspect.toEntityAspect()) + context.opContext().getRetrieverContext(), aspect.toEntityAspect()) .get() .getRecordTemplate(); } catch (Exception e) { diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java index cd7947ce3c11aa..56feffd211bcd7 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java @@ -113,8 +113,7 @@ public Function executable() { List, SystemAspect>> futures; futures = EntityUtils.toSystemAspectFromEbeanAspects( - opContext.getRetrieverContext().get(), - batch.collect(Collectors.toList())) + opContext.getRetrieverContext(), batch.collect(Collectors.toList())) .stream() .map( systemAspect -> { diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java index 4cc3edff3eb52d..5b807c6c450afb 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java @@ -100,8 +100,8 @@ static AspectsBatch generateAspectBatch( .collect(Collectors.toList()); return AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, opContext.getRetrieverContext().get()) - .retrieverContext(opContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, opContext.getRetrieverContext()) + .retrieverContext(opContext.getRetrieverContext()) .build(); } diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java index 55bc8edbf6a768..de03538907432f 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java @@ -168,13 +168,13 @@ public Function executable() { AspectsBatch aspectsBatch = AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items( batch .flatMap( ebeanAspectV2 -> EntityUtils.toSystemAspectFromEbeanAspects( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), Set.of(ebeanAspectV2)) .stream()) .map( @@ -189,11 
+189,7 @@ public Function executable() { .auditStamp(systemAspect.getAuditStamp()) .systemMetadata( withAppSource(systemAspect.getSystemMetadata())) - .build( - opContext - .getRetrieverContext() - .get() - .getAspectRetriever())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList())) .build(); diff --git a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java index 3a2728b4e1d3d6..04b1095e770e0e 100644 --- a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java +++ b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java @@ -22,7 +22,6 @@ import com.linkedin.upgrade.DataHubUpgradeState; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.RetrieverContext; -import java.util.Optional; import java.util.stream.Stream; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -48,7 +47,7 @@ public void setup() { step = new GenerateSchemaFieldsFromSchemaMetadataStep( mockOpContext, mockEntityService, mockAspectDao, 10, 100, 1000); - when(mockOpContext.getRetrieverContext()).thenReturn(Optional.of(mockRetrieverContext)); + when(mockOpContext.getRetrieverContext()).thenReturn(mockRetrieverContext); } /** Test to verify the correct step ID is returned. */ diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java index 77e799f752455c..375dd8cf8911e1 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java @@ -1,4 +1,38 @@ package com.linkedin.metadata.aspect; +import com.linkedin.common.urn.Urn; +import com.linkedin.entity.Aspect; +import com.linkedin.metadata.models.registry.EmptyEntityRegistry; +import com.linkedin.metadata.models.registry.EntityRegistry; +import java.util.Collections; +import java.util.Map; +import java.util.Set; +import javax.annotation.Nonnull; + /** Responses can be cached based on application.yaml caching configuration for the EntityClient */ -public interface CachingAspectRetriever extends AspectRetriever {} +public interface CachingAspectRetriever extends AspectRetriever { + + CachingAspectRetriever EMPTY = new EmptyAspectRetriever(); + + class EmptyAspectRetriever implements CachingAspectRetriever { + @Nonnull + @Override + public Map> getLatestAspectObjects( + Set urns, Set aspectNames) { + return Collections.emptyMap(); + } + + @Nonnull + @Override + public Map> getLatestSystemAspects( + Map> urnAspectNames) { + return Collections.emptyMap(); + } + + @Nonnull + @Override + public EntityRegistry getEntityRegistry() { + return EmptyEntityRegistry.EMPTY; + } + } +} diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java index f6858e7da4ba63..30a2c1eb9df8c1 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java @@ -4,6 +4,7 @@ import 
com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.RelationshipFilter; import com.linkedin.metadata.query.filter.SortCriterion; +import java.util.Collections; import java.util.List; import java.util.function.Function; import javax.annotation.Nonnull; @@ -97,4 +98,26 @@ default void consumeRelatedEntities( } } } + + GraphRetriever EMPTY = new EmptyGraphRetriever(); + + class EmptyGraphRetriever implements GraphRetriever { + + @Nonnull + @Override + public RelatedEntitiesScrollResult scrollRelatedEntities( + @Nullable List sourceTypes, + @Nonnull Filter sourceEntityFilter, + @Nullable List destinationTypes, + @Nonnull Filter destinationEntityFilter, + @Nonnull List relationshipTypes, + @Nonnull RelationshipFilter relationshipFilter, + @Nonnull List sortCriterion, + @Nullable String scrollId, + int count, + @Nullable Long startTimeMillis, + @Nullable Long endTimeMillis) { + return new RelatedEntitiesScrollResult(0, 0, null, Collections.emptyList()); + } + } } diff --git a/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java b/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java index eaa106b8d1f638..d4894c97015f8f 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java @@ -2,6 +2,7 @@ import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.search.ScrollResult; +import com.linkedin.metadata.search.SearchEntityArray; import java.util.List; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -21,4 +22,22 @@ ScrollResult scroll( @Nullable Filter filters, @Nullable String scrollId, int count); + + SearchRetriever EMPTY = new EmptySearchRetriever(); + + class EmptySearchRetriever implements SearchRetriever { + + @Override + public ScrollResult scroll( + @Nonnull List entities, + @Nullable Filter filters, + @Nullable String scrollId, + int count) { + ScrollResult empty = new ScrollResult(); + empty.setEntities(new SearchEntityArray()); + empty.setNumEntities(0); + empty.setPageSize(0); + return empty; + } + } } diff --git a/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java b/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java index 65705f15022b6b..98a6d59004a92a 100644 --- a/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java +++ b/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java @@ -5,7 +5,7 @@ import com.linkedin.data.DataMap; import com.linkedin.data.template.RecordTemplate; import com.linkedin.entity.Aspect; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.mxe.SystemMetadata; @@ -22,7 +22,7 @@ import javax.annotation.Nonnull; import org.mockito.Mockito; -public class MockAspectRetriever implements AspectRetriever { +public class MockAspectRetriever implements CachingAspectRetriever { private final Map> data; private final Map> systemData = new HashMap<>(); diff --git a/li-utils/src/main/java/com/linkedin/metadata/Constants.java b/li-utils/src/main/java/com/linkedin/metadata/Constants.java index ff6a79108600a3..09f873ebf7bc96 100644 --- 
a/li-utils/src/main/java/com/linkedin/metadata/Constants.java +++ b/li-utils/src/main/java/com/linkedin/metadata/Constants.java @@ -409,6 +409,8 @@ public class Constants { /** User Status */ public static final String CORP_USER_STATUS_ACTIVE = "ACTIVE"; + public static final String CORP_USER_STATUS_SUSPENDED = "SUSPENDED"; + /** Task Runs */ public static final String DATA_PROCESS_INSTANCE_ENTITY_NAME = "dataProcessInstance"; diff --git a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java index 9f57d36f800de3..a3099b9ee21ea4 100644 --- a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java +++ b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java @@ -16,7 +16,7 @@ import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; import com.linkedin.dataset.DatasetProperties; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.patch.GenericJsonPatch; @@ -56,7 +56,7 @@ public class AspectsBatchImplTest { private EntityRegistry testRegistry; - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext retrieverContext; @BeforeTest @@ -75,12 +75,12 @@ public void beforeTest() throws EntityRegistryException { @BeforeMethod public void setup() { - this.mockAspectRetriever = mock(AspectRetriever.class); + this.mockAspectRetriever = mock(CachingAspectRetriever.class); when(this.mockAspectRetriever.getEntityRegistry()).thenReturn(testRegistry); this.retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever(mockAspectRetriever) .graphRetriever(mock(GraphRetriever.class)) .build(); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java b/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java index 99eadd223acd1a..82bc0ae1409c52 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java @@ -137,7 +137,7 @@ public static List getAdditionalChanges( getProposalFromAspectForDefault( entry.getKey(), entry.getValue(), entityKeyAspect, templateItem), templateItem.getAuditStamp(), - opContext.getAspectRetrieverOpt().get())) + opContext.getAspectRetriever())) .filter(Objects::nonNull); }) .collect(Collectors.toList()); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java b/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java index bba8324d0c5612..669ec751f87c69 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java @@ -35,7 +35,7 @@ public EntityRegistry getEntityRegistry() { @Override public Aspect getLatestAspectObject(@Nonnull Urn urn, @Nonnull String aspectName) { try { - return 
entityClient.getLatestAspectObject(systemOperationContext, urn, aspectName); + return entityClient.getLatestAspectObject(systemOperationContext, urn, aspectName, false); } catch (RemoteInvocationException | URISyntaxException e) { throw new RuntimeException(e); } @@ -49,7 +49,7 @@ public Map> getLatestAspectObjects( return Map.of(); } else { try { - return entityClient.getLatestAspects(systemOperationContext, urns, aspectNames); + return entityClient.getLatestAspects(systemOperationContext, urns, aspectNames, false); } catch (RemoteInvocationException | URISyntaxException e) { throw new RuntimeException(e); } @@ -70,7 +70,8 @@ public Map> getLatestSystemAspects( urnAspectNames.keySet(), urnAspectNames.values().stream() .flatMap(Collection::stream) - .collect(Collectors.toSet())); + .collect(Collectors.toSet()), + false); } catch (RemoteInvocationException | URISyntaxException e) { throw new RuntimeException(e); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java index 29faa3955ea662..3d35f5956b0f4f 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java @@ -106,11 +106,17 @@ public EntityResponse getV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Urn urn, - @Nullable final Set aspectNames) + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { final Set projectedAspects = aspectNames == null ? opContext.getEntityAspectNames(entityName) : aspectNames; - return entityService.getEntityV2(opContext, entityName, urn, projectedAspects); + return entityService.getEntityV2( + opContext, + entityName, + urn, + projectedAspects, + alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect); } @Override @@ -126,7 +132,8 @@ public Map batchGetV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull Set urns, - @Nullable Set aspectNames) + @Nullable Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { final Set projectedAspects = aspectNames == null ? 
opContext.getEntityAspectNames(entityName) : aspectNames; @@ -139,7 +146,11 @@ public Map batchGetV2( try { responseMap.putAll( entityService.getEntitiesV2( - opContext, entityName, new HashSet<>(batch), projectedAspects)); + opContext, + entityName, + new HashSet<>(batch), + projectedAspects, + alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect)); } catch (URISyntaxException e) { throw new RuntimeException(e); } @@ -772,7 +783,7 @@ public List batchIngestProposals( .mcps( batch, auditStamp, - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), opContext.getValidationContext().isAlternateValidation()) .build(); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java index eda9b3a880228f..1d2fd422d7f460 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java @@ -89,6 +89,6 @@ public Map batchGetV2NoCache( @Nonnull Set urns, @Nullable Set aspectNames) throws RemoteInvocationException, URISyntaxException { - return super.batchGetV2(opContext, entityName, urns, aspectNames); + return super.batchGetV2(opContext, entityName, urns, aspectNames, false); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java index 626a1f72f5fb73..50cf8af30d606a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java @@ -5,7 +5,7 @@ import com.linkedin.common.urn.Urn; import com.linkedin.entity.Aspect; -import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.AspectRetriever; import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.models.registry.EntityRegistry; import io.datahubproject.metadata.context.OperationContext; @@ -22,7 +22,7 @@ @Getter @Builder -public class EntityServiceAspectRetriever implements CachingAspectRetriever { +public class EntityServiceAspectRetriever implements AspectRetriever { @Setter private OperationContext systemOperationContext; private final EntityRegistry entityRegistry; @@ -46,7 +46,8 @@ public Map> getLatestAspectObjects( String entityName = urns.stream().findFirst().map(Urn::getEntityType).get(); try { return entityResponseToAspectMap( - entityService.getEntitiesV2(systemOperationContext, entityName, urns, aspectNames)); + entityService.getEntitiesV2( + systemOperationContext, entityName, urns, aspectNames, false)); } catch (URISyntaxException e) { throw new RuntimeException(e); } @@ -71,7 +72,8 @@ public Map> getLatestSystemAspects( urnAspectNames.keySet(), urnAspectNames.values().stream() .flatMap(Collection::stream) - .collect(Collectors.toSet())), + .collect(Collectors.toSet()), + false), entityRegistry); } catch (URISyntaxException e) { throw new RuntimeException(e); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java index 6de7784bfbc0ec..8ae09111204cab 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java @@ -261,8 +261,7 
@@ public Map> getLatestAspects( } List systemAspects = - EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), batchGetResults.values()); + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), batchGetResults.values()); systemAspects.stream() // for now, don't add the key aspect here we have already added it above @@ -290,8 +289,7 @@ public Map getLatestAspectsForUrn( Map batchGetResults = getLatestAspect(opContext, new HashSet<>(Arrays.asList(urn)), aspectNames, forUpdate); - return EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), batchGetResults.values()) + return EntityUtils.toSystemAspects(opContext.getRetrieverContext(), batchGetResults.values()) .stream() .map( systemAspect -> Pair.of(systemAspect.getAspectName(), systemAspect.getRecordTemplate())) @@ -335,7 +333,7 @@ public Pair getAspectVersionPair( final Optional maybeAspect = Optional.ofNullable(aspectDao.getAspect(primaryKey)); return Pair.of( - EntityUtils.toSystemAspect(opContext.getRetrieverContext().get(), maybeAspect.orElse(null)) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), maybeAspect.orElse(null)) .map(SystemAspect::getRecordTemplate) .orElse(null), version); @@ -721,7 +719,7 @@ public ListResult listLatestAspects( } return new ListResult<>( - EntityUtils.toSystemAspects(opContext.getRetrieverContext().get(), entityAspects).stream() + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), entityAspects).stream() .map(SystemAspect::getRecordTemplate) .collect(Collectors.toList()), aspectMetadataList.getMetadata(), @@ -758,12 +756,12 @@ public List ingestAspects( .recordTemplate(pair.getValue()) .systemMetadata(systemMetadata) .auditStamp(auditStamp) - .build(opContext.getAspectRetrieverOpt().get())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList()); return ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -815,13 +813,13 @@ private void processPostCommitMCLSideEffects( log.debug("Considering {} MCLs post commit side effects.", mcls.size()); List batch = mcls.stream() - .map(mcl -> MCLItemImpl.builder().build(mcl, opContext.getAspectRetrieverOpt().get())) + .map(mcl -> MCLItemImpl.builder().build(mcl, opContext.getAspectRetriever())) .collect(Collectors.toList()); Iterable> iterable = () -> Iterators.partition( - AspectsBatch.applyPostMCPSideEffects(batch, opContext.getRetrieverContext().get()) + AspectsBatch.applyPostMCPSideEffects(batch, opContext.getRetrieverContext()) .iterator(), MCP_SIDE_EFFECT_KAFKA_BATCH_SIZE); StreamSupport.stream(iterable.spliterator(), false) @@ -831,7 +829,7 @@ private void processPostCommitMCLSideEffects( ingestProposalAsync( AspectsBatchImpl.builder() .items(sideEffects) - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .build()) .count(); log.info("Generated {} MCP SideEffects for async processing", count); @@ -879,8 +877,7 @@ private List ingestAspectsToLocalDB( aspectDao.getLatestAspects(urnAspects, true); final Map> batchAspects = - EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), databaseAspects); + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), databaseAspects); // read #2 (potentially) final Map> nextVersions = @@ -903,7 +900,7 @@ private List ingestAspectsToLocalDB( Map> newLatestAspects = EntityUtils.toSystemAspects( - 
opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), aspectDao.getLatestAspects(updatedItems.getFirst(), true)); // merge updatedLatestAspects = AspectsBatch.merge(batchAspects, newLatestAspects); @@ -941,7 +938,7 @@ private List ingestAspectsToLocalDB( // do final pre-commit checks with previous aspect value ValidationExceptionCollection exceptions = - AspectsBatch.validatePreCommit(changeMCPs, opContext.getRetrieverContext().get()); + AspectsBatch.validatePreCommit(changeMCPs, opContext.getRetrieverContext()); if (exceptions.hasFatalExceptions()) { // IF this is a client request/API request we fail the `transaction batch` @@ -1143,8 +1140,8 @@ public RecordTemplate ingestAspectIfNotPresent( .recordTemplate(newValue) .systemMetadata(systemMetadata) .auditStamp(auditStamp) - .build(opContext.getAspectRetrieverOpt().get()), - opContext.getRetrieverContext().get()) + .build(opContext.getAspectRetriever()), + opContext.getRetrieverContext()) .build(); List ingested = ingestAspects(opContext, aspectsBatch, true, false); @@ -1169,7 +1166,7 @@ public IngestResult ingestProposal( return ingestProposal( opContext, AspectsBatchImpl.builder() - .mcps(List.of(proposal), auditStamp, opContext.getRetrieverContext().get()) + .mcps(List.of(proposal), auditStamp, opContext.getRetrieverContext()) .build(), async) .stream() @@ -1246,7 +1243,7 @@ private Stream ingestTimeseriesProposal( .recordTemplate( EntityApiUtils.buildKeyAspect( opContext.getEntityRegistry(), item.getUrn())) - .build(opContext.getAspectRetrieverOpt().get())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList()); ingestProposalSync( @@ -1469,7 +1466,7 @@ public List restoreIndices( List systemAspects = EntityUtils.toSystemAspectFromEbeanAspects( - opContext.getRetrieverContext().get(), batch.collect(Collectors.toList())); + opContext.getRetrieverContext(), batch.collect(Collectors.toList())); RestoreIndicesResult result = restoreIndices(opContext, systemAspects, logger); result.timeSqlQueryMs = timeSqlQueryMs; @@ -1513,7 +1510,7 @@ public List restoreIndices( long startTime = System.currentTimeMillis(); List systemAspects = EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), getLatestAspect(opContext, entityBatch.getValue(), aspectNames, false).values()); long timeSqlQueryMs = System.currentTimeMillis() - startTime; @@ -1649,12 +1646,12 @@ private RestoreIndicesResult restoreIndices( .auditStamp(auditStamp) .systemMetadata(latestSystemMetadata) .recordTemplate(EntityApiUtils.buildKeyAspect(opContext.getEntityRegistry(), urn)) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); Stream defaultAspectsResult = ingestProposalSync( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(keyAspect) .build()); defaultAspectsCreated += defaultAspectsResult.count(); @@ -1966,7 +1963,7 @@ private void ingestSnapshotUnion( AspectsBatchImpl aspectsBatch = AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items( aspectRecordsToIngest.stream() .map( @@ -1977,7 +1974,7 @@ private void ingestSnapshotUnion( .recordTemplate(pair.getValue()) .auditStamp(auditStamp) .systemMetadata(systemMetadata) - .build(opContext.getAspectRetrieverOpt().get())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList())) .build(); @@ 
-2128,7 +2125,7 @@ public RollbackRunResult deleteUrn(@Nonnull OperationContext opContext, Urn urn) } SystemMetadata latestKeySystemMetadata = - EntityUtils.toSystemAspect(opContext.getRetrieverContext().get(), latestKey) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), latestKey) .map(SystemAspect::getSystemMetadata) .get(); RollbackResult result = @@ -2253,11 +2250,11 @@ private RollbackResult deleteAspectWithoutMCL( .urn(entityUrn) .aspectName(aspectName) .auditStamp(auditStamp) - .build(opContext.getAspectRetrieverOpt().get()); + .build(opContext.getAspectRetriever()); // Delete validation hooks ValidationExceptionCollection exceptions = - AspectsBatch.validateProposed(List.of(deleteItem), opContext.getRetrieverContext().get()); + AspectsBatch.validateProposed(List.of(deleteItem), opContext.getRetrieverContext()); if (!exceptions.isEmpty()) { throw new ValidationException(collectMetrics(exceptions).toString()); } @@ -2271,7 +2268,7 @@ private RollbackResult deleteAspectWithoutMCL( final EntityAspect.EntitySystemAspect latest = (EntityAspect.EntitySystemAspect) EntityUtils.toSystemAspect( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), aspectDao.getLatestAspect(urn, aspectName, false)) .orElse(null); @@ -2299,7 +2296,7 @@ private RollbackResult deleteAspectWithoutMCL( EntityAspect.EntitySystemAspect candidateAspect = (EntityAspect.EntitySystemAspect) EntityUtils.toSystemAspect( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), aspectDao.getAspect(urn, aspectName, maxVersion)) .orElse(null); SystemMetadata previousSysMetadata = @@ -2325,13 +2322,9 @@ private RollbackResult deleteAspectWithoutMCL( .urn(UrnUtils.getUrn(toDelete.getUrn())) .aspectName(toDelete.getAspect()) .auditStamp(auditStamp) - .build( - opContext - .getRetrieverContext() - .get() - .getAspectRetriever())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList()), - opContext.getRetrieverContext().get()); + opContext.getRetrieverContext()); if (!preCommitExceptions.isEmpty()) { throw new ValidationException(collectMetrics(preCommitExceptions).toString()); } @@ -2509,7 +2502,7 @@ private Map getEnvelopedAspects( final Map dbEntries = aspectDao.batchGet(dbKeys, false); List envelopedAspects = - EntityUtils.toSystemAspects(opContext.getRetrieverContext().get(), dbEntries.values()); + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), dbEntries.values()); return envelopedAspects.stream() .collect( diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java index 3c4109970e9d0b..da48a2b76d6d56 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java @@ -72,7 +72,7 @@ public static void ingestChangeProposals( entityService.ingestProposal( opContext, AspectsBatchImpl.builder() - .mcps(changes, getAuditStamp(actor), opContext.getRetrieverContext().get()) + .mcps(changes, getAuditStamp(actor), opContext.getRetrieverContext()) .build(), async); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java index ccc1910ba5cdbd..c595e3e07b8342 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java +++ 
b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java @@ -64,7 +64,7 @@ protected AspectsBatch buildAspectsBatch( List mcps, @Nonnull AuditStamp auditStamp) { return AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, opContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, opContext.getRetrieverContext()) .build(); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java index 49fa555e006f61..74d0d8b0964de0 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java @@ -59,7 +59,7 @@ protected AspectsBatch buildAspectsBatch( List mcps, @Nonnull AuditStamp auditStamp) { return AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, opContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, opContext.getRetrieverContext()) .build(); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java index 367705d369c7ce..6c5c6243d33620 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java @@ -143,7 +143,7 @@ private static QueryBuilder expandTerms( if (!queryUrns.isEmpty()) { scrollGraph( - opContext.getRetrieverContext().get().getGraphRetriever(), + opContext.getRetrieverContext().getGraphRetriever(), queryUrns, relationshipTypes, relationshipDirection, diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java b/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java index 4bb8e0630de480..b4ad847cb7afc2 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java @@ -437,8 +437,6 @@ private void setStructuredPropertiesSearchValue( Map> definitions = opContext - .getRetrieverContext() - .get() .getAspectRetriever() .getLatestAspectObjects( propertyMap.keySet(), Set.of(STRUCTURED_PROPERTY_DEFINITION_ASPECT_NAME)); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java index ad2825ead3d0da..4a692e95346222 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java @@ -112,7 +112,7 @@ private void fetchRelatedEntities( @Nullable String scrollId, int consumedEntityCount, int batchNumber) { - GraphRetriever graph = opContext.getRetrieverContext().get().getGraphRetriever(); + GraphRetriever graph = opContext.getRetrieverContext().getGraphRetriever(); final ArrayList> futureList = new ArrayList<>(); RelatedEntitiesScrollResult result = graph.scrollRelatedEntities( @@ -165,7 +165,7 @@ private Callable processBatch( return () -> { StopWatch stopWatch = new StopWatch(); 
stopWatch.start(); - AspectRetriever aspectRetriever = opContext.getAspectRetrieverOpt().get(); + AspectRetriever aspectRetriever = opContext.getAspectRetriever(); log.info("Batch {} for BA:{} started", batchNumber, entityKey); ExecutionResult executionResult = new ExecutionResult(); executionResult.setBatchNumber(batchNumber); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java index efe073fc00dfdc..4b09bc00efb61a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java @@ -94,8 +94,7 @@ public UpdateGraphIndicesService( public void handleChangeEvent( @Nonnull OperationContext opContext, @Nonnull final MetadataChangeLog event) { try { - MCLItemImpl mclItem = - MCLItemImpl.builder().build(event, opContext.getAspectRetrieverOpt().get()); + MCLItemImpl mclItem = MCLItemImpl.builder().build(event, opContext.getAspectRetriever()); if (UPDATE_CHANGE_TYPES.contains(event.getChangeType())) { handleUpdateChangeEvent(opContext, mclItem); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java index 187ef3e8c62290..c5fc9ebdac9fa6 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java @@ -121,11 +121,10 @@ public UpdateIndicesService( public void handleChangeEvent( @Nonnull OperationContext opContext, @Nonnull final MetadataChangeLog event) { try { - MCLItemImpl batch = - MCLItemImpl.builder().build(event, opContext.getAspectRetrieverOpt().get()); + MCLItemImpl batch = MCLItemImpl.builder().build(event, opContext.getAspectRetriever()); Stream sideEffects = - AspectsBatch.applyMCLSideEffects(List.of(batch), opContext.getRetrieverContext().get()); + AspectsBatch.applyMCLSideEffects(List.of(batch), opContext.getRetrieverContext()); for (MCLItem mclItem : Stream.concat(Stream.of(batch), sideEffects).collect(Collectors.toList())) { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java b/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java index 12b12cf105196e..fa6ab7932001b6 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java @@ -46,12 +46,12 @@ public static Map ingestCorpUserKeyAspects( .recordTemplate(aspect) .auditStamp(AspectGenerationUtils.createAuditStamp()) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); } entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -83,12 +83,12 @@ public static Map ingestCorpUserInfoAspects( .recordTemplate(aspect) .auditStamp(AspectGenerationUtils.createAuditStamp()) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); } entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - 
.retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -121,12 +121,12 @@ public static Map ingestChartInfoAspects( .recordTemplate(aspect) .auditStamp(AspectGenerationUtils.createAuditStamp()) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); } entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java b/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java index 11a3153abcaeed..19be1eb14667d8 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java @@ -16,7 +16,8 @@ import com.linkedin.data.template.StringMap; import com.linkedin.dataset.DatasetProperties; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; import com.linkedin.metadata.entity.SearchRetriever; @@ -28,7 +29,6 @@ import com.linkedin.mxe.SystemMetadata; import com.linkedin.test.metadata.aspect.TestEntityRegistry; import io.datahubproject.metadata.context.RetrieverContext; -import io.datahubproject.test.metadata.context.TestOperationContexts; import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; import java.util.List; @@ -53,17 +53,17 @@ public class IgnoreUnknownMutatorTest { private static final Urn TEST_DATASET_URN = UrnUtils.getUrn( "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.customers,PROD)"); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) + .cachingAspectRetriever(mockAspectRetriever) + .graphRetriever(GraphRetriever.EMPTY) .build(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java b/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java index 04aff4edf456d9..e7ed2671131592 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java @@ -56,8 +56,7 @@ public void testAdditionalChanges() { DefaultAspectsUtil.getAdditionalChanges( opContext, AspectsBatchImpl.builder() - .mcps( - List.of(proposal1), new AuditStamp(), opContext.getRetrieverContext().get()) + .mcps(List.of(proposal1), new AuditStamp(), opContext.getRetrieverContext()) .build() .getMCPItems(), entityServiceImpl, diff --git 
a/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java b/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java index 976b165fea53df..215e1e2431efa0 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java @@ -15,7 +15,7 @@ import com.linkedin.dataproduct.DataProductAssociationArray; import com.linkedin.dataproduct.DataProductProperties; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.aspect.batch.MCPItem; @@ -75,12 +75,12 @@ public class DataProductUnsetSideEffectTest { .build())) .build(); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(TEST_REGISTRY); GraphRetriever graphRetriever = mock(GraphRetriever.class); RelatedEntities relatedEntities = @@ -139,7 +139,7 @@ public void setup() { retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever(mockAspectRetriever) .graphRetriever(graphRetriever) .build(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java index 0386031cbcad86..88f84ee94c8ee7 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java @@ -19,6 +19,7 @@ import com.linkedin.metadata.AspectGenerationUtils; import com.linkedin.metadata.Constants; import com.linkedin.metadata.EbeanTestUtils; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.config.EbeanConfiguration; import com.linkedin.metadata.config.PreProcessHooks; import com.linkedin.metadata.entity.ebean.EbeanAspectDao; @@ -98,12 +99,15 @@ public void setupTest() { .entityService(_entityServiceImpl) .entityRegistry(_testEntityRegistry) .build()) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) - .searchRetriever(TestOperationContexts.emptySearchRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> _testEntityRegistry)) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, opContext -> - ((EntityServiceAspectRetriever) opContext.getAspectRetrieverOpt().get()) + ((EntityServiceAspectRetriever) opContext.getAspectRetriever()) .setSystemOperationContext(opContext), null); } @@ -152,25 +156,25 @@ public void testIngestListLatestAspects() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn2) 
.aspectName(aspectName) .recordTemplate(writeAspect2) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null))); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null))); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -230,25 +234,25 @@ public void testIngestListUrns() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn2) .aspectName(aspectName) .recordTemplate(writeAspect2) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null))); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null))); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -310,11 +314,11 @@ public void testSystemMetadataDuplicateKey() throws Exception { .recordTemplate(new Status().setRemoved(true)) .systemMetadata(systemMetadata) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(item)) .build(), false, @@ -356,7 +360,7 @@ public void testSystemMetadataDuplicateKey() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items( List.of( ChangeItemImpl.builder() @@ -365,7 +369,7 @@ public void testSystemMetadataDuplicateKey() throws Exception { .recordTemplate(new Status().setRemoved(false)) .systemMetadata(systemMetadata) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)))) + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)))) .build(), false, true); @@ -600,7 +604,7 @@ public void run() { auditStamp.setTime(System.currentTimeMillis()); AspectsBatchImpl batch = AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, operationContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, operationContext.getRetrieverContext()) .build(); entityService.ingestProposal(operationContext, batch, false); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java 
b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java index 2d59632e6f3c6d..c00632e5cf5424 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java @@ -945,32 +945,32 @@ public void testRollbackAspect() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn2) .aspectName(aspectName) .recordTemplate(writeAspect2) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(aspectName) .recordTemplate(writeAspect1Overwrite) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1037,25 +1037,25 @@ public void testRollbackKey() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(keyAspectName) .recordTemplate(writeKey1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(aspectName) .recordTemplate(writeAspect1Overwrite) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1130,39 +1130,39 @@ public void testRollbackUrn() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(keyAspectName) .recordTemplate(writeKey1) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn2) .aspectName(aspectName) .recordTemplate(writeAspect2) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(aspectName) 
.recordTemplate(writeAspect1Overwrite) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1208,11 +1208,11 @@ public void testIngestGetLatestAspect() throws AssertionError { .recordTemplate(writeAspect1) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1264,11 +1264,11 @@ public void testIngestGetLatestAspect() throws AssertionError { .recordTemplate(writeAspect2) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata2) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1320,11 +1320,11 @@ public void testIngestGetLatestEnvelopedAspect() throws Exception { .recordTemplate(writeAspect1) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1347,11 +1347,11 @@ public void testIngestGetLatestEnvelopedAspect() throws Exception { .recordTemplate(writeAspect2) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1416,11 +1416,11 @@ public void testIngestSameAspect() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1472,11 +1472,11 @@ public void testIngestSameAspect() throws AssertionError { .recordTemplate(writeAspect2) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1534,46 +1534,46 @@ public void testRetention() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - 
.build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName) .recordTemplate(writeAspect1a) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName) .recordTemplate(writeAspect1b) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2a) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2b) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1610,18 +1610,18 @@ public void testRetention() throws AssertionError { .recordTemplate(writeAspect1c) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2c) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1982,8 +1982,7 @@ public void testStructuredPropertyIngestProposal() throws Exception { stream .map( entityAspect -> - EntityUtils.toSystemAspect( - opContext.getRetrieverContext().get(), entityAspect) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), entityAspect) .get() .getAspect(StructuredPropertyDefinition.class)) .collect(Collectors.toSet()); @@ -1995,7 +1994,10 @@ public void testStructuredPropertyIngestProposal() throws Exception { SystemEntityClient mockSystemEntityClient = Mockito.mock(SystemEntityClient.class); Mockito.when( mockSystemEntityClient.getLatestAspectObject( - any(OperationContext.class), eq(firstPropertyUrn), eq("propertyDefinition"))) + any(OperationContext.class), + eq(firstPropertyUrn), + eq("propertyDefinition"), + anyBoolean())) .thenReturn(new com.linkedin.entity.Aspect(structuredPropertyDefinition.data())); // Add a value for that property @@ -2062,8 +2064,7 @@ public void testStructuredPropertyIngestProposal() throws Exception { stream .map( entityAspect -> - EntityUtils.toSystemAspect( 
- opContext.getRetrieverContext().get(), entityAspect) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), entityAspect) .get() .getAspect(StructuredPropertyDefinition.class)) .collect(Collectors.toSet()); @@ -2074,7 +2075,10 @@ public void testStructuredPropertyIngestProposal() throws Exception { Mockito.when( mockSystemEntityClient.getLatestAspectObject( - any(OperationContext.class), eq(secondPropertyUrn), eq("propertyDefinition"))) + any(OperationContext.class), + eq(secondPropertyUrn), + eq("propertyDefinition"), + anyBoolean())) .thenReturn(new com.linkedin.entity.Aspect(secondDefinition.data())); // Get existing value for first structured property @@ -2209,7 +2213,7 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(true)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); ChangeItemImpl item2 = ChangeItemImpl.builder() .urn(entityUrn) @@ -2217,11 +2221,11 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(false)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(item1, item2)) .build(), false, @@ -2269,7 +2273,7 @@ public void testBatchPatchWithTrailingNoOp() throws Exception { .setTags(new TagAssociationArray(new TagAssociation().setTag(tag1)))) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); PatchItemImpl patchAdd2 = PatchItemImpl.builder() @@ -2311,7 +2315,7 @@ public void testBatchPatchWithTrailingNoOp() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(initialAspectTag1)) .build(), false, @@ -2320,7 +2324,7 @@ public void testBatchPatchWithTrailingNoOp() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd2, patchRemoveNonExistent)) .build(), false, @@ -2368,7 +2372,7 @@ public void testBatchPatchAdd() throws Exception { .setTags(new TagAssociationArray(new TagAssociation().setTag(tag1)))) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); PatchItemImpl patchAdd3 = PatchItemImpl.builder() @@ -2428,7 +2432,7 @@ public void testBatchPatchAdd() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(initialAspectTag1)) .build(), false, @@ -2437,7 +2441,7 @@ public void testBatchPatchAdd() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - 
.retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd3, patchAdd2, patchAdd1)) .build(), false, @@ -2491,7 +2495,7 @@ public void testBatchPatchAddDuplicate() throws Exception { .recordTemplate(new GlobalTags().setTags(new TagAssociationArray(initialTags))) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); PatchItemImpl patchAdd2 = PatchItemImpl.builder() @@ -2516,7 +2520,7 @@ public void testBatchPatchAddDuplicate() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(initialAspectTag1)) .build(), false, @@ -2525,7 +2529,7 @@ public void testBatchPatchAddDuplicate() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd2, patchAdd2)) // duplicate .build(), false, @@ -2581,7 +2585,7 @@ public void testPatchRemoveNonExistent() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchRemove)) .build(), false, @@ -2638,7 +2642,7 @@ public void testPatchAddNonExistent() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd)) .build(), false, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java index 550f55e6bfd0b9..b4fbfecc9d60d3 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java @@ -10,11 +10,13 @@ import com.linkedin.metadata.AspectGenerationUtils; import com.linkedin.metadata.AspectIngestionUtils; import com.linkedin.metadata.CassandraTestUtils; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.config.PreProcessHooks; import com.linkedin.metadata.entity.EntityServiceAspectRetriever; import com.linkedin.metadata.entity.EntityServiceImpl; import com.linkedin.metadata.entity.EntityServiceTest; import com.linkedin.metadata.entity.ListResult; +import com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.event.EventProducer; import com.linkedin.metadata.key.CorpUserKey; import com.linkedin.metadata.models.registry.EntityRegistryException; @@ -93,12 +95,15 @@ private void configureComponents() { .entityService(_entityServiceImpl) .entityRegistry(_testEntityRegistry) .build()) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) - .searchRetriever(TestOperationContexts.emptySearchRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> _testEntityRegistry)) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, opContext -> - 
((EntityServiceAspectRetriever) opContext.getAspectRetrieverOpt().get()) + ((EntityServiceAspectRetriever) opContext.getAspectRetriever()) .setSystemOperationContext(opContext), null); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java index 3f6b301e72aa5a..0a867ae3c8f2e0 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java @@ -26,7 +26,7 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(true)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); ChangeItemImpl item2 = ChangeItemImpl.builder() .urn(entityUrn) @@ -34,7 +34,7 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(false)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); assertFalse(item1.isDatabaseDuplicateOf(item2)); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java index ca42f0327c86db..8f68f119cb0b7d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java @@ -11,6 +11,7 @@ import com.linkedin.metadata.recommendation.ranker.SimpleRecommendationRanker; import io.datahubproject.test.metadata.context.TestOperationContexts; import java.net.URISyntaxException; +import java.nio.file.AccessDeniedException; import java.util.List; import java.util.stream.Collectors; import org.testng.annotations.Test; @@ -74,7 +75,7 @@ private List getContentFromUrns(List urns) { } @Test - public void testService() throws URISyntaxException { + public void testService() throws URISyntaxException, AccessDeniedException { // Test non-eligible and empty RecommendationsService service = new RecommendationsService(ImmutableList.of(nonEligibleSource, emptySource), ranker); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java b/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java index 1661f5f02ee593..fa895cb4540117 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java @@ -21,7 +21,8 @@ import com.linkedin.data.ByteString; import com.linkedin.entity.Aspect; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCLItem; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; @@ -46,7 +47,6 @@ import 
com.linkedin.test.metadata.aspect.TestEntityRegistry; import com.linkedin.test.metadata.aspect.batch.TestMCP; import io.datahubproject.metadata.context.RetrieverContext; -import io.datahubproject.test.metadata.context.TestOperationContexts; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; @@ -87,18 +87,18 @@ public class SchemaFieldSideEffectTest { .build())) .build(); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(TEST_REGISTRY); retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) + .cachingAspectRetriever(mockAspectRetriever) + .graphRetriever(GraphRetriever.EMPTY) .build(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java index fd768424e13c19..1825b65a18ab19 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java @@ -20,6 +20,7 @@ import com.linkedin.metadata.aspect.models.graph.RelatedEntities; import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; import com.linkedin.metadata.config.search.QueryFilterRewriterConfiguration; +import com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.Condition; @@ -71,8 +72,10 @@ public void init() { () -> io.datahubproject.metadata.context.RetrieverContext.builder() .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever(() -> entityRegistry)) .graphRetriever(mockGraphRetriever) - .searchRetriever(TestOperationContexts.emptySearchRetriever) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, null, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java index 8741e24b1bca50..de375271ed6602 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java @@ -13,13 +13,14 @@ import static org.mockito.Mockito.when; import static org.testng.Assert.assertEquals; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.RetrieverContext; import com.linkedin.metadata.aspect.models.graph.Edge; import com.linkedin.metadata.aspect.models.graph.RelatedEntities; import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; import com.linkedin.metadata.config.search.QueryFilterRewriterConfiguration; +import 
com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.Condition; @@ -54,7 +55,7 @@ public class DomainExpansionRewriterTest @BeforeMethod public void init() { EntityRegistry entityRegistry = new TestEntityRegistry(); - AspectRetriever mockAspectRetriever = mock(AspectRetriever.class); + CachingAspectRetriever mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(entityRegistry); mockGraphRetriever = spy(GraphRetriever.class); @@ -71,8 +72,10 @@ public void init() { () -> io.datahubproject.metadata.context.RetrieverContext.builder() .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever(() -> entityRegistry)) .graphRetriever(mockGraphRetriever) - .searchRetriever(TestOperationContexts.emptySearchRetriever) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, null, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java index c68997e25bcff7..d6f5f9c3eedbe7 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java @@ -18,6 +18,7 @@ import com.linkedin.data.template.StringArray; import com.linkedin.entity.Aspect; import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.config.search.SearchConfiguration; import com.linkedin.metadata.models.EntitySpec; import com.linkedin.metadata.models.annotation.SearchableAnnotation; @@ -49,8 +50,8 @@ public class AggregationQueryBuilderTest { - private static AspectRetriever aspectRetriever; - private static AspectRetriever aspectRetrieverV1; + private static CachingAspectRetriever aspectRetriever; + private static CachingAspectRetriever aspectRetrieverV1; private static String DEFAULT_FILTER = "_index"; @BeforeClass @@ -61,7 +62,7 @@ public void setup() throws RemoteInvocationException, URISyntaxException { Urn.createFromString("urn:li:structuredProperty:under.scores.and.dots_make_a_mess"); // legacy - aspectRetriever = mock(AspectRetriever.class); + aspectRetriever = mock(CachingAspectRetriever.class); when(aspectRetriever.getEntityRegistry()) .thenReturn(TestOperationContexts.defaultEntityRegistry()); @@ -106,7 +107,7 @@ public void setup() throws RemoteInvocationException, URISyntaxException { new Aspect(structPropUnderscoresAndDotsDefinition.data())))); // V1 - aspectRetrieverV1 = mock(AspectRetriever.class); + aspectRetrieverV1 = mock(CachingAspectRetriever.class); when(aspectRetrieverV1.getEntityRegistry()) .thenReturn(TestOperationContexts.defaultEntityRegistry()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java index 393ca3ca5d4a64..e51511699e345a 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java @@ -662,6 +662,7 @@ public void 
testInvalidStructuredProperty() { TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever(TestOperationContexts.emptyActiveUsersAspectRetriever(null)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java index 2c5bcd1294fa15..65b73b7425b743 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java @@ -247,6 +247,9 @@ public void testSetSearchableRefValue() throws URISyntaxException, RemoteInvocat TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); @@ -301,6 +304,9 @@ public void testSetSearchableRefValue_RuntimeException() TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); @@ -337,6 +343,9 @@ public void testSetSearchableRefValue_RuntimeException_URNExist() TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); @@ -369,6 +378,9 @@ void testSetSearchableRefValue_WithInvalidURN() TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java index b1b716c5604816..9a0a82c7f9f49d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java @@ -18,7 +18,8 @@ import com.linkedin.common.urn.UrnUtils; import com.linkedin.entity.Aspect; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.batch.PatchMCP; import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; @@ -36,7 +37,6 @@ import 
com.linkedin.test.metadata.aspect.TestEntityRegistry; import com.linkedin.test.metadata.aspect.batch.TestMCL; import io.datahubproject.metadata.context.RetrieverContext; -import io.datahubproject.test.metadata.context.TestOperationContexts; import jakarta.json.Json; import jakarta.json.JsonPatch; import java.util.List; @@ -76,13 +76,13 @@ public class PropertyDefinitionDeleteSideEffectTest { private static final Urn TEST_DATASET_URN = UrnUtils.getUrn( "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.customers,PROD)"); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private SearchRetriever mockSearchRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(TEST_REGISTRY); when(mockAspectRetriever.getLatestAspectObject( eq(TEST_PROPERTY_URN), eq(STRUCTURED_PROPERTY_DEFINITION_ASPECT_NAME))) @@ -101,8 +101,8 @@ public void setup() { retrieverContext = RetrieverContext.builder() .searchRetriever(mockSearchRetriever) - .aspectRetriever(mockAspectRetriever) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) + .cachingAspectRetriever(mockAspectRetriever) + .graphRetriever(GraphRetriever.EMPTY) .build(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java index 2503faa00f6e71..6e8886f495c95a 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java @@ -58,7 +58,7 @@ public void setup() { mockGraphRetriever = Mockito.mock(GraphRetriever.class); retrieverContext = io.datahubproject.metadata.context.RetrieverContext.builder() - .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever(mockAspectRetriever) .searchRetriever(mockSearchRetriever) .graphRetriever(mockGraphRetriever) .build(); diff --git a/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java b/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java index 3acd2bf3413578..02cd28eb202e94 100644 --- a/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java +++ b/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java @@ -171,10 +171,7 @@ public Stream> generateMCPs( DefaultAspectsUtil.getAdditionalChanges( opContext, AspectsBatchImpl.builder() - .mcps( - List.of(mcp), - auditStamp, - opContext.getRetrieverContext().get()) + .mcps(List.of(mcp), auditStamp, opContext.getRetrieverContext()) .build() .getMCPItems(), entityService, diff --git a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java index cf9d73dfa729be..f16c9dbd82e749 100644 --- a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java +++ b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java @@ -20,7 +20,6 @@ import 
com.linkedin.metadata.utils.elasticsearch.IndexConvention; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.OperationContextConfig; -import io.datahubproject.metadata.context.RetrieverContext; import io.datahubproject.metadata.context.ServicesRegistryContext; import io.datahubproject.metadata.context.ValidationContext; import io.datahubproject.test.metadata.context.TestOperationContexts; @@ -95,7 +94,7 @@ public OperationContext operationContext( entityRegistry, mock(ServicesRegistryContext.class), indexConvention, - mock(RetrieverContext.class), + TestOperationContexts.emptyActiveUsersRetrieverContext(() -> entityRegistry), mock(ValidationContext.class)); } diff --git a/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java b/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java index 47740b02d6166c..65ee6b8591f489 100644 --- a/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java +++ b/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java @@ -93,8 +93,6 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { new RelatedEntity(BUSINESS_ATTRIBUTE_OF, SCHEMA_FIELD_URN.toString()))); when(opContext - .getRetrieverContext() - .get() .getAspectRetriever() .getLatestAspectObjects( eq(Set.of(SCHEMA_FIELD_URN)), eq(Set.of(BUSINESS_ATTRIBUTE_ASPECT)))) @@ -108,7 +106,7 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { // verify // page 1 - Mockito.verify(opContext.getRetrieverContext().get().getGraphRetriever(), Mockito.times(1)) + Mockito.verify(opContext.getRetrieverContext().getGraphRetriever(), Mockito.times(1)) .scrollRelatedEntities( isNull(), any(Filter.class), @@ -122,7 +120,7 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { isNull(), isNull()); // page 2 - Mockito.verify(opContext.getRetrieverContext().get().getGraphRetriever(), Mockito.times(1)) + Mockito.verify(opContext.getRetrieverContext().getGraphRetriever(), Mockito.times(1)) .scrollRelatedEntities( isNull(), any(Filter.class), @@ -136,7 +134,7 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { isNull(), isNull()); - Mockito.verifyNoMoreInteractions(opContext.getRetrieverContext().get().getGraphRetriever()); + Mockito.verifyNoMoreInteractions(opContext.getRetrieverContext().getGraphRetriever()); // 2 pages = 2 ingest proposals Mockito.verify(mockUpdateIndicesService, Mockito.times(2)) @@ -152,8 +150,8 @@ private void testMCLOnInvalidCategory() throws Exception { businessAttributeServiceHook.handleChangeEvent(opContext, platformEvent); // verify - Mockito.verifyNoInteractions(opContext.getRetrieverContext().get().getGraphRetriever()); - Mockito.verifyNoInteractions(opContext.getAspectRetrieverOpt().get()); + Mockito.verifyNoInteractions(opContext.getRetrieverContext().getGraphRetriever()); + Mockito.verifyNoInteractions(opContext.getAspectRetriever()); Mockito.verifyNoInteractions(mockUpdateIndicesService); } @@ -226,13 +224,15 @@ private OperationContext mockOperationContextWithGraph(List graph RetrieverContext mockRetrieverContext = mock(RetrieverContext.class); when(mockRetrieverContext.getAspectRetriever()).thenReturn(mock(AspectRetriever.class)); + when(mockRetrieverContext.getCachingAspectRetriever()) + .thenReturn(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); 
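Taken together, the test updates above follow one wiring change: io.datahubproject.metadata.context.RetrieverContext now carries a CachingAspectRetriever alongside the plain AspectRetriever, and the ad-hoc TestOperationContexts.emptyGraphRetriever / emptySearchRetriever helpers are replaced by the EMPTY constants on GraphRetriever and SearchRetriever. A minimal sketch of the new test wiring, illustrative only and assuming Mockito plus the test helpers introduced in this patch:

    // Hypothetical test setup mirroring the builder usage in the tests above.
    CachingAspectRetriever cachingAspectRetriever = mock(CachingAspectRetriever.class);
    when(cachingAspectRetriever.getEntityRegistry())
        .thenReturn(TestOperationContexts.defaultEntityRegistry());

    io.datahubproject.metadata.context.RetrieverContext retrieverContext =
        io.datahubproject.metadata.context.RetrieverContext.builder()
            // the builder backfills aspectRetriever from the caching retriever
            // (see RetrieverContextBuilder later in this patch)
            .cachingAspectRetriever(cachingAspectRetriever)
            .graphRetriever(GraphRetriever.EMPTY)   // replaces TestOperationContexts.emptyGraphRetriever
            .searchRetriever(SearchRetriever.EMPTY) // replaces TestOperationContexts.emptySearchRetriever
            .build();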
when(mockRetrieverContext.getGraphRetriever()).thenReturn(graphRetriever); OperationContext opContext = TestOperationContexts.systemContextNoSearchAuthorization(mockRetrieverContext); // reset mock for test - reset(opContext.getAspectRetrieverOpt().get()); + reset(opContext.getAspectRetriever()); if (!graphEdges.isEmpty()) { diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java index e65bf22991736d..c08b7fad4dee32 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java @@ -1,12 +1,23 @@ package io.datahubproject.metadata.context; +import static com.linkedin.metadata.Constants.CORP_USER_KEY_ASPECT_NAME; +import static com.linkedin.metadata.Constants.CORP_USER_STATUS_ASPECT_NAME; +import static com.linkedin.metadata.Constants.CORP_USER_STATUS_SUSPENDED; +import static com.linkedin.metadata.Constants.STATUS_ASPECT_NAME; +import static com.linkedin.metadata.Constants.SYSTEM_ACTOR; + import com.datahub.authentication.Authentication; +import com.linkedin.common.Status; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; +import com.linkedin.entity.Aspect; +import com.linkedin.identity.CorpUserStatus; +import com.linkedin.metadata.aspect.AspectRetriever; import com.linkedin.metadata.authorization.PoliciesConfig; import com.linkedin.policy.DataHubPolicyInfo; import java.util.Collection; import java.util.Collections; +import java.util.Map; import java.util.Optional; import java.util.Set; import lombok.Builder; @@ -48,6 +59,43 @@ public Urn getActorUrn() { return UrnUtils.getUrn(authentication.getActor().toUrnStr()); } + /** + * Actor is considered active if the user is not hard-deleted, soft-deleted, and is not suspended + * + * @param aspectRetriever aspect retriever - ideally the SystemEntityClient backed one for caching + * @return active status + */ + public boolean isActive(AspectRetriever aspectRetriever) { + // system cannot be disabled + if (SYSTEM_ACTOR.equals(authentication.getActor().toUrnStr())) { + return true; + } + + Urn selfUrn = UrnUtils.getUrn(authentication.getActor().toUrnStr()); + Map> urnAspectMap = + aspectRetriever.getLatestAspectObjects( + Set.of(selfUrn), + Set.of(STATUS_ASPECT_NAME, CORP_USER_STATUS_ASPECT_NAME, CORP_USER_KEY_ASPECT_NAME)); + + Map aspectMap = urnAspectMap.getOrDefault(selfUrn, Map.of()); + + if (!aspectMap.containsKey(CORP_USER_KEY_ASPECT_NAME)) { + // user is hard deleted + return false; + } + + Status status = + Optional.ofNullable(aspectMap.get(STATUS_ASPECT_NAME)) + .map(a -> new Status(a.data())) + .orElse(new Status().setRemoved(false)); + CorpUserStatus corpUserStatus = + Optional.ofNullable(aspectMap.get(CORP_USER_STATUS_ASPECT_NAME)) + .map(a -> new CorpUserStatus(a.data())) + .orElse(new CorpUserStatus().setStatus("")); + + return !status.isRemoved() && !CORP_USER_STATUS_SUSPENDED.equals(corpUserStatus.getStatus()); + } + /** * The current implementation creates a cache entry unique for the set of policies. 
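The isActive check added above keys off three aspects: corpUserKey (missing means hard-deleted), status (removed means soft-deleted), and corpUserStatus (SUSPENDED means suspended); the system actor always passes. A small sketch of the suspended-user case using a mocked retriever - the urn, mock values, and assertion are hypothetical and not code from this patch:

    // A user with a corpUserKey aspect but a SUSPENDED corpUserStatus is treated as inactive.
    Urn userUrn = UrnUtils.getUrn("urn:li:corpuser:jdoe");
    AspectRetriever retriever = mock(AspectRetriever.class);
    when(retriever.getLatestAspectObjects(eq(Set.of(userUrn)), anySet()))
        .thenReturn(
            Map.of(
                userUrn,
                Map.of(
                    CORP_USER_KEY_ASPECT_NAME,
                    new Aspect(new CorpUserKey().setUsername("jdoe").data()),
                    CORP_USER_STATUS_ASPECT_NAME,
                    new Aspect(new CorpUserStatus().setStatus(CORP_USER_STATUS_SUSPENDED).data()))));

    ActorContext suspendedActor =
        ActorContext.builder()
            .authentication(new Authentication(new Actor(ActorType.USER, "jdoe"), ""))
            .build();
    // No status aspect defaults to removed=false, so suspension alone causes rejection.
    assertFalse(suspendedActor.isActive(retriever));

Callers that build per-request sessions see the same rejection surfaced as an ActorAccessException from OperationContext.asSession / build, as shown further down in this patch.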
* diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java index 9a058c526647c2..9158129235b39e 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java @@ -16,6 +16,8 @@ import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.utils.AuditStampUtils; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; +import io.datahubproject.metadata.exception.ActorAccessException; +import io.datahubproject.metadata.exception.OperationContextException; import java.util.Collection; import java.util.Objects; import java.util.Optional; @@ -63,6 +65,24 @@ public static OperationContext asSession( @Nonnull Authorizer authorizer, @Nonnull Authentication sessionAuthentication, boolean allowSystemAuthentication) { + return OperationContext.asSession( + systemOperationContext, + requestContext, + authorizer, + sessionAuthentication, + allowSystemAuthentication, + false); + } + + @Nonnull + public static OperationContext asSession( + OperationContext systemOperationContext, + @Nonnull RequestContext requestContext, + @Nonnull Authorizer authorizer, + @Nonnull Authentication sessionAuthentication, + boolean allowSystemAuthentication, + boolean skipCache) + throws ActorAccessException { return systemOperationContext.toBuilder() .operationContextConfig( // update allowed system authentication @@ -72,7 +92,7 @@ public static OperationContext asSession( .authorizationContext(AuthorizationContext.builder().authorizer(authorizer).build()) .requestContext(requestContext) .validationContext(systemOperationContext.getValidationContext()) - .build(sessionAuthentication); + .build(sessionAuthentication, skipCache); } /** @@ -85,10 +105,14 @@ public static OperationContext asSession( public static OperationContext withSearchFlags( OperationContext opContext, Function flagDefaults) { - return opContext.toBuilder() - // update search flags for the request's session - .searchContext(opContext.getSearchContext().withFlagDefaults(flagDefaults)) - .build(opContext.getSessionActorContext()); + try { + return opContext.toBuilder() + // update search flags for the request's session + .searchContext(opContext.getSearchContext().withFlagDefaults(flagDefaults)) + .build(opContext.getSessionActorContext(), false); + } catch (OperationContextException e) { + throw new RuntimeException(e); + } } /** @@ -101,10 +125,14 @@ public static OperationContext withSearchFlags( public static OperationContext withLineageFlags( OperationContext opContext, Function flagDefaults) { - return opContext.toBuilder() - // update lineage flags for the request's session - .searchContext(opContext.getSearchContext().withLineageFlagDefaults(flagDefaults)) - .build(opContext.getSessionActorContext()); + try { + return opContext.toBuilder() + // update lineage flags for the request's session + .searchContext(opContext.getSearchContext().withLineageFlagDefaults(flagDefaults)) + .build(opContext.getSessionActorContext(), false); + } catch (OperationContextException e) { + throw new RuntimeException(e); + } } /** @@ -155,18 +183,22 @@ public static OperationContext asSystem( ? 
SearchContext.EMPTY : SearchContext.builder().indexConvention(indexConvention).build(); - return OperationContext.builder() - .operationContextConfig(systemConfig) - .systemActorContext(systemActorContext) - .searchContext(systemSearchContext) - .entityRegistryContext(EntityRegistryContext.builder().build(entityRegistry)) - .servicesRegistryContext(servicesRegistryContext) - // Authorizer.EMPTY doesn't actually apply to system auth - .authorizationContext(AuthorizationContext.builder().authorizer(Authorizer.EMPTY).build()) - .retrieverContext(retrieverContext) - .objectMapperContext(objectMapperContext) - .validationContext(validationContext) - .build(systemAuthentication); + try { + return OperationContext.builder() + .operationContextConfig(systemConfig) + .systemActorContext(systemActorContext) + .searchContext(systemSearchContext) + .entityRegistryContext(EntityRegistryContext.builder().build(entityRegistry)) + .servicesRegistryContext(servicesRegistryContext) + // Authorizer.EMPTY doesn't actually apply to system auth + .authorizationContext(AuthorizationContext.builder().authorizer(Authorizer.EMPTY).build()) + .retrieverContext(retrieverContext) + .objectMapperContext(objectMapperContext) + .validationContext(validationContext) + .build(systemAuthentication, false); + } catch (OperationContextException e) { + throw new RuntimeException(e); + } } @Nonnull private final OperationContextConfig operationContextConfig; @@ -177,7 +209,7 @@ public static OperationContext asSystem( @Nonnull private final EntityRegistryContext entityRegistryContext; @Nullable private final ServicesRegistryContext servicesRegistryContext; @Nullable private final RequestContext requestContext; - @Nullable private final RetrieverContext retrieverContext; + @Nonnull private final RetrieverContext retrieverContext; @Nonnull private final ObjectMapperContext objectMapperContext; @Nonnull private final ValidationContext validationContext; @@ -194,13 +226,15 @@ public OperationContext withLineageFlags( public OperationContext asSession( @Nonnull RequestContext requestContext, @Nonnull Authorizer authorizer, - @Nonnull Authentication sessionAuthentication) { + @Nonnull Authentication sessionAuthentication) + throws ActorAccessException { return OperationContext.asSession( this, requestContext, authorizer, sessionAuthentication, - getOperationContextConfig().isAllowSystemAuthentication()); + getOperationContextConfig().isAllowSystemAuthentication(), + false); } @Nonnull @@ -284,17 +318,9 @@ public AuditStamp getAuditStamp() { return getAuditStamp(null); } - public Optional getRetrieverContext() { - return Optional.ofNullable(retrieverContext); - } - - @Nullable + @Nonnull public AspectRetriever getAspectRetriever() { - return getAspectRetrieverOpt().orElse(null); - } - - public Optional getAspectRetrieverOpt() { - return getRetrieverContext().map(RetrieverContext::getAspectRetriever); + return retrieverContext.getAspectRetriever(); } /** @@ -336,10 +362,7 @@ public String getGlobalContextId() { ? EmptyContext.EMPTY : getServicesRegistryContext()) .add(getRequestContext() == null ? EmptyContext.EMPTY : getRequestContext()) - .add( - getRetrieverContext().isPresent() - ? getRetrieverContext().get() - : EmptyContext.EMPTY) + .add(getRetrieverContext()) .add(getObjectMapperContext()) .build() .stream() @@ -364,10 +387,7 @@ public String getSearchContextId() { getServicesRegistryContext() == null ? EmptyContext.EMPTY : getServicesRegistryContext()) - .add( - getRetrieverContext().isPresent() - ? 
getRetrieverContext().get() - : EmptyContext.EMPTY) + .add(getRetrieverContext()) .build() .stream() .map(ContextInterface::getCacheKeyComponent) @@ -438,6 +458,12 @@ public static class OperationContextBuilder { @Nonnull public OperationContext build(@Nonnull Authentication sessionAuthentication) { + return build(sessionAuthentication, false); + } + + @Nonnull + public OperationContext build( + @Nonnull Authentication sessionAuthentication, boolean skipCache) { final Urn actorUrn = UrnUtils.getUrn(sessionAuthentication.getActor().toUrnStr()); final ActorContext sessionActor = ActorContext.builder() @@ -451,11 +477,20 @@ public OperationContext build(@Nonnull Authentication sessionAuthentication) { .policyInfoSet(this.authorizationContext.getAuthorizer().getActorPolicies(actorUrn)) .groupMembership(this.authorizationContext.getAuthorizer().getActorGroups(actorUrn)) .build(); - return build(sessionActor); + return build(sessionActor, skipCache); } @Nonnull - public OperationContext build(@Nonnull ActorContext sessionActor) { + public OperationContext build(@Nonnull ActorContext sessionActor, boolean skipCache) { + AspectRetriever retriever = + skipCache + ? this.retrieverContext.getAspectRetriever() + : this.retrieverContext.getCachingAspectRetriever(); + + if (!sessionActor.isActive(retriever)) { + throw new ActorAccessException("Actor is not active"); + } + return new OperationContext( this.operationContextConfig, sessionActor, diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java index 9337fbfe3bb003..9afc4138810bb2 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java @@ -1,8 +1,10 @@ package io.datahubproject.metadata.context; import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.entity.SearchRetriever; +import java.util.Objects; import java.util.Optional; import javax.annotation.Nonnull; import lombok.Builder; @@ -15,10 +17,37 @@ public class RetrieverContext @Nonnull private final GraphRetriever graphRetriever; @Nonnull private final AspectRetriever aspectRetriever; + @Nonnull private final CachingAspectRetriever cachingAspectRetriever; @Nonnull private final SearchRetriever searchRetriever; @Override public Optional getCacheKeyComponent() { return Optional.empty(); } + + public static class RetrieverContextBuilder { + public RetrieverContext build() { + if (this.aspectRetriever == null && this.cachingAspectRetriever != null) { + this.aspectRetriever = this.cachingAspectRetriever; + } + + if (this.cachingAspectRetriever == null + && this.aspectRetriever instanceof CachingAspectRetriever) { + this.cachingAspectRetriever = (CachingAspectRetriever) this.aspectRetriever; + } + + return new RetrieverContext( + this.graphRetriever, + Objects.requireNonNull(this.aspectRetriever), + Objects.requireNonNull(this.cachingAspectRetriever), + this.searchRetriever); + } + } + + public static final RetrieverContext EMPTY = + RetrieverContext.builder() + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) + .cachingAspectRetriever(CachingAspectRetriever.EMPTY) + .build(); } diff --git 
a/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java new file mode 100644 index 00000000000000..bca2594b96430e --- /dev/null +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java @@ -0,0 +1,7 @@ +package io.datahubproject.metadata.exception; + +public class ActorAccessException extends OperationContextException { + public ActorAccessException(String string) { + super(string); + } +} diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java new file mode 100644 index 00000000000000..1aac8dc3e60ec9 --- /dev/null +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java @@ -0,0 +1,9 @@ +package io.datahubproject.metadata.exception; + +public class OperationContextException extends RuntimeException { + public OperationContextException(String message) { + super(message); + } + + public OperationContextException() {} +} diff --git a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java index 42de6b7398c616..4abfbb196f067c 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java @@ -8,21 +8,17 @@ import com.linkedin.common.urn.Urn; import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; import com.linkedin.entity.Aspect; +import com.linkedin.identity.CorpUserInfo; +import com.linkedin.metadata.Constants; import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; -import com.linkedin.metadata.aspect.SystemAspect; -import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; import com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.models.registry.ConfigEntityRegistry; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.models.registry.EntityRegistryException; import com.linkedin.metadata.models.registry.MergedEntityRegistry; import com.linkedin.metadata.models.registry.SnapshotEntityRegistry; -import com.linkedin.metadata.query.filter.Filter; -import com.linkedin.metadata.query.filter.RelationshipFilter; -import com.linkedin.metadata.query.filter.SortCriterion; -import com.linkedin.metadata.search.ScrollResult; -import com.linkedin.metadata.search.SearchEntityArray; import com.linkedin.metadata.snapshot.Snapshot; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; @@ -32,15 +28,14 @@ import io.datahubproject.metadata.context.RetrieverContext; import io.datahubproject.metadata.context.ServicesRegistryContext; import io.datahubproject.metadata.context.ValidationContext; -import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.function.Consumer; import java.util.function.Supplier; 
+import java.util.stream.Collectors; import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.Builder; /** * Useful for testing. If the defaults are not sufficient, try using the .toBuilder() and replacing @@ -81,26 +76,53 @@ public static EntityRegistry defaultEntityRegistry() { return defaultEntityRegistryInstance; } - public static AspectRetriever emptyAspectRetriever( + public static RetrieverContext emptyActiveUsersRetrieverContext( @Nullable Supplier entityRegistrySupplier) { - return new EmptyAspectRetriever( - () -> - Optional.ofNullable(entityRegistrySupplier) - .map(Supplier::get) - .orElse(defaultEntityRegistry())); - } - public static GraphRetriever emptyGraphRetriever = new EmptyGraphRetriever(); - public static SearchRetriever emptySearchRetriever = new EmptySearchRetriever(); + return RetrieverContext.builder() + .cachingAspectRetriever(emptyActiveUsersAspectRetriever(entityRegistrySupplier)) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) + .build(); + } - public static RetrieverContext emptyRetrieverContext( + public static CachingAspectRetriever emptyActiveUsersAspectRetriever( @Nullable Supplier entityRegistrySupplier) { - return RetrieverContext.builder() - .aspectRetriever(emptyAspectRetriever(entityRegistrySupplier)) - .graphRetriever(emptyGraphRetriever) - .searchRetriever(emptySearchRetriever) - .build(); + return new CachingAspectRetriever.EmptyAspectRetriever() { + + @Nonnull + @Override + public Map> getLatestAspectObjects( + Set urns, Set aspectNames) { + if (urns.stream().allMatch(urn -> urn.toString().startsWith("urn:li:corpuser:")) + && aspectNames.contains(Constants.CORP_USER_KEY_ASPECT_NAME)) { + return urns.stream() + .map( + urn -> + Map.entry( + urn, + Map.of( + Constants.CORP_USER_KEY_ASPECT_NAME, + new Aspect( + new CorpUserInfo() + .setActive(true) + .setEmail(urn.getId()) + .setDisplayName(urn.getId()) + .data())))) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } + return super.getLatestAspectObjects(urns, aspectNames); + } + + @Nonnull + @Override + public EntityRegistry getEntityRegistry() { + return Optional.ofNullable(entityRegistrySupplier) + .map(Supplier::get) + .orElse(defaultEntityRegistry()); + } + }; } public static OperationContext systemContextNoSearchAuthorization( @@ -140,8 +162,10 @@ public static OperationContext systemContextNoSearchAuthorization( RetrieverContext retrieverContext = RetrieverContext.builder() .aspectRetriever(aspectRetriever) - .graphRetriever(emptyGraphRetriever) - .searchRetriever(emptySearchRetriever) + .cachingAspectRetriever( + emptyActiveUsersAspectRetriever(() -> aspectRetriever.getEntityRegistry())) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) .build(); return systemContextNoSearchAuthorization( () -> retrieverContext.getAspectRetriever().getEntityRegistry(), @@ -208,7 +232,7 @@ public static OperationContext systemContext( RetrieverContext retrieverContext = Optional.ofNullable(retrieverContextSupplier) .map(Supplier::get) - .orElse(emptyRetrieverContext(entityRegistrySupplier)); + .orElse(emptyActiveUsersRetrieverContext(entityRegistrySupplier)); EntityRegistry entityRegistry = Optional.ofNullable(entityRegistrySupplier) @@ -298,66 +322,5 @@ public static OperationContext userContextNoSearchAuthorization( .asSession(requestContext, Authorizer.EMPTY, TEST_USER_AUTH); } - @Builder - public static class EmptyAspectRetriever implements AspectRetriever { - private final Supplier 
entityRegistrySupplier; - - @Nonnull - @Override - public Map> getLatestAspectObjects( - Set urns, Set aspectNames) { - return Map.of(); - } - - @Nonnull - @Override - public Map> getLatestSystemAspects( - Map> urnAspectNames) { - return Map.of(); - } - - @Nonnull - @Override - public EntityRegistry getEntityRegistry() { - return entityRegistrySupplier.get(); - } - } - - public static class EmptyGraphRetriever implements GraphRetriever { - - @Nonnull - @Override - public RelatedEntitiesScrollResult scrollRelatedEntities( - @Nullable List sourceTypes, - @Nonnull Filter sourceEntityFilter, - @Nullable List destinationTypes, - @Nonnull Filter destinationEntityFilter, - @Nonnull List relationshipTypes, - @Nonnull RelationshipFilter relationshipFilter, - @Nonnull List sortCriterion, - @Nullable String scrollId, - int count, - @Nullable Long startTimeMillis, - @Nullable Long endTimeMillis) { - return new RelatedEntitiesScrollResult(0, 0, null, List.of()); - } - } - - public static class EmptySearchRetriever implements SearchRetriever { - - @Override - public ScrollResult scroll( - @Nonnull List entities, - @Nullable Filter filters, - @Nullable String scrollId, - int count) { - ScrollResult empty = new ScrollResult(); - empty.setEntities(new SearchEntityArray()); - empty.setNumEntities(0); - empty.setPageSize(0); - return empty; - } - } - private TestOperationContexts() {} } diff --git a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java index 3e092e20127ee5..f77b244d8f2d86 100644 --- a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java +++ b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java @@ -8,6 +8,7 @@ import com.datahub.authentication.Authentication; import com.datahub.plugins.auth.authorization.Authorizer; import com.linkedin.metadata.models.registry.EntityRegistry; +import io.datahubproject.test.metadata.context.TestOperationContexts; import org.testng.annotations.Test; public class OperationContextTest { @@ -25,7 +26,7 @@ public void testSystemPrivilegeEscalation() { mock(EntityRegistry.class), mock(ServicesRegistryContext.class), null, - mock(RetrieverContext.class), + TestOperationContexts.emptyActiveUsersRetrieverContext(null), mock(ValidationContext.class)); OperationContext opContext = diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java b/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java index 6724f35d840adb..a9871f1ed99482 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java @@ -145,7 +145,7 @@ public String generateAccessToken( _entityService.ingestProposal( systemOperationContext, AspectsBatchImpl.builder() - .mcps(List.of(proposal), auditStamp, systemOperationContext.getRetrieverContext().get()) + .mcps(List.of(proposal), auditStamp, systemOperationContext.getRetrieverContext()) .build(), false); diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index 9348416606d0a9..75b4c8e8b002f9 100644 --- 
a/metadata-service/configuration/src/main/resources/application.yaml +++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -522,12 +522,12 @@ cache: entityAspectTTLSeconds: # cache user aspects for 20s corpuser: - corpUserKey: 20 + corpUserKey: 300 # 5 min corpUserInfo: 20 corpUserEditableInfo: 20 - corpUserStatus: 20 + corpUserStatus: 300 # 5 min globalTags: 20 - status: 20 + status: 300 # 5 min corpUserCredentials: 20 corpUserSettings: 20 roleMembership: 20 diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java index f5235dc3682fce..3e2823591e168c 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java @@ -45,7 +45,8 @@ protected OperationContext javaSystemOperationContext( @Nonnull final SearchService searchService, @Qualifier("baseElasticSearchComponents") BaseElasticSearchComponentsFactory.BaseElasticSearchComponents components, - @Nonnull final ConfigurationProvider configurationProvider) { + @Nonnull final ConfigurationProvider configurationProvider, + @Qualifier("systemEntityClient") @Nonnull final SystemEntityClient systemEntityClient) { EntityServiceAspectRetriever entityServiceAspectRetriever = EntityServiceAspectRetriever.builder() @@ -53,6 +54,9 @@ protected OperationContext javaSystemOperationContext( .entityService(entityService) .build(); + EntityClientAspectRetriever entityClientAspectRetriever = + EntityClientAspectRetriever.builder().entityClient(systemEntityClient).build(); + SystemGraphRetriever systemGraphRetriever = SystemGraphRetriever.builder().graphService(graphService).build(); @@ -68,6 +72,7 @@ protected OperationContext javaSystemOperationContext( components.getIndexConvention(), RetrieverContext.builder() .aspectRetriever(entityServiceAspectRetriever) + .cachingAspectRetriever(entityClientAspectRetriever) .graphRetriever(systemGraphRetriever) .searchRetriever(searchServiceSearchRetriever) .build(), @@ -76,6 +81,7 @@ protected OperationContext javaSystemOperationContext( configurationProvider.getFeatureFlags().isAlternateMCPValidation()) .build()); + entityClientAspectRetriever.setSystemOperationContext(systemOperationContext); entityServiceAspectRetriever.setSystemOperationContext(systemOperationContext); systemGraphRetriever.setSystemOperationContext(systemOperationContext); searchServiceSearchRetriever.setSystemOperationContext(systemOperationContext); @@ -104,7 +110,7 @@ protected OperationContext restliSystemOperationContext( BaseElasticSearchComponentsFactory.BaseElasticSearchComponents components, @Nonnull final ConfigurationProvider configurationProvider) { - EntityClientAspectRetriever entityServiceAspectRetriever = + EntityClientAspectRetriever entityClientAspectRetriever = EntityClientAspectRetriever.builder().entityClient(systemEntityClient).build(); SystemGraphRetriever systemGraphRetriever = @@ -121,7 +127,7 @@ protected OperationContext restliSystemOperationContext( ServicesRegistryContext.builder().restrictedService(restrictedService).build(), components.getIndexConvention(), RetrieverContext.builder() - .aspectRetriever(entityServiceAspectRetriever) + .cachingAspectRetriever(entityClientAspectRetriever) .graphRetriever(systemGraphRetriever) 
.searchRetriever(searchServiceSearchRetriever) .build(), @@ -130,7 +136,7 @@ protected OperationContext restliSystemOperationContext( configurationProvider.getFeatureFlags().isAlternateMCPValidation()) .build()); - entityServiceAspectRetriever.setSystemOperationContext(systemOperationContext); + entityClientAspectRetriever.setSystemOperationContext(systemOperationContext); systemGraphRetriever.setSystemOperationContext(systemOperationContext); searchServiceSearchRetriever.setSystemOperationContext(systemOperationContext); diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java index 22ce06a5984ea6..c04dd25ccd4ac9 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java +++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java @@ -84,14 +84,14 @@ public void execute(@Nonnull OperationContext systemOperationContext) throws Exc .aspectName(DATA_PLATFORM_INSTANCE_ASPECT_NAME) .recordTemplate(dataPlatformInstance.get()) .auditStamp(aspectAuditStamp) - .build(systemOperationContext.getAspectRetrieverOpt().get())); + .build(systemOperationContext.getAspectRetriever())); } } _entityService.ingestAspects( systemOperationContext, AspectsBatchImpl.builder() - .retrieverContext(systemOperationContext.getRetrieverContext().get()) + .retrieverContext(systemOperationContext.getRetrieverContext()) .items(items) .build(), true, diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java index eb6bfe17ac198e..dac2879487469c 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java +++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java @@ -225,7 +225,7 @@ private void ingestPolicy( new AuditStamp() .setActor(Urn.createFromString(Constants.SYSTEM_ACTOR)) .setTime(System.currentTimeMillis()), - systemOperationContext.getRetrieverContext().get()) + systemOperationContext.getRetrieverContext()) .build(), false); } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/GlobalControllerExceptionHandler.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/GlobalControllerExceptionHandler.java similarity index 81% rename from metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/GlobalControllerExceptionHandler.java rename to metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/GlobalControllerExceptionHandler.java index ba0a426fa20e89..c756827cad56ba 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/GlobalControllerExceptionHandler.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/GlobalControllerExceptionHandler.java @@ -1,9 +1,11 @@ -package io.datahubproject.openapi; +package io.datahubproject.openapi.config; import com.linkedin.metadata.dao.throttle.APIThrottleException; +import io.datahubproject.metadata.exception.ActorAccessException; import io.datahubproject.openapi.exception.InvalidUrnException; import io.datahubproject.openapi.exception.UnauthorizedException; import 
java.util.Map; +import javax.annotation.PostConstruct; import lombok.extern.slf4j.Slf4j; import org.springframework.beans.ConversionNotSupportedException; import org.springframework.core.Ordered; @@ -19,6 +21,11 @@ @ControllerAdvice public class GlobalControllerExceptionHandler extends DefaultHandlerExceptionResolver { + @PostConstruct + public void init() { + log.info("GlobalControllerExceptionHandler initialized"); + } + public GlobalControllerExceptionHandler() { setOrder(Ordered.HIGHEST_PRECEDENCE); setWarnLogCategory(getClass().getName()); @@ -52,4 +59,9 @@ public static ResponseEntity> handleUnauthorizedException( UnauthorizedException e) { return new ResponseEntity<>(Map.of("error", e.getMessage()), HttpStatus.FORBIDDEN); } + + @ExceptionHandler(ActorAccessException.class) + public static ResponseEntity> actorAccessException(ActorAccessException e) { + return new ResponseEntity<>(Map.of("error", e.getMessage()), HttpStatus.FORBIDDEN); + } } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java index 579a62c084999a..592d7bba4211fe 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java @@ -637,7 +637,7 @@ public ResponseEntity createAspect( AspectSpec aspectSpec = lookupAspectSpec(entitySpec, aspectName).get(); ChangeMCP upsert = toUpsertItem( - opContext.getRetrieverContext().get().getAspectRetriever(), + opContext.getRetrieverContext().getAspectRetriever(), urn, aspectSpec, createIfEntityNotExists, @@ -649,7 +649,7 @@ public ResponseEntity createAspect( entityService.ingestProposal( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(upsert)) .build(), async); @@ -725,7 +725,7 @@ public ResponseEntity patchAspect( .build(); ChangeMCP upsert = toUpsertItem( - opContext.getRetrieverContext().get().getAspectRetriever(), + opContext.getRetrieverContext().getAspectRetriever(), validatedUrn(entityUrn), aspectSpec, currentValue, @@ -736,7 +736,7 @@ public ResponseEntity patchAspect( entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(upsert)) .build(), true, diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/test/IdController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/test/IdController.java new file mode 100644 index 00000000000000..99d3879ab9a320 --- /dev/null +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/test/IdController.java @@ -0,0 +1,54 @@ +package io.datahubproject.openapi.operations.test; + +import com.datahub.authentication.Authentication; +import com.datahub.authentication.AuthenticationContext; +import com.datahub.authorization.AuthorizerChain; +import io.datahubproject.metadata.context.OperationContext; +import io.datahubproject.metadata.context.RequestContext; +import io.swagger.v3.oas.annotations.Operation; +import io.swagger.v3.oas.annotations.tags.Tag; +import jakarta.servlet.http.HttpServletRequest; +import 
java.util.List; +import java.util.Map; +import lombok.extern.slf4j.Slf4j; +import org.springframework.http.MediaType; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.GetMapping; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RequestParam; +import org.springframework.web.bind.annotation.RestController; + +@RestController +@RequestMapping("/operations/identity") +@Slf4j +@Tag(name = "Identity", description = "An API for checking identity") +public class IdController { + private final AuthorizerChain authorizerChain; + private final OperationContext systemOperationContext; + + public IdController(OperationContext systemOperationContext, AuthorizerChain authorizerChain) { + this.systemOperationContext = systemOperationContext; + this.authorizerChain = authorizerChain; + } + + @Tag(name = "User") + @GetMapping(path = "/user/urn", produces = MediaType.APPLICATION_JSON_VALUE) + @Operation(summary = "User id") + public ResponseEntity> getUserId( + HttpServletRequest request, + @RequestParam(value = "skipCache", required = false, defaultValue = "false") + Boolean skipCache) { + Authentication authentication = AuthenticationContext.getAuthentication(); + String actorUrnStr = authentication.getActor().toUrnStr(); + + OperationContext.asSession( + systemOperationContext, + RequestContext.builder().buildOpenapi(actorUrnStr, request, "getUserIdentity", List.of()), + authorizerChain, + authentication, + true, + skipCache); + + return ResponseEntity.ok(Map.of("urn", actorUrnStr)); + } +} diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java index c38f2db0eefbb3..ca425810c87a09 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java @@ -491,7 +491,7 @@ public static List> ingestBatchProposal( try { AspectsBatch batch = AspectsBatchImpl.builder() - .mcps(serviceProposals, auditStamp, opContext.getRetrieverContext().get()) + .mcps(serviceProposals, auditStamp, opContext.getRetrieverContext()) .build(); Map> resultMap = diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java index 56a7955b9fe871..b1c5709ef01470 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java @@ -203,7 +203,7 @@ protected AspectsBatch toMCPBatch( objectMapper.writeValueAsString(aspect.getValue().get("systemMetadata")))); } - items.add(builder.build(opContext.getAspectRetrieverOpt().get())); + items.add(builder.build(opContext.getAspectRetriever())); } } } @@ -211,7 +211,7 @@ protected AspectsBatch toMCPBatch( return AspectsBatchImpl.builder() .items(items) - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .build(); } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java 
b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java index ce7fd73f99b9e5..af13cd3aab0510 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java @@ -554,14 +554,14 @@ protected AspectsBatch toMCPBatch( GenericRecordUtils.JSON, aspectSpec)); - items.add(builder.build(opContext.getRetrieverContext().get().getAspectRetriever())); + items.add(builder.build(opContext.getRetrieverContext().getAspectRetriever())); } } } } return AspectsBatchImpl.builder() .items(items) - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .build(); } diff --git a/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json b/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json index 33cfba0f27802c..27731af9ffaa71 100644 --- a/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json +++ b/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json @@ -19,6 +19,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + "optional" : true } ] }, { "method" : "batch_get", @@ -27,6 +31,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + "optional" : true } ] } ], "entity" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json index 9bf7f97b34be18..9c5f41281fcfbe 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json @@ -182,6 +182,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + "optional" : true } ] }, { "method" : "batch_get", @@ -190,6 +194,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + "optional" : true } ] } ], "entity" : { diff --git a/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java b/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java index cf6e571cb8cbeb..b85f22e781d0b0 100644 --- a/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java +++ b/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java @@ -45,12 +45,34 @@ // Consider renaming this to datahub client. 
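// A minimal sketch of how the new alwaysIncludeKeyAspect flag added below might be used
// (assumed caller code and example entity/aspect names, not part of this interface):
//
//   EntityResponse response =
//       entityClient.getV2(opContext, "dataset", datasetUrn, Set.of("status"), false);
//   Map<Urn, EntityResponse> responses =
//       entityClient.batchGetV2(opContext, "dataset", Set.of(datasetUrn), null, false);
//
// Passing false returns only aspects that actually exist; the existing 4-argument
// overloads delegate with true, preserving the legacy behavior of always returning the
// key aspect.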
public interface EntityClient { + /** + * This version follows the legacy behavior of returning key aspects regardless of whether they + * exist + * + * @param opContext operation context + * @param entityName entity type + * @param urn urn id for the entity + * @param aspectNames set of aspects + * @return requested entity/aspects + */ + @Deprecated @Nullable - EntityResponse getV2( + default EntityResponse getV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Urn urn, @Nullable final Set aspectNames) + throws RemoteInvocationException, URISyntaxException { + return getV2(opContext, entityName, urn, aspectNames, true); + } + + @Nullable + EntityResponse getV2( + @Nonnull OperationContext opContext, + @Nonnull String entityName, + @Nonnull final Urn urn, + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException; @Nonnull @@ -58,12 +80,34 @@ EntityResponse getV2( Entity get(@Nonnull OperationContext opContext, @Nonnull final Urn urn) throws RemoteInvocationException; + /** + * This version follows the legacy behavior of returning key aspects regardless of whether they + * exist + * + * @param opContext operation context + * @param entityName entity type + * @param urns urn ids for the entities + * @param aspectNames set of aspects + * @return requested entity/aspects + */ + @Deprecated @Nonnull - Map batchGetV2( + default Map batchGetV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Set urns, @Nullable final Set aspectNames) + throws RemoteInvocationException, URISyntaxException { + return batchGetV2(opContext, entityName, urns, aspectNames, true); + } + + @Nonnull + Map batchGetV2( + @Nonnull OperationContext opContext, + @Nonnull String entityName, + @Nonnull final Set urns, + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException; @Nonnull @@ -589,27 +633,38 @@ void rollbackIngestion( @Nullable default Aspect getLatestAspectObject( - @Nonnull OperationContext opContext, @Nonnull Urn urn, @Nonnull String aspectName) + @Nonnull OperationContext opContext, + @Nonnull Urn urn, + @Nonnull String aspectName, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { - return getLatestAspects(opContext, Set.of(urn), Set.of(aspectName)) + return getLatestAspects(opContext, Set.of(urn), Set.of(aspectName), alwaysIncludeKeyAspect) .getOrDefault(urn, Map.of()) .get(aspectName); } @Nonnull default Map> getLatestAspects( - @Nonnull OperationContext opContext, @Nonnull Set urns, @Nonnull Set aspectNames) + @Nonnull OperationContext opContext, + @Nonnull Set urns, + @Nonnull Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { String entityName = urns.stream().findFirst().map(Urn::getEntityType).get(); - return entityResponseToAspectMap(batchGetV2(opContext, entityName, urns, aspectNames)); + return entityResponseToAspectMap( + batchGetV2(opContext, entityName, urns, aspectNames, alwaysIncludeKeyAspect)); } @Nonnull default Map> getLatestSystemAspect( - @Nonnull OperationContext opContext, @Nonnull Set urns, @Nonnull Set aspectNames) + @Nonnull OperationContext opContext, + @Nonnull Set urns, + @Nonnull Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { String entityName = 
urns.stream().findFirst().map(Urn::getEntityType).get(); return entityResponseToSystemAspectMap( - batchGetV2(opContext, entityName, urns, aspectNames), opContext.getEntityRegistry()); + batchGetV2(opContext, entityName, urns, aspectNames, alwaysIncludeKeyAspect), + opContext.getEntityRegistry()); } } diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java index 516902601f08a1..8d4c5e9228a71c 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java @@ -156,10 +156,15 @@ public EntityResponse getV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Urn urn, - @Nullable final Set aspectNames) + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { final EntitiesV2GetRequestBuilder requestBuilder = - ENTITIES_V2_REQUEST_BUILDERS.get().aspectsParam(aspectNames).id(urn.toString()); + ENTITIES_V2_REQUEST_BUILDERS + .get() + .aspectsParam(aspectNames) + .id(urn.toString()) + .alwaysIncludeKeyAspectParam(alwaysIncludeKeyAspect); return sendClientRequest(requestBuilder, opContext.getSessionAuthentication()).getEntity(); } @@ -241,7 +246,8 @@ public Map batchGetV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Set urns, - @Nullable final Set aspectNames) + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { Map responseMap = new HashMap<>(); @@ -260,6 +266,7 @@ public Map batchGetV2( ENTITIES_V2_REQUEST_BUILDERS .batchGet() .aspectsParam(aspectNames) + .alwaysIncludeKeyAspectParam(alwaysIncludeKeyAspect) .ids( batch.stream() .map(Urn::toString) diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java index 2637e2d067c6d5..aa17f1951bc912 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java @@ -59,6 +59,6 @@ public Map batchGetV2NoCache( @Nonnull Set urns, @Nullable Set aspectNames) throws RemoteInvocationException, URISyntaxException { - return super.batchGetV2(opContext, entityName, urns, aspectNames); + return super.batchGetV2(opContext, entityName, urns, aspectNames, false); } } diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java index 6033ead36f10ec..30b187da00e91a 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java @@ -309,7 +309,7 @@ private Task ingestProposals( log.debug("Proposals: {}", metadataChangeProposals); try { final AspectsBatch batch = AspectsBatchImpl.builder() - .mcps(metadataChangeProposals, auditStamp, opContext.getRetrieverContext().get(), + .mcps(metadataChangeProposals, 
auditStamp, opContext.getRetrieverContext(), opContext.getValidationContext().isAlternateValidation()) .build(); diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java index 20209ddf44d643..896d81d3cbecc3 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java @@ -64,7 +64,8 @@ public class EntityV2Resource extends CollectionResourceTaskTemplate get( - @Nonnull String urnStr, @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames) + @Nonnull String urnStr, @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames, + @QueryParam(PARAM_ALWAYS_INCLUDE_KEY_ASPECT) @Optional @Nullable Boolean alwaysIncludeKeyAspect) throws URISyntaxException { log.debug("GET V2 {}", urnStr); final Urn urn = Urn.createFromString(urnStr); @@ -90,7 +91,7 @@ public Task get( ? opContext.getEntityAspectNames(entityName) : new HashSet<>(Arrays.asList(aspectNames)); try { - return _entityService.getEntityV2(opContext, entityName, urn, projectedAspects); + return _entityService.getEntityV2(opContext, entityName, urn, projectedAspects, alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect); } catch (Exception e) { throw new RuntimeException( String.format( @@ -106,7 +107,8 @@ public Task get( @WithSpan public Task> batchGet( @Nonnull Set urnStrs, - @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames) + @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames, + @QueryParam(PARAM_ALWAYS_INCLUDE_KEY_ASPECT) @Optional @Nullable Boolean alwaysIncludeKeyAspect) throws URISyntaxException { log.debug("BATCH GET V2 {}", urnStrs.toString()); final Set urns = new HashSet<>(); @@ -138,7 +140,7 @@ public Task> batchGet( ? 
opContext.getEntityAspectNames(entityName) : new HashSet<>(Arrays.asList(aspectNames)); try { - return _entityService.getEntitiesV2(opContext, entityName, urns, projectedAspects); + return _entityService.getEntitiesV2(opContext, entityName, urns, projectedAspects, alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect); } catch (Exception e) { throw new RuntimeException( String.format( diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java index ef79a404c2145e..11df52ad66709e 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java @@ -1,5 +1,7 @@ package com.linkedin.metadata.resources.restli; +import javax.annotation.Nullable; + public final class RestliConstants { private RestliConstants() {} @@ -21,6 +23,7 @@ private RestliConstants() {} public static final String PARAM_INPUT = "input"; public static final String PARAM_MAX_HOPS = "maxHops"; public static final String PARAM_ASPECTS = "aspects"; + public static final String PARAM_ALWAYS_INCLUDE_KEY_ASPECT = "alwaysIncludeKeyAspect"; public static final String PARAM_FILTER = "filter"; public static final String PARAM_GROUP = "group"; public static final String PARAM_SORT = "sort"; diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java index 185874fac1382d..a2092405da3ff6 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java @@ -8,6 +8,7 @@ import com.linkedin.parseq.Task; import com.linkedin.restli.common.HttpStatus; import com.linkedin.restli.server.RestLiServiceException; +import io.datahubproject.metadata.exception.ActorAccessException; import java.util.Optional; import java.util.function.Supplier; import javax.annotation.Nonnull; @@ -38,6 +39,8 @@ public static Task toTask(@Nonnull Supplier supplier) { if (throwable instanceof IllegalArgumentException || throwable.getCause() instanceof IllegalArgumentException) { finalException = badRequestException(throwable.getMessage()); + } else if (throwable.getCause() instanceof ActorAccessException) { + finalException = forbidden(throwable.getCause().getMessage()); } else if (throwable instanceof APIThrottleException) { finalException = apiThrottled(throwable.getMessage()); } else if (throwable instanceof RestLiServiceException) { @@ -109,4 +112,9 @@ public static RestLiServiceException invalidArgumentsException(@Nullable String public static RestLiServiceException apiThrottled(@Nullable String message) { return new RestLiServiceException(HttpStatus.S_429_TOO_MANY_REQUESTS, message); } + + @Nonnull + public static RestLiServiceException forbidden(@Nullable String message) { + return new RestLiServiceException(HttpStatus.S_403_FORBIDDEN, message); + } } diff --git a/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java 
b/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java index a39401c170a114..037b5b81fd4df0 100644 --- a/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java +++ b/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java @@ -100,7 +100,7 @@ public void testAsyncDefaultAspects() throws URISyntaxException { .recordTemplate(mcp.getAspect()) .auditStamp(new AuditStamp()) .metadataChangeProposal(mcp) - .build(opContext.getAspectRetrieverOpt().get()); + .build(opContext.getAspectRetriever()); when(aspectDao.runInTransactionWithRetry(any(), any(), anyInt())) .thenReturn( List.of(List.of( diff --git a/smoke-test/tests/tokens/revokable_access_token_test.py b/smoke-test/tests/tokens/revokable_access_token_test.py index af29437c051e19..006daae39333ed 100644 --- a/smoke-test/tests/tokens/revokable_access_token_test.py +++ b/smoke-test/tests/tokens/revokable_access_token_test.py @@ -9,6 +9,8 @@ wait_for_writes_to_sync, ) +from .token_utils import listUsers, removeUser + pytestmark = pytest.mark.no_cypress_suite1 # Disable telemetry @@ -490,45 +492,3 @@ def getAccessTokenMetadata(session, token): response.raise_for_status() return response.json() - - -def removeUser(session, urn): - # Remove user - json = { - "query": """mutation removeUser($urn: String!) { - removeUser(urn: $urn) - }""", - "variables": {"urn": urn}, - } - - response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json) - - response.raise_for_status() - return response.json() - - -def listUsers(session): - input = { - "start": "0", - "count": "20", - } - - # list users - json = { - "query": """query listUsers($input: ListUsersInput!) { - listUsers(input: $input) { - start - count - total - users { - username - } - } - }""", - "variables": {"input": input}, - } - - response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json) - - response.raise_for_status() - return response.json() diff --git a/smoke-test/tests/tokens/session_access_token_test.py b/smoke-test/tests/tokens/session_access_token_test.py new file mode 100644 index 00000000000000..a16abc44453036 --- /dev/null +++ b/smoke-test/tests/tokens/session_access_token_test.py @@ -0,0 +1,173 @@ +import os +import time + +import pytest +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.metadata.schema_classes import AuditStampClass, CorpUserStatusClass +from requests.exceptions import HTTPError + +from tests.utils import ( + get_admin_credentials, + get_frontend_url, + login_as, + wait_for_writes_to_sync, +) + +from .token_utils import getUserId, listUsers, removeUser + +pytestmark = pytest.mark.no_cypress_suite1 + +# Disable telemetry +os.environ["DATAHUB_TELEMETRY_ENABLED"] = "false" + +(admin_user, admin_pass) = get_admin_credentials() +user_urn = "urn:li:corpuser:sessionUser" + + +@pytest.fixture(scope="class") +def custom_user_session(): + """Fixture to execute setup before and tear down after all tests are run""" + admin_session = login_as(admin_user, admin_pass) + + res_data = removeUser(admin_session, user_urn) + assert res_data + assert "error" not in res_data + + # Test getting the invite token + get_invite_token_json = { + "query": """query getInviteToken($input: GetInviteTokenInput!) 
{
+            getInviteToken(input: $input){
+              inviteToken
+            }
+        }""",
+        "variables": {"input": {}},
+    }
+
+    get_invite_token_response = admin_session.post(
+        f"{get_frontend_url()}/api/v2/graphql", json=get_invite_token_json
+    )
+    get_invite_token_response.raise_for_status()
+    get_invite_token_res_data = get_invite_token_response.json()
+
+    assert get_invite_token_res_data
+    assert get_invite_token_res_data["data"]
+    invite_token = get_invite_token_res_data["data"]["getInviteToken"]["inviteToken"]
+    assert invite_token is not None
+    assert "error" not in invite_token
+
+    # Pass the invite token when creating the user
+    sign_up_json = {
+        "fullName": "Test Session User",
+        "email": "sessionUser",
+        "password": "sessionUser",
+        "title": "Date Engineer",
+        "inviteToken": invite_token,
+    }
+
+    sign_up_response = admin_session.post(
+        f"{get_frontend_url()}/signUp", json=sign_up_json
+    )
+    sign_up_response.raise_for_status()
+    assert sign_up_response
+    assert "error" not in sign_up_response
+    # Sleep for eventual consistency
+    wait_for_writes_to_sync()
+
+    # signUp will override the session cookie to the new user to be signed up.
+    admin_session.cookies.clear()
+    admin_session = login_as(admin_user, admin_pass)
+
+    # Make sure the created user is there.
+    res_data = listUsers(admin_session)
+    assert res_data["data"]
+    assert res_data["data"]["listUsers"]
+    assert {"username": "sessionUser"} in res_data["data"]["listUsers"]["users"]
+
+    yield login_as(sign_up_json["email"], sign_up_json["password"])
+
+    # Delete created user
+    res_data = removeUser(admin_session, user_urn)
+    assert res_data
+    assert res_data["data"]
+    assert res_data["data"]["removeUser"] is True
+    # Sleep for eventual consistency
+    wait_for_writes_to_sync()
+
+    # Make sure the created user is not there.
+ res_data = listUsers(admin_session) + assert res_data["data"] + assert res_data["data"]["listUsers"] + assert {"username": "sessionUser"} not in res_data["data"]["listUsers"]["users"] + + +@pytest.mark.dependency() +def test_soft_delete(graph_client, custom_user_session): + # assert initial access + assert getUserId(custom_user_session) == {"urn": user_urn} + + graph_client.soft_delete_entity(urn=user_urn) + wait_for_writes_to_sync() + + with pytest.raises(HTTPError) as req_info: + getUserId(custom_user_session) + assert "403 Client Error: Forbidden" in str(req_info.value) + + # undo soft delete + graph_client.set_soft_delete_status(urn=user_urn, delete=False) + wait_for_writes_to_sync() + + +@pytest.mark.dependency(depends=["test_soft_delete"]) +def test_suspend(graph_client, custom_user_session): + # assert initial access + assert getUserId(custom_user_session) == {"urn": user_urn} + + graph_client.emit( + MetadataChangeProposalWrapper( + entityType="corpuser", + entityUrn=user_urn, + changeType="UPSERT", + aspectName="corpUserStatus", + aspect=CorpUserStatusClass( + status="SUSPENDED", + lastModified=AuditStampClass( + time=int(time.time() * 1000.0), actor="urn:li:corpuser:unknown" + ), + ), + ) + ) + wait_for_writes_to_sync() + + with pytest.raises(HTTPError) as req_info: + getUserId(custom_user_session) + assert "403 Client Error: Forbidden" in str(req_info.value) + + # undo suspend + graph_client.emit( + MetadataChangeProposalWrapper( + entityType="corpuser", + entityUrn=user_urn, + changeType="UPSERT", + aspectName="corpUserStatus", + aspect=CorpUserStatusClass( + status="ACTIVE", + lastModified=AuditStampClass( + time=int(time.time() * 1000.0), actor="urn:li:corpuser:unknown" + ), + ), + ) + ) + wait_for_writes_to_sync() + + +@pytest.mark.dependency(depends=["test_suspend"]) +def test_hard_delete(graph_client, custom_user_session): + # assert initial access + assert getUserId(custom_user_session) == {"urn": user_urn} + + graph_client.hard_delete_entity(urn=user_urn) + wait_for_writes_to_sync() + + with pytest.raises(HTTPError) as req_info: + getUserId(custom_user_session) + assert "403 Client Error: Forbidden" in str(req_info.value) diff --git a/smoke-test/tests/tokens/token_utils.py b/smoke-test/tests/tokens/token_utils.py new file mode 100644 index 00000000000000..10558e7085de72 --- /dev/null +++ b/smoke-test/tests/tokens/token_utils.py @@ -0,0 +1,53 @@ +from tests.utils import get_frontend_url + + +def getUserId(session): + response = session.get( + f"{get_frontend_url()}/openapi/operations/identity/user/urn", + params={"skipCache": "true"}, + ) + + response.raise_for_status() + return response.json() + + +def removeUser(session, urn): + # Remove user + json = { + "query": """mutation removeUser($urn: String!) { + removeUser(urn: $urn) + }""", + "variables": {"urn": urn}, + } + + response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json) + + response.raise_for_status() + return response.json() + + +def listUsers(session): + input = { + "start": "0", + "count": "20", + } + + # list users + json = { + "query": """query listUsers($input: ListUsersInput!) 
{ + listUsers(input: $input) { + start + count + total + users { + username + } + } + }""", + "variables": {"input": input}, + } + + response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json) + + response.raise_for_status() + return response.json() From 83904b7f351c9ea8b9ac7737892b2b21caedb720 Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Wed, 18 Dec 2024 17:02:16 -0500 Subject: [PATCH 02/35] fix(env) Fix forms hook env var default config (#12155) --- .../configuration/src/main/resources/application.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index 75b4c8e8b002f9..9010d77015f16c 100644 --- a/metadata-service/configuration/src/main/resources/application.yaml +++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -561,7 +561,7 @@ springdoc.api-docs.groups.enabled: true forms: hook: - enabled: { $FORMS_HOOK_ENABLED:true } + enabled: ${FORMS_HOOK_ENABLED:true} consumerGroupSuffix: ${FORMS_HOOK_CONSUMER_GROUP_SUFFIX:} businessAttribute: From da8f8221977444644596da40e676e15362bd7a2d Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Wed, 18 Dec 2024 14:36:10 -0800 Subject: [PATCH 03/35] feat(ingest/mlflow): Support configurable base_external_url (#12167) --- .../src/datahub/ingestion/source/mlflow.py | 35 ++++++++++++++++--- .../tests/unit/test_mlflow_source.py | 13 +++++++ 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/mlflow.py b/metadata-ingestion/src/datahub/ingestion/source/mlflow.py index cef6d2b1bb5774..26d160acf330cf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mlflow.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mlflow.py @@ -38,16 +38,30 @@ class MLflowConfig(EnvConfigMixin): tracking_uri: Optional[str] = Field( default=None, - description="Tracking server URI. If not set, an MLflow default tracking_uri is used (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)", + description=( + "Tracking server URI. If not set, an MLflow default tracking_uri is used" + " (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)" + ), ) registry_uri: Optional[str] = Field( default=None, - description="Registry server URI. If not set, an MLflow default registry_uri is used (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)", + description=( + "Registry server URI. If not set, an MLflow default registry_uri is used" + " (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)" + ), ) model_name_separator: str = Field( default="_", description="A string which separates model name from its version (e.g. model_1 or model-1)", ) + base_external_url: Optional[str] = Field( + default=None, + description=( + "Base URL to use when constructing external URLs to MLflow." + " If not set, tracking_uri is used if it's an HTTP URL." + " If neither is set, external URLs are not generated." 
+ ), + ) @dataclass @@ -279,12 +293,23 @@ def _make_ml_model_urn(self, model_version: ModelVersion) -> str: ) return urn - def _make_external_url(self, model_version: ModelVersion) -> Union[None, str]: + def _get_base_external_url_from_tracking_uri(self) -> Optional[str]: + if isinstance( + self.client.tracking_uri, str + ) and self.client.tracking_uri.startswith("http"): + return self.client.tracking_uri + else: + return None + + def _make_external_url(self, model_version: ModelVersion) -> Optional[str]: """ Generate URL for a Model Version to MLflow UI. """ - base_uri = self.client.tracking_uri - if base_uri.startswith("http"): + base_uri = ( + self.config.base_external_url + or self._get_base_external_url_from_tracking_uri() + ) + if base_uri: return f"{base_uri.rstrip('/')}/#/models/{model_version.name}/versions/{model_version.version}" else: return None diff --git a/metadata-ingestion/tests/unit/test_mlflow_source.py b/metadata-ingestion/tests/unit/test_mlflow_source.py index d213dd92352e62..e882296b6f331d 100644 --- a/metadata-ingestion/tests/unit/test_mlflow_source.py +++ b/metadata-ingestion/tests/unit/test_mlflow_source.py @@ -136,3 +136,16 @@ def test_make_external_link_remote(source, model_version): url = source._make_external_url(model_version) assert url == expected_url + + +def test_make_external_link_remote_via_config(source, model_version): + custom_base_url = "https://custom-server.org" + source.config.base_external_url = custom_base_url + source.client = MlflowClient( + tracking_uri="https://dummy-mlflow-tracking-server.org" + ) + expected_url = f"{custom_base_url}/#/models/{model_version.name}/versions/{model_version.version}" + + url = source._make_external_url(model_version) + + assert url == expected_url From 4392d72456faae5f0f59eb09756287182feec56b Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 18 Dec 2024 20:29:34 -0500 Subject: [PATCH 04/35] fix(cli/properties): fix data type validation (#12170) --- .../structuredproperties.py | 28 +++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py index e37281dea86e1f..619f69b016262d 100644 --- a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py +++ b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py @@ -14,7 +14,7 @@ PropertyValueClass, StructuredPropertyDefinitionClass, ) -from datahub.metadata.urns import StructuredPropertyUrn, Urn +from datahub.metadata.urns import DataTypeUrn, StructuredPropertyUrn, Urn from datahub.utilities.urns._urn_base import URN_TYPES logging.basicConfig(level=logging.INFO) @@ -86,19 +86,31 @@ class StructuredProperties(ConfigModel): @validator("type") def validate_type(cls, v: str) -> str: - # Convert to lowercase if needed - if not v.islower(): + # This logic is somewhat hacky, since we need to deal with + # 1. fully qualified urns + # 2. raw data types, that need to get the datahub namespace prefix + # While keeping the user-facing interface and error messages clean. + + if not v.startswith("urn:li:") and not v.islower(): + # Convert to lowercase if needed + v = v.lower() logger.warning( - f"Structured property type should be lowercase. Updated to {v.lower()}" + f"Structured property type should be lowercase. 
Updated to {v}" ) - v = v.lower() + + urn = Urn.make_data_type_urn(v) # Check if type is allowed - if not AllowedTypes.check_allowed_type(v): + data_type_urn = DataTypeUrn.from_string(urn) + unqualified_data_type = data_type_urn.id + if unqualified_data_type.startswith("datahub."): + unqualified_data_type = unqualified_data_type[len("datahub.") :] + if not AllowedTypes.check_allowed_type(unqualified_data_type): raise ValueError( - f"Type {v} is not allowed. Allowed types are {AllowedTypes.values()}" + f"Type {unqualified_data_type} is not allowed. Allowed types are {AllowedTypes.values()}" ) - return v + + return urn @property def fqn(self) -> str: From 48f3cc578589c5c0379d5117756f01a0228669b4 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Wed, 18 Dec 2024 21:53:20 -0600 Subject: [PATCH 05/35] fix(pgsql): Postgres doesn't support UNION select with FOR UPDATE (#12169) --- .../metadata/entity/ebean/EbeanAspectDao.java | 87 ++++++++++++++++++- .../metadata/config/EbeanConfiguration.java | 1 + .../src/main/resources/application.yaml | 1 + 3 files changed, 85 insertions(+), 4 deletions(-) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java index bd6cc67561b883..ea580a97c51886 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java @@ -93,8 +93,14 @@ public class EbeanAspectDao implements AspectDao, AspectMigrationsDao { */ private final LoadingCache locks; + private final String batchGetMethod; + public EbeanAspectDao(@Nonnull final Database server, EbeanConfiguration ebeanConfiguration) { _server = server; + this.batchGetMethod = + ebeanConfiguration.getBatchGetMethod() != null + ? ebeanConfiguration.getBatchGetMethod() + : "IN"; if (ebeanConfiguration.getLocking().isEnabled()) { this.locks = CacheBuilder.newBuilder() @@ -371,23 +377,37 @@ private List batchGet( final int totalPageCount = QueryUtils.getTotalPageCount(keys.size(), keysCount); final List finalResult = - batchGetUnion(new ArrayList<>(keys), keysCount, position, forUpdate); + batchGetSelectString(new ArrayList<>(keys), keysCount, position, forUpdate); while (QueryUtils.hasMore(position, keysCount, totalPageCount)) { position += keysCount; final List oneStatementResult = - batchGetUnion(new ArrayList<>(keys), keysCount, position, forUpdate); + batchGetSelectString(new ArrayList<>(keys), keysCount, position, forUpdate); finalResult.addAll(oneStatementResult); } return finalResult; } + @Nonnull + private List batchGetSelectString( + @Nonnull final List keys, + final int keysCount, + final int position, + boolean forUpdate) { + + if (batchGetMethod.equals("IN")) { + return batchGetIn(keys, keysCount, position, forUpdate); + } + + return batchGetUnion(keys, keysCount, position, forUpdate); + } + /** * Builds a single SELECT statement for batch get, which selects one entity, and then can be * UNION'd with other SELECT statements. 
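 *
 * Illustrative, simplified shapes of the two batch-get strategies (not the exact SQL
 * assembled in this class):
 *   UNION:  one SELECT per key, e.g. SELECT ... WHERE urn = :urn0 AND aspect = :aspect0
 *           AND version = :version0, combined with the other per-key SELECTs and
 *           suffixed with FOR UPDATE when locking is requested
 *   IN:     a single SELECT ... WHERE (urn, aspect, version) IN
 *           ((:urn0, :aspect0, :version0), ...) FOR UPDATE
 * PostgreSQL rejects FOR UPDATE on a UNION query, which is why the IN-based strategy
 * (EBEAN_BATCH_GET_METHOD, default IN) was introduced.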
*/ - private String batchGetSelect( + private String batchGetSelectString( final int selectId, @Nonnull final String urn, @Nonnull final String aspect, @@ -434,7 +454,7 @@ private List batchGetUnion( final Map params = new HashMap<>(); for (int index = position; index < end; index++) { sb.append( - batchGetSelect( + batchGetSelectString( index - position, keys.get(index).getUrn(), keys.get(index).getAspect(), @@ -467,6 +487,65 @@ private List batchGetUnion( return query.findList(); } + @Nonnull + private List batchGetIn( + @Nonnull final List keys, + final int keysCount, + final int position, + boolean forUpdate) { + validateConnection(); + + // Build a single SELECT with IN clause using composite key comparison + // Query will look like: + // SELECT * FROM metadata_aspect WHERE (urn, aspect, version) IN + // (('urn0', 'aspect0', 0), ('urn1', 'aspect1', 1)) + final StringBuilder sb = new StringBuilder(); + sb.append( + "SELECT urn, aspect, version, metadata, systemMetadata, createdOn, createdBy, createdFor "); + sb.append("FROM metadata_aspect_v2 WHERE (urn, aspect, version) IN ("); + + final int end = Math.min(keys.size(), position + keysCount); + final Map params = new HashMap<>(); + + for (int index = position; index < end; index++) { + int paramIndex = index - position; + String urnParam = "urn" + paramIndex; + String aspectParam = "aspect" + paramIndex; + String versionParam = "version" + paramIndex; + + params.put(urnParam, keys.get(index).getUrn()); + params.put(aspectParam, keys.get(index).getAspect()); + params.put(versionParam, keys.get(index).getVersion()); + + sb.append("(:" + urnParam + ", :" + aspectParam + ", :" + versionParam + ")"); + + if (index != end - 1) { + sb.append(","); + } + } + + sb.append(")"); + + if (forUpdate) { + sb.append(" FOR UPDATE"); + } + + final RawSql rawSql = + RawSqlBuilder.parse(sb.toString()) + .columnMapping(EbeanAspectV2.URN_COLUMN, "key.urn") + .columnMapping(EbeanAspectV2.ASPECT_COLUMN, "key.aspect") + .columnMapping(EbeanAspectV2.VERSION_COLUMN, "key.version") + .create(); + + final Query query = _server.find(EbeanAspectV2.class).setRawSql(rawSql); + + for (Map.Entry param : params.entrySet()) { + query.setParameter(param.getKey(), param.getValue()); + } + + return query.findList(); + } + @Override @Nonnull public ListResult listUrns( diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java index 47b406e695a3fb..6eb31e14a2d3b0 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java @@ -23,6 +23,7 @@ public class EbeanConfiguration { private boolean autoCreateDdl; private boolean postgresUseIamAuth; private LockingConfiguration locking; + private String batchGetMethod; public static final EbeanConfiguration testDefault = EbeanConfiguration.builder().locking(LockingConfiguration.testDefault).build(); diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index 9010d77015f16c..b997bc108e4ba1 100644 --- a/metadata-service/configuration/src/main/resources/application.yaml +++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -164,6 +164,7 @@ ebean: waitTimeoutMillis: ${EBEAN_WAIT_TIMEOUT_MILLIS:1000} autoCreateDdl: 
${EBEAN_AUTOCREATE:false} postgresUseIamAuth: ${EBEAN_POSTGRES_USE_AWS_IAM_AUTH:false} + batchGetMethod: ${EBEAN_BATCH_GET_METHOD:IN} # Alternative UNION locking: enabled: ${EBEAN_LOCKING_ENABLED:false} durationSeconds: ${EBEAN_LOCKING_DURATION_SECONDS:60} From 953893cf2e72e71580b21bdfc12592fca572e13b Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Thu, 19 Dec 2024 12:39:47 +0530 Subject: [PATCH 06/35] refactor(ingest/kafka-connect): define interface for new connector impl (#12149) --- metadata-ingestion/setup.py | 2 +- .../ingestion/source/kafka/kafka_connect.py | 1468 ----------------- .../source/kafka_connect/__init__.py | 0 .../ingestion/source/kafka_connect/common.py | 202 +++ .../source/kafka_connect/kafka_connect.py | 367 +++++ .../source/kafka_connect/sink_connectors.py | 341 ++++ .../source/kafka_connect/source_connectors.py | 570 +++++++ 7 files changed, 1481 insertions(+), 1469 deletions(-) delete mode 100644 metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/kafka_connect/__init__.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 6334b3abbb8a01..c6994dd6d5aa65 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -741,7 +741,7 @@ "hive-metastore = datahub.ingestion.source.sql.hive_metastore:HiveMetastoreSource", "json-schema = datahub.ingestion.source.schema.json_schema:JsonSchemaSource", "kafka = datahub.ingestion.source.kafka.kafka:KafkaSource", - "kafka-connect = datahub.ingestion.source.kafka.kafka_connect:KafkaConnectSource", + "kafka-connect = datahub.ingestion.source.kafka_connect.kafka_connect:KafkaConnectSource", "ldap = datahub.ingestion.source.ldap:LDAPSource", "looker = datahub.ingestion.source.looker.looker_source:LookerDashboardSource", "lookml = datahub.ingestion.source.looker.lookml_source:LookMLSource", diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py deleted file mode 100644 index 23a99ccb310e13..00000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py +++ /dev/null @@ -1,1468 +0,0 @@ -import logging -import re -from dataclasses import dataclass, field -from typing import Dict, Iterable, List, Optional, Tuple - -import jpype -import jpype.imports -import requests -from pydantic.fields import Field -from sqlalchemy.engine.url import make_url - -import datahub.emitter.mce_builder as builder -import datahub.metadata.schema_classes as models -from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.configuration.source_common import ( - DatasetLineageProviderConfigBase, - PlatformInstanceConfigMixin, -) -from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.api.decorators import ( - SourceCapability, - SupportStatus, - capability, - config_class, - platform_name, - support_status, -) -from datahub.ingestion.api.source import 
MetadataWorkUnitProcessor, Source -from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( - get_platform_from_sqlalchemy_uri, -) -from datahub.ingestion.source.state.stale_entity_removal_handler import ( - StaleEntityRemovalHandler, - StaleEntityRemovalSourceReport, - StatefulStaleMetadataRemovalConfig, -) -from datahub.ingestion.source.state.stateful_ingestion_base import ( - StatefulIngestionConfigBase, - StatefulIngestionSourceBase, -) - -logger = logging.getLogger(__name__) - -KAFKA = "kafka" -SOURCE = "source" -SINK = "sink" -CONNECTOR_CLASS = "connector.class" - - -class ProvidedConfig(ConfigModel): - provider: str - path_key: str - value: str - - -class GenericConnectorConfig(ConfigModel): - connector_name: str - source_dataset: str - source_platform: str - - -class KafkaConnectSourceConfig( - PlatformInstanceConfigMixin, - DatasetLineageProviderConfigBase, - StatefulIngestionConfigBase, -): - # See the Connect REST Interface for details - # https://docs.confluent.io/platform/current/connect/references/restapi.html# - connect_uri: str = Field( - default="http://localhost:8083/", description="URI to connect to." - ) - username: Optional[str] = Field(default=None, description="Kafka Connect username.") - password: Optional[str] = Field(default=None, description="Kafka Connect password.") - cluster_name: Optional[str] = Field( - default="connect-cluster", description="Cluster to ingest from." - ) - # convert lineage dataset's urns to lowercase - convert_lineage_urns_to_lowercase: bool = Field( - default=False, - description="Whether to convert the urns of ingested lineage dataset to lowercase", - ) - connector_patterns: AllowDenyPattern = Field( - default=AllowDenyPattern.allow_all(), - description="regex patterns for connectors to filter for ingestion.", - ) - provided_configs: Optional[List[ProvidedConfig]] = Field( - default=None, description="Provided Configurations" - ) - connect_to_platform_map: Optional[Dict[str, Dict[str, str]]] = Field( - default=None, - description='Platform instance mapping when multiple instances for a platform is available. Entry for a platform should be in either `platform_instance_map` or `connect_to_platform_map`. e.g.`connect_to_platform_map: { "postgres-connector-finance-db": "postgres": "core_finance_instance" }`', - ) - platform_instance_map: Optional[Dict[str, str]] = Field( - default=None, - description='Platform instance mapping to use when constructing URNs. 
e.g.`platform_instance_map: { "hive": "warehouse" }`', - ) - generic_connectors: List[GenericConnectorConfig] = Field( - default=[], - description="Provide lineage graph for sources connectors other than Confluent JDBC Source Connector, Debezium Source Connector, and Mongo Source Connector", - ) - - stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None - - -@dataclass -class KafkaConnectSourceReport(StaleEntityRemovalSourceReport): - connectors_scanned: int = 0 - filtered: List[str] = field(default_factory=list) - - def report_connector_scanned(self, connector: str) -> None: - self.connectors_scanned += 1 - - def report_dropped(self, connector: str) -> None: - self.filtered.append(connector) - - -@dataclass -class KafkaConnectLineage: - """Class to store Kafka Connect lineage mapping, Each instance is potential DataJob""" - - source_platform: str - target_dataset: str - target_platform: str - job_property_bag: Optional[Dict[str, str]] = None - source_dataset: Optional[str] = None - - -@dataclass -class ConnectorManifest: - """Each instance is potential DataFlow""" - - name: str - type: str - config: Dict - tasks: Dict - url: Optional[str] = None - flow_property_bag: Optional[Dict[str, str]] = None - lineages: List[KafkaConnectLineage] = field(default_factory=list) - topic_names: Iterable[str] = field(default_factory=list) - - -def remove_prefix(text: str, prefix: str) -> str: - if text.startswith(prefix): - index = len(prefix) - return text[index:] - return text - - -def unquote( - string: str, leading_quote: str = '"', trailing_quote: Optional[str] = None -) -> str: - """ - If string starts and ends with a quote, unquote it - """ - trailing_quote = trailing_quote if trailing_quote else leading_quote - if string.startswith(leading_quote) and string.endswith(trailing_quote): - string = string[1:-1] - return string - - -def get_dataset_name( - database_name: Optional[str], - source_table: str, -) -> str: - if database_name: - dataset_name = database_name + "." + source_table - else: - dataset_name = source_table - - return dataset_name - - -def get_platform_instance( - config: KafkaConnectSourceConfig, connector_name: str, platform: str -) -> Optional[str]: - instance_name = None - if ( - config.connect_to_platform_map - and config.connect_to_platform_map.get(connector_name) - and config.connect_to_platform_map[connector_name].get(platform) - ): - instance_name = config.connect_to_platform_map[connector_name][platform] - if config.platform_instance_map and config.platform_instance_map.get(platform): - logger.warning( - f"Same source platform {platform} configured in both platform_instance_map and connect_to_platform_map." - "Will prefer connector specific platform instance from connect_to_platform_map." 
- ) - elif config.platform_instance_map and config.platform_instance_map.get(platform): - instance_name = config.platform_instance_map[platform] - logger.info( - f"Instance name assigned is: {instance_name} for Connector Name {connector_name} and platform {platform}" - ) - return instance_name - - -@dataclass -class ConfluentJDBCSourceConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, - connector_manifest: ConnectorManifest, - config: KafkaConnectSourceConfig, - report: KafkaConnectSourceReport, - ) -> None: - self.connector_manifest = connector_manifest - self.config = config - self.report = report - self._extract_lineages() - - REGEXROUTER = "org.apache.kafka.connect.transforms.RegexRouter" - KNOWN_TOPICROUTING_TRANSFORMS = [REGEXROUTER] - # https://kafka.apache.org/documentation/#connect_included_transformation - KAFKA_NONTOPICROUTING_TRANSFORMS = [ - "InsertField", - "InsertField$Key", - "InsertField$Value", - "ReplaceField", - "ReplaceField$Key", - "ReplaceField$Value", - "MaskField", - "MaskField$Key", - "MaskField$Value", - "ValueToKey", - "ValueToKey$Key", - "ValueToKey$Value", - "HoistField", - "HoistField$Key", - "HoistField$Value", - "ExtractField", - "ExtractField$Key", - "ExtractField$Value", - "SetSchemaMetadata", - "SetSchemaMetadata$Key", - "SetSchemaMetadata$Value", - "Flatten", - "Flatten$Key", - "Flatten$Value", - "Cast", - "Cast$Key", - "Cast$Value", - "HeadersFrom", - "HeadersFrom$Key", - "HeadersFrom$Value", - "TimestampConverter", - "Filter", - "InsertHeader", - "DropHeaders", - ] - # https://docs.confluent.io/platform/current/connect/transforms/overview.html - CONFLUENT_NONTOPICROUTING_TRANSFORMS = [ - "Drop", - "Drop$Key", - "Drop$Value", - "Filter", - "Filter$Key", - "Filter$Value", - "TombstoneHandler", - ] - KNOWN_NONTOPICROUTING_TRANSFORMS = ( - KAFKA_NONTOPICROUTING_TRANSFORMS - + [ - f"org.apache.kafka.connect.transforms.{t}" - for t in KAFKA_NONTOPICROUTING_TRANSFORMS - ] - + CONFLUENT_NONTOPICROUTING_TRANSFORMS - + [ - f"io.confluent.connect.transforms.{t}" - for t in CONFLUENT_NONTOPICROUTING_TRANSFORMS - ] - ) - - @dataclass - class JdbcParser: - db_connection_url: str - source_platform: str - database_name: str - topic_prefix: str - query: str - transforms: list - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> JdbcParser: - url = remove_prefix( - str(connector_manifest.config.get("connection.url")), "jdbc:" - ) - url_instance = make_url(url) - source_platform = get_platform_from_sqlalchemy_uri(str(url_instance)) - database_name = url_instance.database - assert database_name - db_connection_url = f"{url_instance.drivername}://{url_instance.host}:{url_instance.port}/{database_name}" - - topic_prefix = self.connector_manifest.config.get("topic.prefix", None) - - query = self.connector_manifest.config.get("query", None) - - transform_names = ( - self.connector_manifest.config.get("transforms", "").split(",") - if self.connector_manifest.config.get("transforms") - else [] - ) - - transforms = [] - for name in transform_names: - transform = {"name": name} - transforms.append(transform) - for key in self.connector_manifest.config.keys(): - if key.startswith(f"transforms.{name}."): - transform[ - key.replace(f"transforms.{name}.", "") - ] = self.connector_manifest.config[key] - - return self.JdbcParser( - db_connection_url, - source_platform, - database_name, - topic_prefix, - query, - transforms, - ) - - def default_get_lineages( - self, - topic_prefix: str, - database_name: 
str, - source_platform: str, - topic_names: Optional[Iterable[str]] = None, - include_source_dataset: bool = True, - ) -> List[KafkaConnectLineage]: - lineages: List[KafkaConnectLineage] = [] - if not topic_names: - topic_names = self.connector_manifest.topic_names - table_name_tuples: List[Tuple] = self.get_table_names() - for topic in topic_names: - # All good for NO_TRANSFORM or (SINGLE_TRANSFORM and KNOWN_NONTOPICROUTING_TRANSFORM) or (not SINGLE_TRANSFORM and all(KNOWN_NONTOPICROUTING_TRANSFORM)) - source_table: str = ( - remove_prefix(topic, topic_prefix) if topic_prefix else topic - ) - # include schema name for three-level hierarchies - if has_three_level_hierarchy(source_platform): - table_name_tuple: Tuple = next( - iter([t for t in table_name_tuples if t and t[-1] == source_table]), - (), - ) - if len(table_name_tuple) > 1: - source_table = f"{table_name_tuple[-2]}.{source_table}" - else: - include_source_dataset = False - self.report.warning( - "Could not find schema for table" - f"{self.connector_manifest.name} : {source_table}", - ) - dataset_name: str = get_dataset_name(database_name, source_table) - lineage = KafkaConnectLineage( - source_dataset=dataset_name if include_source_dataset else None, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - return lineages - - def get_table_names(self) -> List[Tuple]: - sep: str = "." - leading_quote_char: str = '"' - trailing_quote_char: str = leading_quote_char - - table_ids: List[str] = [] - if self.connector_manifest.tasks: - table_ids = ( - ",".join( - [ - task["config"].get("tables") - for task in self.connector_manifest.tasks - ] - ) - ).split(",") - quote_method = self.connector_manifest.config.get( - "quote.sql.identifiers", "always" - ) - if ( - quote_method == "always" - and table_ids - and table_ids[0] - and table_ids[-1] - ): - leading_quote_char = table_ids[0][0] - trailing_quote_char = table_ids[-1][-1] - # This will only work for single character quotes - elif self.connector_manifest.config.get("table.whitelist"): - table_ids = self.connector_manifest.config.get("table.whitelist").split(",") # type: ignore - - # List of Tuple containing (schema, table) - tables: List[Tuple] = [ - ( - ( - unquote( - table_id.split(sep)[-2], leading_quote_char, trailing_quote_char - ) - if len(table_id.split(sep)) > 1 - else "" - ), - unquote( - table_id.split(sep)[-1], leading_quote_char, trailing_quote_char - ), - ) - for table_id in table_ids - ] - return tables - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - source_platform = parser.source_platform - database_name = parser.database_name - query = parser.query - topic_prefix = parser.topic_prefix - transforms = parser.transforms - self.connector_manifest.flow_property_bag = self.connector_manifest.config - - # Mask/Remove properties that may reveal credentials - self.connector_manifest.flow_property_bag[ - "connection.url" - ] = parser.db_connection_url - if "connection.password" in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag["connection.password"] - if "connection.user" in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag["connection.user"] - - logging.debug( - f"Extracting source platform: {source_platform} and database name: {database_name} from connection url " - ) - - if not self.connector_manifest.topic_names: - self.connector_manifest.lineages 
= lineages - return - - if query: - # Lineage source_table can be extracted by parsing query - for topic in self.connector_manifest.topic_names: - # default method - as per earlier implementation - dataset_name: str = get_dataset_name(database_name, topic) - - lineage = KafkaConnectLineage( - source_dataset=None, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - self.report.warning( - "Could not find input dataset, the connector has query configuration set", - self.connector_manifest.name, - ) - self.connector_manifest.lineages = lineages - return - - SINGLE_TRANSFORM = len(transforms) == 1 - NO_TRANSFORM = len(transforms) == 0 - UNKNOWN_TRANSFORM = any( - [ - transform["type"] - not in self.KNOWN_TOPICROUTING_TRANSFORMS - + self.KNOWN_NONTOPICROUTING_TRANSFORMS - for transform in transforms - ] - ) - ALL_TRANSFORMS_NON_TOPICROUTING = all( - [ - transform["type"] in self.KNOWN_NONTOPICROUTING_TRANSFORMS - for transform in transforms - ] - ) - - if NO_TRANSFORM or ALL_TRANSFORMS_NON_TOPICROUTING: - self.connector_manifest.lineages = self.default_get_lineages( - database_name=database_name, - source_platform=source_platform, - topic_prefix=topic_prefix, - ) - return - - if SINGLE_TRANSFORM and transforms[0]["type"] == self.REGEXROUTER: - tables = self.get_table_names() - topic_names = list(self.connector_manifest.topic_names) - - from java.util.regex import Pattern - - for table in tables: - source_table: str = table[-1] - topic = topic_prefix + source_table if topic_prefix else source_table - - transform_regex = Pattern.compile(transforms[0]["regex"]) - transform_replacement = transforms[0]["replacement"] - - matcher = transform_regex.matcher(topic) - if matcher.matches(): - topic = str(matcher.replaceFirst(transform_replacement)) - - # Additional check to confirm that the topic present - # in connector topics - - if topic in self.connector_manifest.topic_names: - # include schema name for three-level hierarchies - if has_three_level_hierarchy(source_platform) and len(table) > 1: - source_table = f"{table[-2]}.{table[-1]}" - - dataset_name = get_dataset_name(database_name, source_table) - - lineage = KafkaConnectLineage( - source_dataset=dataset_name, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - topic_names.remove(topic) - lineages.append(lineage) - - if topic_names: - lineages.extend( - self.default_get_lineages( - database_name=database_name, - source_platform=source_platform, - topic_prefix=topic_prefix, - topic_names=topic_names, - include_source_dataset=False, - ) - ) - self.report.warning( - "Could not find input dataset for connector topics", - f"{self.connector_manifest.name} : {topic_names}", - ) - self.connector_manifest.lineages = lineages - return - else: - include_source_dataset = True - if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: - self.report.warning( - "Could not find input dataset, connector has unknown transform", - f"{self.connector_manifest.name} : {transforms[0]['type']}", - ) - include_source_dataset = False - if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: - self.report.warning( - "Could not find input dataset, connector has one or more unknown transforms", - self.connector_manifest.name, - ) - include_source_dataset = False - lineages = self.default_get_lineages( - database_name=database_name, - source_platform=source_platform, - topic_prefix=topic_prefix, - include_source_dataset=include_source_dataset, - ) - self.connector_manifest.lineages = lineages - 
return - - -@dataclass -class MongoSourceConnector: - # https://www.mongodb.com/docs/kafka-connector/current/source-connector/ - - connector_manifest: ConnectorManifest - - def __init__( - self, connector_manifest: ConnectorManifest, config: KafkaConnectSourceConfig - ) -> None: - self.connector_manifest = connector_manifest - self.config = config - self._extract_lineages() - - @dataclass - class MongoSourceParser: - db_connection_url: Optional[str] - source_platform: str - database_name: Optional[str] - topic_prefix: Optional[str] - transforms: List[str] - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> MongoSourceParser: - parser = self.MongoSourceParser( - db_connection_url=connector_manifest.config.get("connection.uri"), - source_platform="mongodb", - database_name=connector_manifest.config.get("database"), - topic_prefix=connector_manifest.config.get("topic_prefix"), - transforms=( - connector_manifest.config["transforms"].split(",") - if "transforms" in connector_manifest.config - else [] - ), - ) - - return parser - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - source_platform = parser.source_platform - topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)" - - if not self.connector_manifest.topic_names: - return lineages - - for topic in self.connector_manifest.topic_names: - found = re.search(re.compile(topic_naming_pattern), topic) - - if found: - table_name = get_dataset_name(found.group(1), found.group(2)) - - lineage = KafkaConnectLineage( - source_dataset=table_name, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - self.connector_manifest.lineages = lineages - - -@dataclass -class DebeziumSourceConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, - connector_manifest: ConnectorManifest, - config: KafkaConnectSourceConfig, - report: KafkaConnectSourceReport, - ) -> None: - self.connector_manifest = connector_manifest - self.config = config - self.report = report - self._extract_lineages() - - @dataclass - class DebeziumParser: - source_platform: str - server_name: Optional[str] - database_name: Optional[str] - - def get_server_name(self, connector_manifest: ConnectorManifest) -> str: - if "topic.prefix" in connector_manifest.config: - return connector_manifest.config["topic.prefix"] - else: - return connector_manifest.config.get("database.server.name", "") - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> DebeziumParser: - connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "") - - if connector_class == "io.debezium.connector.mysql.MySqlConnector": - parser = self.DebeziumParser( - source_platform="mysql", - server_name=self.get_server_name(connector_manifest), - database_name=None, - ) - elif connector_class == "MySqlConnector": - parser = self.DebeziumParser( - source_platform="mysql", - server_name=self.get_server_name(connector_manifest), - database_name=None, - ) - elif connector_class == "io.debezium.connector.mongodb.MongoDbConnector": - parser = self.DebeziumParser( - source_platform="mongodb", - server_name=self.get_server_name(connector_manifest), - database_name=None, - ) - elif connector_class == "io.debezium.connector.postgresql.PostgresConnector": - parser = self.DebeziumParser( - source_platform="postgres", - server_name=self.get_server_name(connector_manifest), - 
database_name=connector_manifest.config.get("database.dbname"), - ) - elif connector_class == "io.debezium.connector.oracle.OracleConnector": - parser = self.DebeziumParser( - source_platform="oracle", - server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("database.dbname"), - ) - elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector": - database_name = connector_manifest.config.get( - "database.names" - ) or connector_manifest.config.get("database.dbname") - - if "," in str(database_name): - raise Exception( - f"Only one database is supported for Debezium's SQL Server connector. Found: {database_name}" - ) - - parser = self.DebeziumParser( - source_platform="mssql", - server_name=self.get_server_name(connector_manifest), - database_name=database_name, - ) - elif connector_class == "io.debezium.connector.db2.Db2Connector": - parser = self.DebeziumParser( - source_platform="db2", - server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("database.dbname"), - ) - elif connector_class == "io.debezium.connector.vitess.VitessConnector": - parser = self.DebeziumParser( - source_platform="vitess", - server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("vitess.keyspace"), - ) - else: - raise ValueError(f"Connector class '{connector_class}' is unknown.") - - return parser - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - - try: - parser = self.get_parser(self.connector_manifest) - source_platform = parser.source_platform - server_name = parser.server_name - database_name = parser.database_name - topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)" - - if not self.connector_manifest.topic_names: - return lineages - - for topic in self.connector_manifest.topic_names: - found = re.search(re.compile(topic_naming_pattern), topic) - - if found: - table_name = get_dataset_name(database_name, found.group(2)) - - lineage = KafkaConnectLineage( - source_dataset=table_name, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - self.connector_manifest.lineages = lineages - except Exception as e: - self.report.warning( - "Error resolving lineage for connector", - self.connector_manifest.name, - exc=e, - ) - - return - - -@dataclass -class BigQuerySinkConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport - ) -> None: - self.connector_manifest = connector_manifest - self.report = report - self._extract_lineages() - - @dataclass - class BQParser: - project: str - target_platform: str - sanitizeTopics: str - transforms: list - topicsToTables: Optional[str] = None - datasets: Optional[str] = None - defaultDataset: Optional[str] = None - version: str = "v1" - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> BQParser: - project = connector_manifest.config["project"] - sanitizeTopics = connector_manifest.config.get("sanitizeTopics", "false") - transform_names = ( - self.connector_manifest.config.get("transforms", "").split(",") - if self.connector_manifest.config.get("transforms") - else [] - ) - transforms = [] - for name in transform_names: - transform = {"name": name} - transforms.append(transform) - for key in self.connector_manifest.config.keys(): - if key.startswith(f"transforms.{name}."): - transform[ - 
key.replace(f"transforms.{name}.", "") - ] = self.connector_manifest.config[key] - - if "defaultDataset" in connector_manifest.config: - defaultDataset = connector_manifest.config["defaultDataset"] - return self.BQParser( - project=project, - defaultDataset=defaultDataset, - target_platform="bigquery", - sanitizeTopics=sanitizeTopics.lower() == "true", - version="v2", - transforms=transforms, - ) - else: - # version 1.6.x and similar configs supported - datasets = connector_manifest.config["datasets"] - topicsToTables = connector_manifest.config.get("topicsToTables") - - return self.BQParser( - project=project, - topicsToTables=topicsToTables, - datasets=datasets, - target_platform="bigquery", - sanitizeTopics=sanitizeTopics.lower() == "true", - transforms=transforms, - ) - - def get_list(self, property: str) -> Iterable[Tuple[str, str]]: - entries = property.split(",") - for entry in entries: - key, val = entry.rsplit("=") - yield (key.strip(), val.strip()) - - def get_dataset_for_topic_v1(self, topic: str, parser: BQParser) -> Optional[str]: - topicregex_dataset_map: Dict[str, str] = dict(self.get_list(parser.datasets)) # type: ignore - from java.util.regex import Pattern - - for pattern, dataset in topicregex_dataset_map.items(): - patternMatcher = Pattern.compile(pattern).matcher(topic) - if patternMatcher.matches(): - return dataset - return None - - def sanitize_table_name(self, table_name): - table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name) - if re.match("^[^a-zA-Z_].*", table_name): - table_name = "_" + table_name - - return table_name - - def get_dataset_table_for_topic( - self, topic: str, parser: BQParser - ) -> Optional[str]: - if parser.version == "v2": - dataset = parser.defaultDataset - parts = topic.split(":") - if len(parts) == 2: - dataset = parts[0] - table = parts[1] - else: - table = parts[0] - else: - dataset = self.get_dataset_for_topic_v1(topic, parser) - if dataset is None: - return None - - table = topic - if parser.topicsToTables: - topicregex_table_map: Dict[str, str] = dict( - self.get_list(parser.topicsToTables) # type: ignore - ) - from java.util.regex import Pattern - - for pattern, tbl in topicregex_table_map.items(): - patternMatcher = Pattern.compile(pattern).matcher(topic) - if patternMatcher.matches(): - table = tbl - break - - if parser.sanitizeTopics: - table = self.sanitize_table_name(table) - return f"{dataset}.{table}" - - def apply_transformations( - self, topic: str, transforms: List[Dict[str, str]] - ) -> str: - for transform in transforms: - if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter": - regex = transform["regex"] - replacement = transform["replacement"] - pattern = re.compile(regex) - if pattern.match(topic): - topic = pattern.sub(replacement, topic, count=1) - return topic - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - if not parser: - return lineages - target_platform = parser.target_platform - project = parser.project - transforms = parser.transforms - self.connector_manifest.flow_property_bag = self.connector_manifest.config - # Mask/Remove properties that may reveal credentials - if "keyfile" in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag["keyfile"] - - for topic in self.connector_manifest.topic_names: - transformed_topic = self.apply_transformations(topic, transforms) - dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser) - if dataset_table is None: 
- self.report.warning( - "Could not find target dataset for topic, please check your connector configuration" - f"{self.connector_manifest.name} : {transformed_topic} ", - ) - continue - target_dataset = f"{project}.{dataset_table}" - - lineages.append( - KafkaConnectLineage( - source_dataset=transformed_topic, - source_platform=KAFKA, - target_dataset=target_dataset, - target_platform=target_platform, - ) - ) - self.connector_manifest.lineages = lineages - return - - -@dataclass -class SnowflakeSinkConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport - ) -> None: - self.connector_manifest = connector_manifest - self.report = report - self._extract_lineages() - - @dataclass - class SnowflakeParser: - database_name: str - schema_name: str - topics_to_tables: Dict[str, str] - - def get_table_name_from_topic_name(self, topic_name: str) -> str: - """ - This function converts the topic name to a valid Snowflake table name using some rules. - Refer below link for more info - https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics - """ - table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name) - if re.match("^[^a-zA-Z_].*", table_name): - table_name = "_" + table_name - # Connector may append original topic's hash code as suffix for conflict resolution - # if generated table names for 2 topics are similar. This corner case is not handled here. - # Note that Snowflake recommends to choose topic names that follow the rules for - # Snowflake identifier names so this case is not recommended by snowflake. - return table_name - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> SnowflakeParser: - database_name = connector_manifest.config["snowflake.database.name"] - schema_name = connector_manifest.config["snowflake.schema.name"] - - # Fetch user provided topic to table map - provided_topics_to_tables: Dict[str, str] = {} - if connector_manifest.config.get("snowflake.topic2table.map"): - for each in connector_manifest.config["snowflake.topic2table.map"].split( - "," - ): - topic, table = each.split(":") - provided_topics_to_tables[topic.strip()] = table.strip() - - topics_to_tables: Dict[str, str] = {} - # Extract lineage for only those topics whose data ingestion started - for topic in connector_manifest.topic_names: - if topic in provided_topics_to_tables: - # If user provided which table to get mapped with this topic - topics_to_tables[topic] = provided_topics_to_tables[topic] - else: - # Else connector converts topic name to a valid Snowflake table name. 
- topics_to_tables[topic] = self.get_table_name_from_topic_name(topic) - - return self.SnowflakeParser( - database_name=database_name, - schema_name=schema_name, - topics_to_tables=topics_to_tables, - ) - - def _extract_lineages(self): - self.connector_manifest.flow_property_bag = self.connector_manifest.config - - # For all snowflake sink connector properties, refer below link - # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector - # remove private keys, secrets from properties - secret_properties = [ - "snowflake.private.key", - "snowflake.private.key.passphrase", - "value.converter.basic.auth.user.info", - ] - for k in secret_properties: - if k in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag[k] - - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - - for topic, table in parser.topics_to_tables.items(): - target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}" - lineages.append( - KafkaConnectLineage( - source_dataset=topic, - source_platform=KAFKA, - target_dataset=target_dataset, - target_platform="snowflake", - ) - ) - - self.connector_manifest.lineages = lineages - return - - -@dataclass -class ConfluentS3SinkConnector: - connector_manifest: ConnectorManifest - - def __init__( - self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport - ) -> None: - self.connector_manifest = connector_manifest - self.report = report - self._extract_lineages() - - @dataclass - class S3SinkParser: - target_platform: str - bucket: str - topics_dir: str - topics: Iterable[str] - - def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser: - # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3 - bucket = connector_manifest.config.get("s3.bucket.name") - if not bucket: - raise ValueError( - "Could not find 's3.bucket.name' in connector configuration" - ) - - # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage - topics_dir = connector_manifest.config.get("topics.dir", "topics") - - return self.S3SinkParser( - target_platform="s3", - bucket=bucket, - topics_dir=topics_dir, - topics=connector_manifest.topic_names, - ) - - def _extract_lineages(self): - self.connector_manifest.flow_property_bag = self.connector_manifest.config - - # remove keys, secrets from properties - secret_properties = [ - "aws.access.key.id", - "aws.secret.access.key", - "s3.sse.customer.key", - "s3.proxy.password", - ] - for k in secret_properties: - if k in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag[k] - - try: - parser = self._get_parser(self.connector_manifest) - - lineages: List[KafkaConnectLineage] = list() - for topic in parser.topics: - target_dataset = f"{parser.bucket}/{parser.topics_dir}/{topic}" - - lineages.append( - KafkaConnectLineage( - source_dataset=topic, - source_platform="kafka", - target_dataset=target_dataset, - target_platform=parser.target_platform, - ) - ) - self.connector_manifest.lineages = lineages - except Exception as e: - self.report.warning( - "Error resolving lineage for connector", - self.connector_manifest.name, - exc=e, - ) - - return - - -def transform_connector_config( - connector_config: Dict, provided_configs: List[ProvidedConfig] -) -> None: - """This method will update provided configs in connector config values, if any""" - lookupsByProvider = {} - for pconfig in 
provided_configs: - lookupsByProvider[f"${{{pconfig.provider}:{pconfig.path_key}}}"] = pconfig.value - for k, v in connector_config.items(): - for key, value in lookupsByProvider.items(): - if key in v: - connector_config[k] = connector_config[k].replace(key, value) - - -@platform_name("Kafka Connect") -@config_class(KafkaConnectSourceConfig) -@support_status(SupportStatus.CERTIFIED) -@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") -@capability(SourceCapability.SCHEMA_METADATA, "Enabled by default") -@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") -class KafkaConnectSource(StatefulIngestionSourceBase): - config: KafkaConnectSourceConfig - report: KafkaConnectSourceReport - platform: str = "kafka-connect" - - def __init__(self, config: KafkaConnectSourceConfig, ctx: PipelineContext): - super().__init__(config, ctx) - self.config = config - self.report = KafkaConnectSourceReport() - self.session = requests.Session() - self.session.headers.update( - { - "Accept": "application/json", - "Content-Type": "application/json", - } - ) - - # Test the connection - if self.config.username is not None and self.config.password is not None: - logger.info( - f"Connecting to {self.config.connect_uri} with Authentication..." - ) - self.session.auth = (self.config.username, self.config.password) - - test_response = self.session.get(f"{self.config.connect_uri}/connectors") - test_response.raise_for_status() - logger.info(f"Connection to {self.config.connect_uri} is ok") - if not jpype.isJVMStarted(): - jpype.startJVM() - - @classmethod - def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: - config = KafkaConnectSourceConfig.parse_obj(config_dict) - return cls(config, ctx) - - def get_connectors_manifest(self) -> List[ConnectorManifest]: - """Get Kafka Connect connectors manifest using REST API. - Enrich with lineages metadata. 
- """ - connectors_manifest = list() - - connector_response = self.session.get( - f"{self.config.connect_uri}/connectors", - ) - - payload = connector_response.json() - - for connector_name in payload: - connector_url = f"{self.config.connect_uri}/connectors/{connector_name}" - connector_manifest = self._get_connector_manifest( - connector_name, connector_url - ) - if ( - connector_manifest is None - or not self.config.connector_patterns.allowed(connector_manifest.name) - ): - self.report.report_dropped(connector_name) - continue - - if self.config.provided_configs: - transform_connector_config( - connector_manifest.config, self.config.provided_configs - ) - # Initialize connector lineages - connector_manifest.lineages = list() - connector_manifest.url = connector_url - - connector_manifest.topic_names = self._get_connector_topics(connector_name) - - # Populate Source Connector metadata - if connector_manifest.type == SOURCE: - connector_manifest.tasks = self._get_connector_tasks(connector_name) - - # JDBC source connector lineages - if connector_manifest.config.get(CONNECTOR_CLASS).__eq__( - "io.confluent.connect.jdbc.JdbcSourceConnector" - ): - connector_manifest = ConfluentJDBCSourceConnector( - connector_manifest=connector_manifest, - config=self.config, - report=self.report, - ).connector_manifest - elif connector_manifest.config.get(CONNECTOR_CLASS, "").startswith( - "io.debezium.connector" - ): - connector_manifest = DebeziumSourceConnector( - connector_manifest=connector_manifest, - config=self.config, - report=self.report, - ).connector_manifest - elif ( - connector_manifest.config.get(CONNECTOR_CLASS, "") - == "com.mongodb.kafka.connect.MongoSourceConnector" - ): - connector_manifest = MongoSourceConnector( - connector_manifest=connector_manifest, config=self.config - ).connector_manifest - else: - # Find the target connector object in the list, or log an error if unknown. - target_connector = None - for connector in self.config.generic_connectors: - if connector.connector_name == connector_manifest.name: - target_connector = connector - break - if not target_connector: - logger.warning( - f"Detected undefined connector {connector_manifest.name}, which is not in the customized connector list. Please refer to Kafka Connect ingestion recipe to define this customized connector." - ) - continue - - for topic in connector_manifest.topic_names: - lineage = KafkaConnectLineage( - source_dataset=target_connector.source_dataset, - source_platform=target_connector.source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - - connector_manifest.lineages.append(lineage) - - if connector_manifest.type == SINK: - if connector_manifest.config.get(CONNECTOR_CLASS).__eq__( - "com.wepay.kafka.connect.bigquery.BigQuerySinkConnector" - ): - connector_manifest = BigQuerySinkConnector( - connector_manifest=connector_manifest, report=self.report - ).connector_manifest - elif connector_manifest.config.get("connector.class").__eq__( - "io.confluent.connect.s3.S3SinkConnector" - ): - connector_manifest = ConfluentS3SinkConnector( - connector_manifest=connector_manifest, report=self.report - ).connector_manifest - elif connector_manifest.config.get("connector.class").__eq__( - "com.snowflake.kafka.connector.SnowflakeSinkConnector" - ): - connector_manifest = SnowflakeSinkConnector( - connector_manifest=connector_manifest, report=self.report - ).connector_manifest - else: - self.report.report_dropped(connector_manifest.name) - logger.warning( - f"Skipping connector {connector_manifest.name}. 
Lineage for Connector not yet implemented" - ) - pass - - connectors_manifest.append(connector_manifest) - - return connectors_manifest - - def _get_connector_manifest( - self, connector_name: str, connector_url: str - ) -> Optional[ConnectorManifest]: - try: - connector_response = self.session.get(connector_url) - connector_response.raise_for_status() - except Exception as e: - self.report.warning( - "Failed to get connector details", connector_name, exc=e - ) - return None - manifest = connector_response.json() - connector_manifest = ConnectorManifest(**manifest) - return connector_manifest - - def _get_connector_tasks(self, connector_name: str) -> dict: - try: - response = self.session.get( - f"{self.config.connect_uri}/connectors/{connector_name}/tasks", - ) - response.raise_for_status() - except Exception as e: - self.report.warning( - "Error getting connector tasks", context=connector_name, exc=e - ) - return {} - - return response.json() - - def _get_connector_topics(self, connector_name: str) -> List[str]: - try: - response = self.session.get( - f"{self.config.connect_uri}/connectors/{connector_name}/topics", - ) - response.raise_for_status() - except Exception as e: - self.report.warning( - "Error getting connector topics", context=connector_name, exc=e - ) - return [] - - return response.json()[connector_name]["topics"] - - def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit: - connector_name = connector.name - connector_type = connector.type - connector_class = connector.config.get(CONNECTOR_CLASS) - flow_property_bag = connector.flow_property_bag - # connector_url = connector.url # NOTE: this will expose connector credential when used - flow_urn = builder.make_data_flow_urn( - self.platform, - connector_name, - self.config.env, - self.config.platform_instance, - ) - - return MetadataChangeProposalWrapper( - entityUrn=flow_urn, - aspect=models.DataFlowInfoClass( - name=connector_name, - description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.", - customProperties=flow_property_bag, - # externalUrl=connector_url, # NOTE: this will expose connector credential when used - ), - ).as_workunit() - - def construct_job_workunits( - self, connector: ConnectorManifest - ) -> Iterable[MetadataWorkUnit]: - connector_name = connector.name - flow_urn = builder.make_data_flow_urn( - self.platform, - connector_name, - self.config.env, - self.config.platform_instance, - ) - - lineages = connector.lineages - if lineages: - for lineage in lineages: - source_dataset = lineage.source_dataset - source_platform = lineage.source_platform - target_dataset = lineage.target_dataset - target_platform = lineage.target_platform - job_property_bag = lineage.job_property_bag - - source_platform_instance = get_platform_instance( - self.config, connector_name, source_platform - ) - target_platform_instance = get_platform_instance( - self.config, connector_name, target_platform - ) - - job_id = self.get_job_id(lineage, connector, self.config) - job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id) - - inlets = ( - [ - self.make_lineage_dataset_urn( - source_platform, source_dataset, source_platform_instance - ) - ] - if source_dataset - else [] - ) - outlets = [ - self.make_lineage_dataset_urn( - target_platform, target_dataset, target_platform_instance - ) - ] - - yield MetadataChangeProposalWrapper( - entityUrn=job_urn, - aspect=models.DataJobInfoClass( - name=f"{connector_name}:{job_id}", - type="COMMAND", - 
customProperties=job_property_bag, - ), - ).as_workunit() - - yield MetadataChangeProposalWrapper( - entityUrn=job_urn, - aspect=models.DataJobInputOutputClass( - inputDatasets=inlets, - outputDatasets=outlets, - ), - ).as_workunit() - - def get_job_id( - self, - lineage: KafkaConnectLineage, - connector: ConnectorManifest, - config: KafkaConnectSourceConfig, - ) -> str: - connector_class = connector.config.get(CONNECTOR_CLASS) - - # Note - This block is only to maintain backward compatibility of Job URN - if ( - connector_class - and connector.type == SOURCE - and ( - "JdbcSourceConnector" in connector_class - or connector_class.startswith("io.debezium.connector") - ) - and lineage.source_dataset - and config.connect_to_platform_map - and config.connect_to_platform_map.get(connector.name) - and config.connect_to_platform_map[connector.name].get( - lineage.source_platform - ) - ): - return f"{config.connect_to_platform_map[connector.name][lineage.source_platform]}.{lineage.source_dataset}" - - return ( - lineage.source_dataset - if lineage.source_dataset - else f"unknown_source.{lineage.target_dataset}" - ) - - def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: - return [ - *super().get_workunit_processors(), - StaleEntityRemovalHandler.create( - self, self.config, self.ctx - ).workunit_processor, - ] - - def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - connectors_manifest = self.get_connectors_manifest() - for connector in connectors_manifest: - name = connector.name - - yield self.construct_flow_workunit(connector) - yield from self.construct_job_workunits(connector) - self.report.report_connector_scanned(name) - - def get_report(self) -> KafkaConnectSourceReport: - return self.report - - def make_lineage_dataset_urn( - self, platform: str, name: str, platform_instance: Optional[str] - ) -> str: - if self.config.convert_lineage_urns_to_lowercase: - name = name.lower() - - return builder.make_dataset_urn_with_platform_instance( - platform, name, platform_instance, self.config.env - ) - - -# TODO: Find a more automated way to discover new platforms with 3 level naming hierarchy. 
-def has_three_level_hierarchy(platform: str) -> bool: - return platform in ["postgres", "trino", "redshift", "snowflake"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py new file mode 100644 index 00000000000000..36f6a96c0d4080 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py @@ -0,0 +1,202 @@ +import logging +from dataclasses import dataclass, field +from typing import Dict, Iterable, List, Optional + +from pydantic.fields import Field + +from datahub.configuration.common import AllowDenyPattern, ConfigModel +from datahub.configuration.source_common import ( + DatasetLineageProviderConfigBase, + PlatformInstanceConfigMixin, +) +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalSourceReport, + StatefulStaleMetadataRemovalConfig, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionConfigBase, +) + +logger = logging.getLogger(__name__) + +KAFKA = "kafka" +SOURCE = "source" +SINK = "sink" +CONNECTOR_CLASS = "connector.class" + + +class ProvidedConfig(ConfigModel): + provider: str + path_key: str + value: str + + +class GenericConnectorConfig(ConfigModel): + connector_name: str + source_dataset: str + source_platform: str + + +class KafkaConnectSourceConfig( + PlatformInstanceConfigMixin, + DatasetLineageProviderConfigBase, + StatefulIngestionConfigBase, +): + # See the Connect REST Interface for details + # https://docs.confluent.io/platform/current/connect/references/restapi.html# + connect_uri: str = Field( + default="http://localhost:8083/", description="URI to connect to." + ) + username: Optional[str] = Field(default=None, description="Kafka Connect username.") + password: Optional[str] = Field(default=None, description="Kafka Connect password.") + cluster_name: Optional[str] = Field( + default="connect-cluster", description="Cluster to ingest from." + ) + # convert lineage dataset's urns to lowercase + convert_lineage_urns_to_lowercase: bool = Field( + default=False, + description="Whether to convert the urns of ingested lineage dataset to lowercase", + ) + connector_patterns: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for connectors to filter for ingestion.", + ) + provided_configs: Optional[List[ProvidedConfig]] = Field( + default=None, description="Provided Configurations" + ) + connect_to_platform_map: Optional[Dict[str, Dict[str, str]]] = Field( + default=None, + description='Platform instance mapping when multiple instances for a platform is available. Entry for a platform should be in either `platform_instance_map` or `connect_to_platform_map`. e.g.`connect_to_platform_map: { "postgres-connector-finance-db": "postgres": "core_finance_instance" }`', + ) + platform_instance_map: Optional[Dict[str, str]] = Field( + default=None, + description='Platform instance mapping to use when constructing URNs. 
e.g.`platform_instance_map: { "hive": "warehouse" }`', + ) + generic_connectors: List[GenericConnectorConfig] = Field( + default=[], + description="Provide lineage graph for sources connectors other than Confluent JDBC Source Connector, Debezium Source Connector, and Mongo Source Connector", + ) + + stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None + + +@dataclass +class KafkaConnectSourceReport(StaleEntityRemovalSourceReport): + connectors_scanned: int = 0 + filtered: List[str] = field(default_factory=list) + + def report_connector_scanned(self, connector: str) -> None: + self.connectors_scanned += 1 + + def report_dropped(self, connector: str) -> None: + self.filtered.append(connector) + + +@dataclass +class KafkaConnectLineage: + """Class to store Kafka Connect lineage mapping, Each instance is potential DataJob""" + + source_platform: str + target_dataset: str + target_platform: str + job_property_bag: Optional[Dict[str, str]] = None + source_dataset: Optional[str] = None + + +@dataclass +class ConnectorManifest: + """Each instance is potential DataFlow""" + + name: str + type: str + config: Dict + tasks: Dict + url: Optional[str] = None + flow_property_bag: Optional[Dict[str, str]] = None + lineages: List[KafkaConnectLineage] = field(default_factory=list) + topic_names: Iterable[str] = field(default_factory=list) + + +def remove_prefix(text: str, prefix: str) -> str: + if text.startswith(prefix): + index = len(prefix) + return text[index:] + return text + + +def unquote( + string: str, leading_quote: str = '"', trailing_quote: Optional[str] = None +) -> str: + """ + If string starts and ends with a quote, unquote it + """ + trailing_quote = trailing_quote if trailing_quote else leading_quote + if string.startswith(leading_quote) and string.endswith(trailing_quote): + string = string[1:-1] + return string + + +def get_dataset_name( + database_name: Optional[str], + source_table: str, +) -> str: + if database_name: + dataset_name = database_name + "." + source_table + else: + dataset_name = source_table + + return dataset_name + + +def get_platform_instance( + config: KafkaConnectSourceConfig, connector_name: str, platform: str +) -> Optional[str]: + instance_name = None + if ( + config.connect_to_platform_map + and config.connect_to_platform_map.get(connector_name) + and config.connect_to_platform_map[connector_name].get(platform) + ): + instance_name = config.connect_to_platform_map[connector_name][platform] + if config.platform_instance_map and config.platform_instance_map.get(platform): + logger.warning( + f"Same source platform {platform} configured in both platform_instance_map and connect_to_platform_map." + "Will prefer connector specific platform instance from connect_to_platform_map." 
+ ) + elif config.platform_instance_map and config.platform_instance_map.get(platform): + instance_name = config.platform_instance_map[platform] + logger.info( + f"Instance name assigned is: {instance_name} for Connector Name {connector_name} and platform {platform}" + ) + return instance_name + + +def transform_connector_config( + connector_config: Dict, provided_configs: List[ProvidedConfig] +) -> None: + """This method will update provided configs in connector config values, if any""" + lookupsByProvider = {} + for pconfig in provided_configs: + lookupsByProvider[f"${{{pconfig.provider}:{pconfig.path_key}}}"] = pconfig.value + for k, v in connector_config.items(): + for key, value in lookupsByProvider.items(): + if key in v: + connector_config[k] = connector_config[k].replace(key, value) + + +# TODO: Find a more automated way to discover new platforms with 3 level naming hierarchy. +def has_three_level_hierarchy(platform: str) -> bool: + return platform in ["postgres", "trino", "redshift", "snowflake"] + + +@dataclass +class BaseConnector: + connector_manifest: ConnectorManifest + config: KafkaConnectSourceConfig + report: KafkaConnectSourceReport + + def extract_lineages(self) -> List[KafkaConnectLineage]: + return [] + + def extract_flow_property_bag(self) -> Optional[Dict[str, str]]: + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py new file mode 100644 index 00000000000000..fa6b614c4b52a6 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py @@ -0,0 +1,367 @@ +import logging +from typing import Iterable, List, Optional, Type + +import jpype +import jpype.imports +import requests + +import datahub.emitter.mce_builder as builder +import datahub.metadata.schema_classes as models +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.kafka_connect.common import ( + CONNECTOR_CLASS, + SINK, + SOURCE, + BaseConnector, + ConnectorManifest, + KafkaConnectLineage, + KafkaConnectSourceConfig, + KafkaConnectSourceReport, + get_platform_instance, + transform_connector_config, +) +from datahub.ingestion.source.kafka_connect.sink_connectors import ( + BIGQUERY_SINK_CONNECTOR_CLASS, + S3_SINK_CONNECTOR_CLASS, + SNOWFLAKE_SINK_CONNECTOR_CLASS, + BigQuerySinkConnector, + ConfluentS3SinkConnector, + SnowflakeSinkConnector, +) +from datahub.ingestion.source.kafka_connect.source_connectors import ( + DEBEZIUM_SOURCE_CONNECTOR_PREFIX, + JDBC_SOURCE_CONNECTOR_CLASS, + MONGO_SOURCE_CONNECTOR_CLASS, + ConfigDrivenSourceConnector, + ConfluentJDBCSourceConnector, + DebeziumSourceConnector, + MongoSourceConnector, +) +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalHandler, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionSourceBase, +) + +logger = logging.getLogger(__name__) + + +@platform_name("Kafka Connect") +@config_class(KafkaConnectSourceConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") 
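+# Connector-specific lineage extraction is delegated to the BaseConnector implementations
+# imported above from sink_connectors.py and source_connectors.py; this class only queries
+# the Connect REST API, selects an implementation per connector.class, and emits work units.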
+@capability(SourceCapability.SCHEMA_METADATA, "Enabled by default") +@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") +class KafkaConnectSource(StatefulIngestionSourceBase): + config: KafkaConnectSourceConfig + report: KafkaConnectSourceReport + platform: str = "kafka-connect" + + def __init__(self, config: KafkaConnectSourceConfig, ctx: PipelineContext): + super().__init__(config, ctx) + self.config = config + self.report = KafkaConnectSourceReport() + self.session = requests.Session() + self.session.headers.update( + { + "Accept": "application/json", + "Content-Type": "application/json", + } + ) + + # Test the connection + if self.config.username is not None and self.config.password is not None: + logger.info( + f"Connecting to {self.config.connect_uri} with Authentication..." + ) + self.session.auth = (self.config.username, self.config.password) + + test_response = self.session.get(f"{self.config.connect_uri}/connectors") + test_response.raise_for_status() + logger.info(f"Connection to {self.config.connect_uri} is ok") + if not jpype.isJVMStarted(): + jpype.startJVM() + + @classmethod + def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: + config = KafkaConnectSourceConfig.parse_obj(config_dict) + return cls(config, ctx) + + def get_connectors_manifest(self) -> Iterable[ConnectorManifest]: + """Get Kafka Connect connectors manifest using REST API. + Enrich with lineages metadata. + """ + + connector_response = self.session.get( + f"{self.config.connect_uri}/connectors", + ) + + payload = connector_response.json() + + for connector_name in payload: + connector_url = f"{self.config.connect_uri}/connectors/{connector_name}" + connector_manifest = self._get_connector_manifest( + connector_name, connector_url + ) + if ( + connector_manifest is None + or not self.config.connector_patterns.allowed(connector_manifest.name) + ): + self.report.report_dropped(connector_name) + continue + + if self.config.provided_configs: + transform_connector_config( + connector_manifest.config, self.config.provided_configs + ) + connector_manifest.url = connector_url + connector_manifest.topic_names = self._get_connector_topics(connector_name) + connector_class_value = connector_manifest.config.get(CONNECTOR_CLASS) or "" + + class_type: Type[BaseConnector] = BaseConnector + + # Populate Source Connector metadata + if connector_manifest.type == SOURCE: + connector_manifest.tasks = self._get_connector_tasks(connector_name) + + # JDBC source connector lineages + if connector_class_value == JDBC_SOURCE_CONNECTOR_CLASS: + class_type = ConfluentJDBCSourceConnector + elif connector_class_value.startswith(DEBEZIUM_SOURCE_CONNECTOR_PREFIX): + class_type = DebeziumSourceConnector + elif connector_class_value == MONGO_SOURCE_CONNECTOR_CLASS: + class_type = MongoSourceConnector + elif any( + [ + connector.connector_name == connector_manifest.name + for connector in self.config.generic_connectors + ] + ): + class_type = ConfigDrivenSourceConnector + else: + self.report.report_dropped(connector_manifest.name) + self.report.warning( + "Lineage for Source Connector not supported. 
" + "Please refer to Kafka Connect docs to use `generic_connectors` config.", + context=f"{connector_manifest.name} of type {connector_class_value}", + ) + continue + elif connector_manifest.type == SINK: + if connector_class_value == BIGQUERY_SINK_CONNECTOR_CLASS: + class_type = BigQuerySinkConnector + elif connector_class_value == S3_SINK_CONNECTOR_CLASS: + class_type = ConfluentS3SinkConnector + elif connector_class_value == SNOWFLAKE_SINK_CONNECTOR_CLASS: + class_type = SnowflakeSinkConnector + else: + self.report.report_dropped(connector_manifest.name) + self.report.warning( + "Lineage for Sink Connector not supported.", + context=f"{connector_manifest.name} of type {connector_class_value}", + ) + + connector_class = class_type(connector_manifest, self.config, self.report) + connector_manifest.lineages = connector_class.extract_lineages() + connector_manifest.flow_property_bag = ( + connector_class.extract_flow_property_bag() + ) + + yield connector_manifest + + def _get_connector_manifest( + self, connector_name: str, connector_url: str + ) -> Optional[ConnectorManifest]: + try: + connector_response = self.session.get(connector_url) + connector_response.raise_for_status() + except Exception as e: + self.report.warning( + "Failed to get connector details", connector_name, exc=e + ) + return None + manifest = connector_response.json() + connector_manifest = ConnectorManifest(**manifest) + return connector_manifest + + def _get_connector_tasks(self, connector_name: str) -> dict: + try: + response = self.session.get( + f"{self.config.connect_uri}/connectors/{connector_name}/tasks", + ) + response.raise_for_status() + except Exception as e: + self.report.warning( + "Error getting connector tasks", context=connector_name, exc=e + ) + return {} + + return response.json() + + def _get_connector_topics(self, connector_name: str) -> List[str]: + try: + response = self.session.get( + f"{self.config.connect_uri}/connectors/{connector_name}/topics", + ) + response.raise_for_status() + except Exception as e: + self.report.warning( + "Error getting connector topics", context=connector_name, exc=e + ) + return [] + + return response.json()[connector_name]["topics"] + + def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit: + connector_name = connector.name + connector_type = connector.type + connector_class = connector.config.get(CONNECTOR_CLASS) + flow_property_bag = connector.flow_property_bag + # connector_url = connector.url # NOTE: this will expose connector credential when used + flow_urn = builder.make_data_flow_urn( + self.platform, + connector_name, + self.config.env, + self.config.platform_instance, + ) + + return MetadataChangeProposalWrapper( + entityUrn=flow_urn, + aspect=models.DataFlowInfoClass( + name=connector_name, + description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.", + customProperties=flow_property_bag, + # externalUrl=connector_url, # NOTE: this will expose connector credential when used + ), + ).as_workunit() + + def construct_job_workunits( + self, connector: ConnectorManifest + ) -> Iterable[MetadataWorkUnit]: + connector_name = connector.name + flow_urn = builder.make_data_flow_urn( + self.platform, + connector_name, + self.config.env, + self.config.platform_instance, + ) + + lineages = connector.lineages + if lineages: + for lineage in lineages: + source_dataset = lineage.source_dataset + source_platform = lineage.source_platform + target_dataset = lineage.target_dataset + target_platform = 
lineage.target_platform + job_property_bag = lineage.job_property_bag + + source_platform_instance = get_platform_instance( + self.config, connector_name, source_platform + ) + target_platform_instance = get_platform_instance( + self.config, connector_name, target_platform + ) + + job_id = self.get_job_id(lineage, connector, self.config) + job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id) + + inlets = ( + [ + self.make_lineage_dataset_urn( + source_platform, source_dataset, source_platform_instance + ) + ] + if source_dataset + else [] + ) + outlets = [ + self.make_lineage_dataset_urn( + target_platform, target_dataset, target_platform_instance + ) + ] + + yield MetadataChangeProposalWrapper( + entityUrn=job_urn, + aspect=models.DataJobInfoClass( + name=f"{connector_name}:{job_id}", + type="COMMAND", + customProperties=job_property_bag, + ), + ).as_workunit() + + yield MetadataChangeProposalWrapper( + entityUrn=job_urn, + aspect=models.DataJobInputOutputClass( + inputDatasets=inlets, + outputDatasets=outlets, + ), + ).as_workunit() + + def get_job_id( + self, + lineage: KafkaConnectLineage, + connector: ConnectorManifest, + config: KafkaConnectSourceConfig, + ) -> str: + connector_class = connector.config.get(CONNECTOR_CLASS) + + # Note - This block is only to maintain backward compatibility of Job URN + if ( + connector_class + and connector.type == SOURCE + and ( + "JdbcSourceConnector" in connector_class + or connector_class.startswith("io.debezium.connector") + ) + and lineage.source_dataset + and config.connect_to_platform_map + and config.connect_to_platform_map.get(connector.name) + and config.connect_to_platform_map[connector.name].get( + lineage.source_platform + ) + ): + return f"{config.connect_to_platform_map[connector.name][lineage.source_platform]}.{lineage.source_dataset}" + + return ( + lineage.source_dataset + if lineage.source_dataset + else f"unknown_source.{lineage.target_dataset}" + ) + + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + for connector in self.get_connectors_manifest(): + yield self.construct_flow_workunit(connector) + yield from self.construct_job_workunits(connector) + self.report.report_connector_scanned(connector.name) + + def get_report(self) -> KafkaConnectSourceReport: + return self.report + + def make_lineage_dataset_urn( + self, platform: str, name: str, platform_instance: Optional[str] + ) -> str: + if self.config.convert_lineage_urns_to_lowercase: + name = name.lower() + + return builder.make_dataset_urn_with_platform_instance( + platform, name, platform_instance, self.config.env + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py new file mode 100644 index 00000000000000..2790460c8e6019 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py @@ -0,0 +1,341 @@ +import re +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional, Tuple + +from datahub.ingestion.source.kafka_connect.common import ( + KAFKA, + BaseConnector, + ConnectorManifest, + KafkaConnectLineage, +) + + +@dataclass +class ConfluentS3SinkConnector(BaseConnector): + @dataclass + class S3SinkParser: + target_platform: str + 
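+        # Bucket and prefix are read from the connector config in _get_parser below
+        # ("s3.bucket.name" and "topics.dir", the latter defaulting to "topics").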
bucket: str + topics_dir: str + topics: Iterable[str] + + def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser: + # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3 + bucket = connector_manifest.config.get("s3.bucket.name") + if not bucket: + raise ValueError( + "Could not find 's3.bucket.name' in connector configuration" + ) + + # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage + topics_dir = connector_manifest.config.get("topics.dir", "topics") + + return self.S3SinkParser( + target_platform="s3", + bucket=bucket, + topics_dir=topics_dir, + topics=connector_manifest.topic_names, + ) + + def extract_flow_property_bag(self) -> Dict[str, str]: + # Mask/Remove properties that may reveal credentials + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k + not in [ + "aws.access.key.id", + "aws.secret.access.key", + "s3.sse.customer.key", + "s3.proxy.password", + ] + } + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + try: + parser = self._get_parser(self.connector_manifest) + + lineages: List[KafkaConnectLineage] = list() + for topic in parser.topics: + target_dataset = f"{parser.bucket}/{parser.topics_dir}/{topic}" + + lineages.append( + KafkaConnectLineage( + source_dataset=topic, + source_platform="kafka", + target_dataset=target_dataset, + target_platform=parser.target_platform, + ) + ) + return lineages + except Exception as e: + self.report.warning( + "Error resolving lineage for connector", + self.connector_manifest.name, + exc=e, + ) + + return [] + + +@dataclass +class SnowflakeSinkConnector(BaseConnector): + @dataclass + class SnowflakeParser: + database_name: str + schema_name: str + topics_to_tables: Dict[str, str] + + def get_table_name_from_topic_name(self, topic_name: str) -> str: + """ + This function converts the topic name to a valid Snowflake table name using some rules. + Refer below link for more info + https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics + """ + table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name) + if re.match("^[^a-zA-Z_].*", table_name): + table_name = "_" + table_name + # Connector may append original topic's hash code as suffix for conflict resolution + # if generated table names for 2 topics are similar. This corner case is not handled here. + # Note that Snowflake recommends to choose topic names that follow the rules for + # Snowflake identifier names so this case is not recommended by snowflake. 
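+        # For example, topic "web-events.v1" becomes table "web_events_v1",
+        # and "2024_orders" (leading digit) becomes "_2024_orders".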
+ return table_name + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> SnowflakeParser: + database_name = connector_manifest.config["snowflake.database.name"] + schema_name = connector_manifest.config["snowflake.schema.name"] + + # Fetch user provided topic to table map + provided_topics_to_tables: Dict[str, str] = {} + if connector_manifest.config.get("snowflake.topic2table.map"): + for each in connector_manifest.config["snowflake.topic2table.map"].split( + "," + ): + topic, table = each.split(":") + provided_topics_to_tables[topic.strip()] = table.strip() + + topics_to_tables: Dict[str, str] = {} + # Extract lineage for only those topics whose data ingestion started + for topic in connector_manifest.topic_names: + if topic in provided_topics_to_tables: + # If user provided which table to get mapped with this topic + topics_to_tables[topic] = provided_topics_to_tables[topic] + else: + # Else connector converts topic name to a valid Snowflake table name. + topics_to_tables[topic] = self.get_table_name_from_topic_name(topic) + + return self.SnowflakeParser( + database_name=database_name, + schema_name=schema_name, + topics_to_tables=topics_to_tables, + ) + + def extract_flow_property_bag(self) -> Dict[str, str]: + # For all snowflake sink connector properties, refer below link + # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector + # remove private keys, secrets from properties + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k + not in [ + "snowflake.private.key", + "snowflake.private.key.passphrase", + "value.converter.basic.auth.user.info", + ] + } + + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + + for topic, table in parser.topics_to_tables.items(): + target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}" + lineages.append( + KafkaConnectLineage( + source_dataset=topic, + source_platform=KAFKA, + target_dataset=target_dataset, + target_platform="snowflake", + ) + ) + + return lineages + + +@dataclass +class BigQuerySinkConnector(BaseConnector): + @dataclass + class BQParser: + project: str + target_platform: str + sanitizeTopics: str + transforms: list + topicsToTables: Optional[str] = None + datasets: Optional[str] = None + defaultDataset: Optional[str] = None + version: str = "v1" + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> BQParser: + project = connector_manifest.config["project"] + sanitizeTopics = connector_manifest.config.get("sanitizeTopics", "false") + transform_names = ( + self.connector_manifest.config.get("transforms", "").split(",") + if self.connector_manifest.config.get("transforms") + else [] + ) + transforms = [] + for name in transform_names: + transform = {"name": name} + transforms.append(transform) + for key in self.connector_manifest.config.keys(): + if key.startswith(f"transforms.{name}."): + transform[ + key.replace(f"transforms.{name}.", "") + ] = self.connector_manifest.config[key] + + if "defaultDataset" in connector_manifest.config: + defaultDataset = connector_manifest.config["defaultDataset"] + return self.BQParser( + project=project, + defaultDataset=defaultDataset, + target_platform="bigquery", + sanitizeTopics=sanitizeTopics.lower() == "true", + version="v2", + transforms=transforms, + ) + else: + # version 1.6.x and similar configs supported + datasets = 
connector_manifest.config["datasets"] + topicsToTables = connector_manifest.config.get("topicsToTables") + + return self.BQParser( + project=project, + topicsToTables=topicsToTables, + datasets=datasets, + target_platform="bigquery", + sanitizeTopics=sanitizeTopics.lower() == "true", + transforms=transforms, + ) + + def get_list(self, property: str) -> Iterable[Tuple[str, str]]: + entries = property.split(",") + for entry in entries: + key, val = entry.rsplit("=") + yield (key.strip(), val.strip()) + + def get_dataset_for_topic_v1(self, topic: str, parser: BQParser) -> Optional[str]: + topicregex_dataset_map: Dict[str, str] = dict(self.get_list(parser.datasets)) # type: ignore + from java.util.regex import Pattern + + for pattern, dataset in topicregex_dataset_map.items(): + patternMatcher = Pattern.compile(pattern).matcher(topic) + if patternMatcher.matches(): + return dataset + return None + + def sanitize_table_name(self, table_name): + table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name) + if re.match("^[^a-zA-Z_].*", table_name): + table_name = "_" + table_name + + return table_name + + def get_dataset_table_for_topic( + self, topic: str, parser: BQParser + ) -> Optional[str]: + if parser.version == "v2": + dataset = parser.defaultDataset + parts = topic.split(":") + if len(parts) == 2: + dataset = parts[0] + table = parts[1] + else: + table = parts[0] + else: + dataset = self.get_dataset_for_topic_v1(topic, parser) + if dataset is None: + return None + + table = topic + if parser.topicsToTables: + topicregex_table_map: Dict[str, str] = dict( + self.get_list(parser.topicsToTables) # type: ignore + ) + from java.util.regex import Pattern + + for pattern, tbl in topicregex_table_map.items(): + patternMatcher = Pattern.compile(pattern).matcher(topic) + if patternMatcher.matches(): + table = tbl + break + + if parser.sanitizeTopics: + table = self.sanitize_table_name(table) + return f"{dataset}.{table}" + + def apply_transformations( + self, topic: str, transforms: List[Dict[str, str]] + ) -> str: + for transform in transforms: + if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter": + regex = transform["regex"] + replacement = transform["replacement"] + pattern = re.compile(regex) + if pattern.match(topic): + topic = pattern.sub(replacement, topic, count=1) + return topic + + def extract_flow_property_bag(self) -> Dict[str, str]: + # Mask/Remove properties that may reveal credentials + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k not in ["keyfile"] + } + + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + if not parser: + return lineages + target_platform = parser.target_platform + project = parser.project + transforms = parser.transforms + + for topic in self.connector_manifest.topic_names: + transformed_topic = self.apply_transformations(topic, transforms) + dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser) + if dataset_table is None: + self.report.warning( + "Could not find target dataset for topic, please check your connector configuration" + f"{self.connector_manifest.name} : {transformed_topic} ", + ) + continue + target_dataset = f"{project}.{dataset_table}" + + lineages.append( + KafkaConnectLineage( + source_dataset=transformed_topic, + source_platform=KAFKA, + target_dataset=target_dataset, + target_platform=target_platform, + ) + ) + return lineages + + 
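For reference, a minimal self-contained sketch (not part of this patch) of how the topic-to-table resolution above behaves for a v2 ("defaultDataset") BigQuery sink configuration: a RegexRouter transform may first rewrite the topic, and a "dataset:table" topic name overrides the default dataset. The transform regex, replacement, and topic names below are invented purely for illustration.

import re
from typing import Dict, List

REGEX_ROUTER = "org.apache.kafka.connect.transforms.RegexRouter"

def route_topic(topic: str, transforms: List[Dict[str, str]]) -> str:
    # Mirrors apply_transformations: only a RegexRouter transform rewrites the topic name.
    for transform in transforms:
        if transform["type"] == REGEX_ROUTER:
            pattern = re.compile(transform["regex"])
            if pattern.match(topic):
                topic = pattern.sub(transform["replacement"], topic, count=1)
    return topic

def dataset_table_v2(topic: str, default_dataset: str, sanitize: bool) -> str:
    # v2 connectors accept "dataset:table" topic names; otherwise defaultDataset is used.
    parts = topic.split(":")
    dataset, table = (parts[0], parts[1]) if len(parts) == 2 else (default_dataset, parts[0])
    if sanitize:
        # Same sanitization rule as sanitize_table_name above.
        table = re.sub("[^a-zA-Z0-9_]", "_", table)
        if re.match("^[^a-zA-Z_].*", table):
            table = "_" + table
    return f"{dataset}.{table}"

if __name__ == "__main__":
    transforms = [
        {
            "type": REGEX_ROUTER,
            "regex": "orders[.].*",               # hypothetical routing rule
            "replacement": "analytics:orders_daily",
        }
    ]
    routed = route_topic("orders.daily-totals", transforms)
    print(dataset_table_v2(routed, default_dataset="raw", sanitize=True))
    # prints "analytics.orders_daily"
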
+BIGQUERY_SINK_CONNECTOR_CLASS = "com.wepay.kafka.connect.bigquery.BigQuerySinkConnector" +S3_SINK_CONNECTOR_CLASS = "io.confluent.connect.s3.S3SinkConnector" +SNOWFLAKE_SINK_CONNECTOR_CLASS = "com.snowflake.kafka.connector.SnowflakeSinkConnector" diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py new file mode 100644 index 00000000000000..7b3b6e551a0a1f --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py @@ -0,0 +1,570 @@ +import logging +import re +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional, Tuple + +from sqlalchemy.engine.url import make_url + +from datahub.ingestion.source.kafka_connect.common import ( + CONNECTOR_CLASS, + KAFKA, + BaseConnector, + ConnectorManifest, + KafkaConnectLineage, + get_dataset_name, + has_three_level_hierarchy, + remove_prefix, + unquote, +) +from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( + get_platform_from_sqlalchemy_uri, +) + + +@dataclass +class ConfluentJDBCSourceConnector(BaseConnector): + REGEXROUTER = "org.apache.kafka.connect.transforms.RegexRouter" + KNOWN_TOPICROUTING_TRANSFORMS = [REGEXROUTER] + # https://kafka.apache.org/documentation/#connect_included_transformation + KAFKA_NONTOPICROUTING_TRANSFORMS = [ + "InsertField", + "InsertField$Key", + "InsertField$Value", + "ReplaceField", + "ReplaceField$Key", + "ReplaceField$Value", + "MaskField", + "MaskField$Key", + "MaskField$Value", + "ValueToKey", + "ValueToKey$Key", + "ValueToKey$Value", + "HoistField", + "HoistField$Key", + "HoistField$Value", + "ExtractField", + "ExtractField$Key", + "ExtractField$Value", + "SetSchemaMetadata", + "SetSchemaMetadata$Key", + "SetSchemaMetadata$Value", + "Flatten", + "Flatten$Key", + "Flatten$Value", + "Cast", + "Cast$Key", + "Cast$Value", + "HeadersFrom", + "HeadersFrom$Key", + "HeadersFrom$Value", + "TimestampConverter", + "Filter", + "InsertHeader", + "DropHeaders", + ] + # https://docs.confluent.io/platform/current/connect/transforms/overview.html + CONFLUENT_NONTOPICROUTING_TRANSFORMS = [ + "Drop", + "Drop$Key", + "Drop$Value", + "Filter", + "Filter$Key", + "Filter$Value", + "TombstoneHandler", + ] + KNOWN_NONTOPICROUTING_TRANSFORMS = ( + KAFKA_NONTOPICROUTING_TRANSFORMS + + [ + f"org.apache.kafka.connect.transforms.{t}" + for t in KAFKA_NONTOPICROUTING_TRANSFORMS + ] + + CONFLUENT_NONTOPICROUTING_TRANSFORMS + + [ + f"io.confluent.connect.transforms.{t}" + for t in CONFLUENT_NONTOPICROUTING_TRANSFORMS + ] + ) + + @dataclass + class JdbcParser: + db_connection_url: str + source_platform: str + database_name: str + topic_prefix: str + query: str + transforms: list + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> JdbcParser: + url = remove_prefix( + str(connector_manifest.config.get("connection.url")), "jdbc:" + ) + url_instance = make_url(url) + source_platform = get_platform_from_sqlalchemy_uri(str(url_instance)) + database_name = url_instance.database + assert database_name + db_connection_url = f"{url_instance.drivername}://{url_instance.host}:{url_instance.port}/{database_name}" + + topic_prefix = self.connector_manifest.config.get("topic.prefix", None) + + query = self.connector_manifest.config.get("query", None) + + transform_names = ( + self.connector_manifest.config.get("transforms", "").split(",") + if self.connector_manifest.config.get("transforms") + else [] + ) + + transforms = [] + 
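+        # Collect each transform's own settings ("transforms.<name>.*") into a dict
+        # keyed by the stripped property name, alongside its "name".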
for name in transform_names: + transform = {"name": name} + transforms.append(transform) + for key in self.connector_manifest.config.keys(): + if key.startswith(f"transforms.{name}."): + transform[ + key.replace(f"transforms.{name}.", "") + ] = self.connector_manifest.config[key] + + return self.JdbcParser( + db_connection_url, + source_platform, + database_name, + topic_prefix, + query, + transforms, + ) + + def default_get_lineages( + self, + topic_prefix: str, + database_name: str, + source_platform: str, + topic_names: Optional[Iterable[str]] = None, + include_source_dataset: bool = True, + ) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = [] + if not topic_names: + topic_names = self.connector_manifest.topic_names + table_name_tuples: List[Tuple] = self.get_table_names() + for topic in topic_names: + # All good for NO_TRANSFORM or (SINGLE_TRANSFORM and KNOWN_NONTOPICROUTING_TRANSFORM) or (not SINGLE_TRANSFORM and all(KNOWN_NONTOPICROUTING_TRANSFORM)) + source_table: str = ( + remove_prefix(topic, topic_prefix) if topic_prefix else topic + ) + # include schema name for three-level hierarchies + if has_three_level_hierarchy(source_platform): + table_name_tuple: Tuple = next( + iter([t for t in table_name_tuples if t and t[-1] == source_table]), + (), + ) + if len(table_name_tuple) > 1: + source_table = f"{table_name_tuple[-2]}.{source_table}" + else: + include_source_dataset = False + self.report.warning( + "Could not find schema for table" + f"{self.connector_manifest.name} : {source_table}", + ) + dataset_name: str = get_dataset_name(database_name, source_table) + lineage = KafkaConnectLineage( + source_dataset=dataset_name if include_source_dataset else None, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + + def get_table_names(self) -> List[Tuple]: + sep: str = "." 
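+        # Table ids come from the tasks' "tables" property (or "table.whitelist" as a fallback);
+        # quote characters are inferred from them when quote.sql.identifiers is "always".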
+ leading_quote_char: str = '"' + trailing_quote_char: str = leading_quote_char + + table_ids: List[str] = [] + if self.connector_manifest.tasks: + table_ids = ( + ",".join( + [ + task["config"].get("tables") + for task in self.connector_manifest.tasks + ] + ) + ).split(",") + quote_method = self.connector_manifest.config.get( + "quote.sql.identifiers", "always" + ) + if ( + quote_method == "always" + and table_ids + and table_ids[0] + and table_ids[-1] + ): + leading_quote_char = table_ids[0][0] + trailing_quote_char = table_ids[-1][-1] + # This will only work for single character quotes + elif self.connector_manifest.config.get("table.whitelist"): + table_ids = self.connector_manifest.config.get("table.whitelist").split(",") # type: ignore + + # List of Tuple containing (schema, table) + tables: List[Tuple] = [ + ( + ( + unquote( + table_id.split(sep)[-2], leading_quote_char, trailing_quote_char + ) + if len(table_id.split(sep)) > 1 + else "" + ), + unquote( + table_id.split(sep)[-1], leading_quote_char, trailing_quote_char + ), + ) + for table_id in table_ids + ] + return tables + + def extract_flow_property_bag(self) -> Dict[str, str]: + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k not in ["connection.password", "connection.user"] + } + + # Mask/Remove properties that may reveal credentials + flow_property_bag["connection.url"] = self.get_parser( + self.connector_manifest + ).db_connection_url + + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + source_platform = parser.source_platform + database_name = parser.database_name + query = parser.query + topic_prefix = parser.topic_prefix + transforms = parser.transforms + + logging.debug( + f"Extracting source platform: {source_platform} and database name: {database_name} from connection url " + ) + + if not self.connector_manifest.topic_names: + return lineages + + if query: + # Lineage source_table can be extracted by parsing query + for topic in self.connector_manifest.topic_names: + # default method - as per earlier implementation + dataset_name: str = get_dataset_name(database_name, topic) + + lineage = KafkaConnectLineage( + source_dataset=None, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + self.report.warning( + "Could not find input dataset, the connector has query configuration set", + self.connector_manifest.name, + ) + return lineages + + SINGLE_TRANSFORM = len(transforms) == 1 + NO_TRANSFORM = len(transforms) == 0 + UNKNOWN_TRANSFORM = any( + [ + transform["type"] + not in self.KNOWN_TOPICROUTING_TRANSFORMS + + self.KNOWN_NONTOPICROUTING_TRANSFORMS + for transform in transforms + ] + ) + ALL_TRANSFORMS_NON_TOPICROUTING = all( + [ + transform["type"] in self.KNOWN_NONTOPICROUTING_TRANSFORMS + for transform in transforms + ] + ) + + if NO_TRANSFORM or ALL_TRANSFORMS_NON_TOPICROUTING: + return self.default_get_lineages( + database_name=database_name, + source_platform=source_platform, + topic_prefix=topic_prefix, + ) + + if SINGLE_TRANSFORM and transforms[0]["type"] == self.REGEXROUTER: + tables = self.get_table_names() + topic_names = list(self.connector_manifest.topic_names) + + from java.util.regex import Pattern + + for table in tables: + source_table: str = table[-1] + topic = topic_prefix + source_table if topic_prefix else source_table + + transform_regex = 
Pattern.compile(transforms[0]["regex"]) + transform_replacement = transforms[0]["replacement"] + + matcher = transform_regex.matcher(topic) + if matcher.matches(): + topic = str(matcher.replaceFirst(transform_replacement)) + + # Additional check to confirm that the topic present + # in connector topics + + if topic in self.connector_manifest.topic_names: + # include schema name for three-level hierarchies + if has_three_level_hierarchy(source_platform) and len(table) > 1: + source_table = f"{table[-2]}.{table[-1]}" + + dataset_name = get_dataset_name(database_name, source_table) + + lineage = KafkaConnectLineage( + source_dataset=dataset_name, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + topic_names.remove(topic) + lineages.append(lineage) + + if topic_names: + lineages.extend( + self.default_get_lineages( + database_name=database_name, + source_platform=source_platform, + topic_prefix=topic_prefix, + topic_names=topic_names, + include_source_dataset=False, + ) + ) + self.report.warning( + "Could not find input dataset for connector topics", + f"{self.connector_manifest.name} : {topic_names}", + ) + return lineages + else: + include_source_dataset = True + if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: + self.report.warning( + "Could not find input dataset, connector has unknown transform", + f"{self.connector_manifest.name} : {transforms[0]['type']}", + ) + include_source_dataset = False + if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: + self.report.warning( + "Could not find input dataset, connector has one or more unknown transforms", + self.connector_manifest.name, + ) + include_source_dataset = False + lineages = self.default_get_lineages( + database_name=database_name, + source_platform=source_platform, + topic_prefix=topic_prefix, + include_source_dataset=include_source_dataset, + ) + return lineages + + +@dataclass +class MongoSourceConnector(BaseConnector): + # https://www.mongodb.com/docs/kafka-connector/current/source-connector/ + + @dataclass + class MongoSourceParser: + db_connection_url: Optional[str] + source_platform: str + database_name: Optional[str] + topic_prefix: Optional[str] + transforms: List[str] + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> MongoSourceParser: + parser = self.MongoSourceParser( + db_connection_url=connector_manifest.config.get("connection.uri"), + source_platform="mongodb", + database_name=connector_manifest.config.get("database"), + topic_prefix=connector_manifest.config.get("topic_prefix"), + transforms=( + connector_manifest.config["transforms"].split(",") + if "transforms" in connector_manifest.config + else [] + ), + ) + + return parser + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + source_platform = parser.source_platform + topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)" + + if not self.connector_manifest.topic_names: + return lineages + + for topic in self.connector_manifest.topic_names: + found = re.search(re.compile(topic_naming_pattern), topic) + + if found: + table_name = get_dataset_name(found.group(1), found.group(2)) + + lineage = KafkaConnectLineage( + source_dataset=table_name, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + + +@dataclass +class DebeziumSourceConnector(BaseConnector): + @dataclass + class DebeziumParser: + source_platform: str + server_name: 
Optional[str] + database_name: Optional[str] + + def get_server_name(self, connector_manifest: ConnectorManifest) -> str: + if "topic.prefix" in connector_manifest.config: + return connector_manifest.config["topic.prefix"] + else: + return connector_manifest.config.get("database.server.name", "") + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> DebeziumParser: + connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "") + + if connector_class == "io.debezium.connector.mysql.MySqlConnector": + parser = self.DebeziumParser( + source_platform="mysql", + server_name=self.get_server_name(connector_manifest), + database_name=None, + ) + elif connector_class == "MySqlConnector": + parser = self.DebeziumParser( + source_platform="mysql", + server_name=self.get_server_name(connector_manifest), + database_name=None, + ) + elif connector_class == "io.debezium.connector.mongodb.MongoDbConnector": + parser = self.DebeziumParser( + source_platform="mongodb", + server_name=self.get_server_name(connector_manifest), + database_name=None, + ) + elif connector_class == "io.debezium.connector.postgresql.PostgresConnector": + parser = self.DebeziumParser( + source_platform="postgres", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("database.dbname"), + ) + elif connector_class == "io.debezium.connector.oracle.OracleConnector": + parser = self.DebeziumParser( + source_platform="oracle", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("database.dbname"), + ) + elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector": + database_name = connector_manifest.config.get( + "database.names" + ) or connector_manifest.config.get("database.dbname") + + if "," in str(database_name): + raise Exception( + f"Only one database is supported for Debezium's SQL Server connector. 
Found: {database_name}" + ) + + parser = self.DebeziumParser( + source_platform="mssql", + server_name=self.get_server_name(connector_manifest), + database_name=database_name, + ) + elif connector_class == "io.debezium.connector.db2.Db2Connector": + parser = self.DebeziumParser( + source_platform="db2", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("database.dbname"), + ) + elif connector_class == "io.debezium.connector.vitess.VitessConnector": + parser = self.DebeziumParser( + source_platform="vitess", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("vitess.keyspace"), + ) + else: + raise ValueError(f"Connector class '{connector_class}' is unknown.") + + return parser + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + + try: + parser = self.get_parser(self.connector_manifest) + source_platform = parser.source_platform + server_name = parser.server_name + database_name = parser.database_name + topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)" + + if not self.connector_manifest.topic_names: + return lineages + + for topic in self.connector_manifest.topic_names: + found = re.search(re.compile(topic_naming_pattern), topic) + + if found: + table_name = get_dataset_name(database_name, found.group(2)) + + lineage = KafkaConnectLineage( + source_dataset=table_name, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + except Exception as e: + self.report.warning( + "Error resolving lineage for connector", + self.connector_manifest.name, + exc=e, + ) + + return [] + + +@dataclass +class ConfigDrivenSourceConnector(BaseConnector): + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages = [] + for connector in self.config.generic_connectors: + if connector.connector_name == self.connector_manifest.name: + target_connector = connector + break + for topic in self.connector_manifest.topic_names: + lineage = KafkaConnectLineage( + source_dataset=target_connector.source_dataset, + source_platform=target_connector.source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + + +JDBC_SOURCE_CONNECTOR_CLASS = "io.confluent.connect.jdbc.JdbcSourceConnector" +DEBEZIUM_SOURCE_CONNECTOR_PREFIX = "io.debezium.connector" +MONGO_SOURCE_CONNECTOR_CLASS = "com.mongodb.kafka.connect.MongoSourceConnector" From 2e544614f12bf2ad8e758b2fd742ee14c6998825 Mon Sep 17 00:00:00 2001 From: sagar-salvi-apptware <159135491+sagar-salvi-apptware@users.noreply.github.com> Date: Thu, 19 Dec 2024 12:41:40 +0530 Subject: [PATCH 07/35] feat(ingest): add looker meta extractor support in sql parsing (#12062) Co-authored-by: Mayuri N Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> --- .../datahub/configuration/source_common.py | 13 ++ .../ingestion/source/looker/looker_common.py | 56 +++++- .../source/looker/looker_lib_wrapper.py | 14 +- .../ingestion/source/looker/looker_source.py | 13 +- .../ingestion/source/powerbi/config.py | 15 +- .../powerbi/dataplatform_instance_resolver.py | 2 +- .../source/powerbi/m_query/pattern_handler.py | 2 +- .../source/snowflake/snowflake_v2.py | 1 + .../sql_parsing/sql_parsing_aggregator.py | 2 +- .../sql_parsing/tool_meta_extractor.py | 121 ++++++++++++- .../looker/golden_looker_mces.json | 56 ++++++ .../looker/golden_test_allow_ingest.json | 53 ++++++ 
...olden_test_external_project_view_mces.json | 53 ++++++ .../looker/golden_test_file_path_ingest.json | 53 ++++++ ...olden_test_folder_path_pattern_ingest.json | 53 ++++++ .../golden_test_independent_look_ingest.json | 170 +++++++++++++----- .../looker/golden_test_ingest.json | 54 ++++++ .../looker/golden_test_ingest_joins.json | 53 ++++++ .../golden_test_ingest_unaliased_joins.json | 53 ++++++ ...en_test_non_personal_independent_look.json | 71 ++++++++ .../looker_mces_golden_deleted_stateful.json | 68 ++++++- .../looker/looker_mces_usage_history.json | 53 ++++++ .../tests/integration/looker/test_looker.py | 20 +++ .../sql_parsing/test_tool_meta_extractor.py | 44 ++++- .../state/test_redundant_run_skip_handler.py | 6 +- .../platformresource/PlatformResourceType.pdl | 6 +- 26 files changed, 1026 insertions(+), 79 deletions(-) diff --git a/metadata-ingestion/src/datahub/configuration/source_common.py b/metadata-ingestion/src/datahub/configuration/source_common.py index 44c737f1bd13d4..8e41e9fb917878 100644 --- a/metadata-ingestion/src/datahub/configuration/source_common.py +++ b/metadata-ingestion/src/datahub/configuration/source_common.py @@ -63,3 +63,16 @@ class DatasetLineageProviderConfigBase(EnvConfigMixin): default=None, description="A holder for platform -> platform_instance mappings to generate correct dataset urns", ) + + +class PlatformDetail(ConfigModel): + platform_instance: Optional[str] = Field( + default=None, + description="DataHub platform instance name. To generate correct urn for upstream dataset, this should match " + "with platform instance name used in ingestion " + "recipe of other datahub sources.", + ) + env: str = Field( + default=DEFAULT_ENV, + description="The environment that all assets produced by DataHub platform ingestion source belong to", + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 57a251ef2ed14f..a66962f962255f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -31,6 +31,10 @@ from pydantic.class_validators import validator import datahub.emitter.mce_builder as builder +from datahub.api.entities.platformresource.platform_resource import ( + PlatformResource, + PlatformResourceKey, +) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import ContainerKey, create_embed_mcp from datahub.ingestion.api.report import Report @@ -106,7 +110,7 @@ from datahub.utilities.url_util import remove_port_from_url CORPUSER_DATAHUB = "urn:li:corpuser:datahub" - +LOOKER = "looker" logger = logging.getLogger(__name__) @@ -1411,6 +1415,7 @@ class LookerDashboardSourceReport(StaleEntityRemovalSourceReport): resolved_user_ids: int = 0 email_ids_missing: int = 0 # resolved users with missing email addresses + looker_user_count: int = 0 _looker_api: Optional[LookerAPI] = None query_latency: Dict[str, datetime.timedelta] = dataclasses_field( @@ -1614,9 +1619,21 @@ def get_urn_dashboard_id(self): class LookerUserRegistry: looker_api_wrapper: LookerAPI fields: str = ",".join(["id", "email", "display_name", "first_name", "last_name"]) + _user_cache: Dict[str, LookerUser] = {} - def __init__(self, looker_api: LookerAPI): + def __init__(self, looker_api: LookerAPI, report: LookerDashboardSourceReport): self.looker_api_wrapper = looker_api + self.report = report + self._initialize_user_cache() + + def 
_initialize_user_cache(self) -> None: + raw_users: Sequence[User] = self.looker_api_wrapper.all_users( + user_fields=self.fields + ) + + for raw_user in raw_users: + looker_user = LookerUser.create_looker_user(raw_user) + self._user_cache[str(looker_user.id)] = looker_user def get_by_id(self, id_: str) -> Optional[LookerUser]: if not id_: @@ -1624,6 +1641,9 @@ def get_by_id(self, id_: str) -> Optional[LookerUser]: logger.debug(f"Will get user {id_}") + if str(id_) in self._user_cache: + return self._user_cache.get(str(id_)) + raw_user: Optional[User] = self.looker_api_wrapper.get_user( str(id_), user_fields=self.fields ) @@ -1632,3 +1652,35 @@ def get_by_id(self, id_: str) -> Optional[LookerUser]: looker_user = LookerUser.create_looker_user(raw_user) return looker_user + + def to_platform_resource( + self, platform_instance: Optional[str] + ) -> Iterable[MetadataChangeProposalWrapper]: + try: + platform_resource_key = PlatformResourceKey( + platform=LOOKER, + resource_type="USER_ID_MAPPING", + platform_instance=platform_instance, + primary_key="", + ) + + # Extract user email mappings + user_email_cache = { + user_id: user.email + for user_id, user in self._user_cache.items() + if user.email + } + + platform_resource = PlatformResource.create( + key=platform_resource_key, + value=user_email_cache, + ) + + self.report.looker_user_count = len(user_email_cache) + yield from platform_resource.to_mcps() + + except Exception as exc: + self.report.warning( + message="Failed to generate platform resource for looker id mappings", + exc=exc, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py index ab55d4e15e5de4..c3f2a110136c45 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py @@ -68,6 +68,7 @@ class LookerAPIStats(BaseModel): get_look_calls: int = 0 search_looks_calls: int = 0 search_dashboards_calls: int = 0 + all_user_calls: int = 0 class LookerAPI: @@ -135,7 +136,7 @@ def get_available_permissions(self) -> Set[str]: return permissions - @lru_cache(maxsize=1000) + @lru_cache(maxsize=5000) def get_user(self, id_: str, user_fields: str) -> Optional[User]: self.client_stats.user_calls += 1 try: @@ -154,6 +155,17 @@ def get_user(self, id_: str, user_fields: str) -> Optional[User]: # User not found return None + def all_users(self, user_fields: str) -> Sequence[User]: + self.client_stats.all_user_calls += 1 + try: + return self.client.all_users( + fields=cast(str, user_fields), + transport_options=self.transport_options, + ) + except SDKError as e: + logger.warning(f"Failure was {e}") + return [] + def execute_query(self, write_query: WriteQuery) -> List[Dict]: logger.debug(f"Executing query {write_query}") self.client_stats.query_calls += 1 diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index cd8ccb8217257c..815c5dfb1c0147 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -145,7 +145,9 @@ def __init__(self, config: LookerDashboardSourceConfig, ctx: PipelineContext): self.source_config: LookerDashboardSourceConfig = config self.reporter: LookerDashboardSourceReport = LookerDashboardSourceReport() self.looker_api: LookerAPI = 
LookerAPI(self.source_config) - self.user_registry: LookerUserRegistry = LookerUserRegistry(self.looker_api) + self.user_registry: LookerUserRegistry = LookerUserRegistry( + self.looker_api, self.reporter + ) self.explore_registry: LookerExploreRegistry = LookerExploreRegistry( self.looker_api, self.reporter, self.source_config ) @@ -1673,5 +1675,14 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield usage_mcp.as_workunit() self.reporter.report_stage_end("usage_extraction") + # Dump looker user resource mappings. + logger.info("Ingesting looker user resource mapping workunits") + self.reporter.report_stage_start("user_resource_extraction") + yield from auto_workunit( + self.user_registry.to_platform_resource( + self.source_config.platform_instance + ) + ) + def get_report(self) -> SourceReport: return self.reporter diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index f7458c4eb4d5b5..b49d40a0c7eb6a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -9,7 +9,7 @@ import datahub.emitter.mce_builder as builder from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.ingestion.source.common.subtypes import BIAssetSubTypes from datahub.ingestion.source.state.stale_entity_removal_handler import ( @@ -232,19 +232,6 @@ def default_for_dataset_type_mapping() -> Dict[str, str]: return dict_ -class PlatformDetail(ConfigModel): - platform_instance: Optional[str] = pydantic.Field( - default=None, - description="DataHub platform instance name. 
To generate correct urn for upstream dataset, this should match " - "with platform instance name used in ingestion " - "recipe of other datahub sources.", - ) - env: str = pydantic.Field( - default=builder.DEFAULT_ENV, - description="The environment that all assets produced by DataHub platform ingestion source belong to", - ) - - class DataBricksPlatformDetail(PlatformDetail): """ metastore is an additional field used in Databricks connector to generate the dataset urn diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py index baaa8d5b85ae10..6d51e853a2fb06 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py @@ -2,8 +2,8 @@ from abc import ABC, abstractmethod from typing import Union +from datahub.configuration.source_common import PlatformDetail from datahub.ingestion.source.powerbi.config import ( - PlatformDetail, PowerBiDashboardSourceConfig, PowerBIPlatformDetail, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py index ffaed79f4e42a6..63520bd731de86 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py @@ -5,13 +5,13 @@ from lark import Tree +from datahub.configuration.source_common import PlatformDetail from datahub.emitter import mce_builder as builder from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.powerbi.config import ( Constant, DataBricksPlatformDetail, DataPlatformPair, - PlatformDetail, PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, PowerBIPlatformDetail, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index c3a7912c40e8ee..e5883dd0349a3a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -540,6 +540,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: identifiers=self.identifiers, schema_resolver=schema_resolver, discovered_tables=discovered_datasets, + graph=self.ctx.graph, ) # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 79ea98d1c7f54e..f81eb291e89e1d 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -490,7 +490,7 @@ def __init__( self._exit_stack.push(self._query_usage_counts) # Tool Extractor - self._tool_meta_extractor = ToolMetaExtractor() + self._tool_meta_extractor = ToolMetaExtractor.create(graph) self.report.tool_meta_report = self._tool_meta_extractor.report def close(self) -> None: diff --git a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py index 0d85002776e5e2..5af9d9d4f0fffc 100644 --- 
a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py +++ b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py @@ -1,3 +1,4 @@ +import contextlib import json import logging from dataclasses import dataclass, field @@ -5,8 +6,15 @@ from typing_extensions import Protocol +from datahub.api.entities.platformresource.platform_resource import ( + ElasticPlatformResourceQuery, + PlatformResource, + PlatformResourceSearchFields, +) from datahub.ingestion.api.report import Report +from datahub.ingestion.graph.client import DataHubGraph from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn +from datahub.utilities.search_utils import LogicalOperator from datahub.utilities.stats_collections import int_top_k_dict UrnStr = str @@ -31,6 +39,7 @@ def _get_last_line(query: str) -> str: @dataclass class ToolMetaExtractorReport(Report): num_queries_meta_extracted: Dict[str, int] = field(default_factory=int_top_k_dict) + failures: List[str] = field(default_factory=list) class ToolMetaExtractor: @@ -42,14 +51,81 @@ class ToolMetaExtractor: by warehouse query logs. """ - def __init__(self) -> None: - self.report = ToolMetaExtractorReport() + def __init__( + self, + report: ToolMetaExtractorReport, + looker_user_mapping: Optional[Dict[str, str]] = None, + ) -> None: + self.report = report self.known_tool_extractors: List[Tuple[str, Callable[[QueryLog], bool]]] = [ ( "mode", self._extract_mode_query, - ) + ), + ( + "looker", + self._extract_looker_query, + ), ] + # maps user id (as string) to email address + self.looker_user_mapping = looker_user_mapping + + @classmethod + def create( + cls, + graph: Optional[DataHubGraph] = None, + ) -> "ToolMetaExtractor": + report = ToolMetaExtractorReport() + looker_user_mapping = None + if graph: + try: + looker_user_mapping = cls.extract_looker_user_mapping_from_graph( + graph, report + ) + except Exception as e: + report.failures.append( + f"Unexpected error during Looker user metadata extraction: {str(e)}" + ) + + return cls(report, looker_user_mapping) + + @classmethod + def extract_looker_user_mapping_from_graph( + cls, graph: DataHubGraph, report: ToolMetaExtractorReport + ) -> Optional[Dict[str, str]]: + looker_user_mapping = None + query = ( + ElasticPlatformResourceQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match(PlatformResourceSearchFields.PLATFORM, "looker") + .add_field_match( + PlatformResourceSearchFields.RESOURCE_TYPE, + "USER_ID_MAPPING", + ) + .end() + ) + platform_resources = list( + PlatformResource.search_by_filters(query=query, graph_client=graph) + ) + + if len(platform_resources) > 1: + report.failures.append( + "Looker user metadata extraction failed. Found more than one looker user id mappings." + ) + else: + platform_resource = platform_resources[0] + + if ( + platform_resource + and platform_resource.resource_info + and platform_resource.resource_info.value + ): + with contextlib.suppress(ValueError, AssertionError): + value = platform_resource.resource_info.value.as_raw_json() + if value: + looker_user_mapping = value + + return looker_user_mapping def _extract_mode_query(self, entry: QueryLog) -> bool: """ @@ -78,14 +154,49 @@ def _extract_mode_query(self, entry: QueryLog) -> bool: return True + def _extract_looker_query(self, entry: QueryLog) -> bool: + """ + Returns: + bool: whether QueryLog entry is that of looker and looker user info + is extracted into entry. 
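+
+        A Looker-issued query is expected to end with a trailing comment of the form
+        -- Looker Query Context '{"user_id": <id>, ...}'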
+ """ + if not self.looker_user_mapping: + return False + + last_line = _get_last_line(entry.query_text) + + if not (last_line.startswith("--") and "Looker Query Context" in last_line): + return False + + start_quote_idx = last_line.index("'") + end_quote_idx = last_line.rindex("'") + if start_quote_idx == -1 or end_quote_idx == -1: + return False + + looker_json_raw = last_line[start_quote_idx + 1 : end_quote_idx] + looker_json = json.loads(looker_json_raw) + + user_id = str(looker_json["user_id"]) + email = self.looker_user_mapping.get(user_id) + if not email: + return False + + original_user = entry.user + + entry.user = email_to_user_urn(email) + entry.extra_info = entry.extra_info or {} + entry.extra_info["user_via"] = original_user + + return True + def extract_bi_metadata(self, entry: QueryLog) -> bool: for tool, meta_extractor in self.known_tool_extractors: try: if meta_extractor(entry): self.report.num_queries_meta_extracted[tool] += 1 return True - except Exception: - logger.debug("Tool metadata extraction failed with error : {e}") + except Exception as e: + logger.debug(f"Tool metadata extraction failed with error : {e}") return False diff --git a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json index a9c445b5986efe..6ae772c134cb32 100644 --- a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json @@ -842,6 +842,62 @@ "pipelineName": "stateful-looker-pipeline" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json index af9c62a2a41803..d7620980a9cedb 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json @@ -497,6 +497,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": 
"urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json index b89bc356b48fdc..13963af55bfe56 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json @@ -735,6 +735,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json index 810fefd8f6cb85..f11d060102851c 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json @@ -735,6 +735,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": 
"platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json index 3d78397f54a235..f6e39dd5286cd0 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json @@ -828,6 +828,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json index 5a540e61e768d7..203bed843155c8 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json @@ -464,6 +464,21 @@ "/Folders/Shared" ] } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": 
"urn:li:corpuser:test-1@looker.com", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] } @@ -708,6 +723,21 @@ "/Folders/Personal" ] } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:test-2@looker.com", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] } @@ -1108,12 +1138,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/sales_model" + "/Explore/data" ] } }, @@ -1126,12 +1156,12 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "project": "lkml_samples", - "model": "sales_model", + "model": "data", "looker.explore.label": "My Explore View", - "looker.explore.name": "sales_explore", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", + "externalUrl": "https://looker.company.com/explore/data/my_view", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1153,7 +1183,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "sales_explore", + "schemaName": "my_view", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1208,7 +1238,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1227,12 +1257,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" + "renderUrl": "https://looker.company.com/embed/explore/data/my_view" } }, "systemMetadata": { @@ -1244,12 +1274,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } }, "systemMetadata": { @@ -1261,7 +1291,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1271,8 +1301,8 @@ "id": "Explore" }, { - "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", - "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", + "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } ] } @@ -1287,12 
+1317,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/data" + "/Explore/order_model" ] } }, @@ -1305,12 +1335,12 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "project": "lkml_samples", - "model": "data", + "model": "order_model", "looker.explore.label": "My Explore View", - "looker.explore.name": "my_view", + "looker.explore.name": "order_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/data/my_view", + "externalUrl": "https://looker.company.com/explore/order_model/order_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1332,7 +1362,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_view", + "schemaName": "order_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1387,7 +1417,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1406,12 +1436,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/data/my_view" + "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" } }, "systemMetadata": { @@ -1423,12 +1453,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } }, "systemMetadata": { @@ -1440,7 +1470,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1450,8 +1480,8 @@ "id": "Explore" }, { - "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", - "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "id": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60", + "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } ] } @@ -1466,12 +1496,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/order_model" + "/Explore/sales_model" ] } }, @@ -1484,12 +1514,12 @@ 
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "project": "lkml_samples", - "model": "order_model", + "model": "sales_model", "looker.explore.label": "My Explore View", - "looker.explore.name": "order_explore", + "looker.explore.name": "sales_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/order_model/order_explore", + "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1511,7 +1541,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "order_explore", + "schemaName": "sales_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1566,7 +1596,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1585,12 +1615,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" + "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" } }, "systemMetadata": { @@ -1602,12 +1632,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } }, "systemMetadata": { @@ -1619,7 +1649,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1629,8 +1659,8 @@ "id": "Explore" }, { - "id": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60", - "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", + "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } ] } @@ -1705,6 +1735,62 @@ "pipelineName": "execution-1" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { 
+ "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json index 9ac95b8482a475..87af50f95ed6bb 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json @@ -793,6 +793,60 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:8436a2a37c4a7e81fb08c9c8415d2e4b", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:8436a2a37c4a7e81fb08c9c8415d2e4b", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:looker,ap-south-1)" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:8436a2a37c4a7e81fb08c9c8415d2e4b", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json index 3a2c6359ea63c2..b990ce7c67dab6 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json @@ -759,6 +759,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + 
"lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json index 007eee348aeaf8..391192b3d16f36 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json @@ -513,6 +513,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json index 859b9163d7aad6..4909a6af73a225 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json @@ -464,6 +464,21 @@ "/Folders/Shared" ] } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:test-1@looker.com", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] } @@ -1185,6 +1200,62 @@ "pipelineName": "execution-1" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": 
"no-run-id-provided", + "pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json index 8256c984afb274..ddeb5428b1d726 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json @@ -762,6 +762,62 @@ "pipelineName": "stateful-looker-pipeline" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", @@ -814,8 +870,8 @@ } }, { - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,bogus data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -831,8 +887,8 @@ } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -865,8 +921,8 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,bogus 
data.explore.my_view,PROD)", + "entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 0b3530f9c24629..594983c8fb0f2a 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -678,6 +678,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/test_looker.py b/metadata-ingestion/tests/integration/looker/test_looker.py index 8bbf14709ff9fb..a39de8384efb23 100644 --- a/metadata-ingestion/tests/integration/looker/test_looker.py +++ b/metadata-ingestion/tests/integration/looker/test_looker.py @@ -83,6 +83,7 @@ def test_looker_ingest(pytestconfig, tmp_path, mock_time): with mock.patch("looker_sdk.init40") as mock_sdk: mock_sdk.return_value = mocked_client setup_mock_dashboard(mocked_client) + mocked_client.run_inline_query.side_effect = side_effect_query_inline setup_mock_explore(mocked_client) test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" @@ -319,6 +320,7 @@ def setup_mock_look(mocked_client): mocked_client.all_looks.return_value = [ Look( id="1", + user_id="1", title="Outer Look", description="I am not part of any Dashboard", query_id="1", @@ -327,6 +329,7 @@ def setup_mock_look(mocked_client): Look( id="2", title="Personal Look", + user_id="2", description="I am not part of any Dashboard and in personal folder", query_id="2", folder=FolderBase( @@ -561,6 +564,20 @@ def get_user( mocked_client.user.side_effect = get_user +def setup_mock_all_user(mocked_client): + def all_users( + fields: Optional[str] = None, + transport_options: Optional[transport.TransportOptions] = None, + ) -> List[User]: + return [ + User(id="1", email="test-1@looker.com"), + User(id="2", email="test-2@looker.com"), + User(id="3", email="test-3@looker.com"), + ] + + mocked_client.all_users.side_effect = all_users + + def side_effect_query_inline( 
result_format: str, body: WriteQuery, transport_options: Optional[TransportOptions] ) -> str: @@ -714,6 +731,7 @@ def test_looker_ingest_usage_history(pytestconfig, tmp_path, mock_time): mocked_client.run_inline_query.side_effect = side_effect_query_inline setup_mock_explore(mocked_client) setup_mock_user(mocked_client) + setup_mock_all_user(mocked_client) test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" @@ -946,6 +964,8 @@ def ingest_independent_looks( mock_sdk.return_value = mocked_client setup_mock_dashboard(mocked_client) setup_mock_explore(mocked_client) + setup_mock_user(mocked_client) + setup_mock_all_user(mocked_client) setup_mock_look(mocked_client) test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py b/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py index 6f590b53071467..f6566f007f5e6b 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py @@ -1,11 +1,14 @@ from datahub.configuration.datetimes import parse_absolute_time from datahub.metadata.urns import CorpUserUrn from datahub.sql_parsing.sql_parsing_aggregator import PreparsedQuery -from datahub.sql_parsing.tool_meta_extractor import ToolMetaExtractor +from datahub.sql_parsing.tool_meta_extractor import ( + ToolMetaExtractor, + ToolMetaExtractorReport, +) def test_extract_mode_metadata() -> None: - extractor = ToolMetaExtractor() + extractor = ToolMetaExtractor(report=ToolMetaExtractorReport()) query = """\ select * from LONG_TAIL_COMPANIONS.ADOPTION.PET_PROFILES LIMIT 100 @@ -30,8 +33,42 @@ def test_extract_mode_metadata() -> None: assert extractor.report.num_queries_meta_extracted["mode"] == 1 +def test_extract_looker_metadata() -> None: + extractor = ToolMetaExtractor( + report=ToolMetaExtractorReport(), looker_user_mapping={"7": "john.doe@xyz.com"} + ) + looker_query = """\ +SELECT + all_entities_extended_sibling."ENTITY" AS "all_entities_extended_sibling.entity_type", + COUNT(DISTINCT ( all_entities_extended_sibling."URN" )) AS "all_entities_extended_sibling.distinct_count" +FROM "PUBLIC"."ALL_ENTITIES" + AS all_entities_extended_sibling +GROUP BY + 1 +ORDER BY + 1 +FETCH NEXT 50 ROWS ONLY +-- Looker Query Context '{"user_id":7,"history_slug":"264797031bc403cf382cbefbe3700849","instance_slug":"32654f2ffadf10b1949d4009e52fc6a4"}' +""" + + entry = PreparsedQuery( + query_id=None, + query_text=looker_query, + upstreams=[], + downstream=None, + column_lineage=None, + column_usage=None, + inferred_schema=None, + user=CorpUserUrn("mode"), + timestamp=parse_absolute_time("2021-08-01T01:02:03Z"), + ) + assert extractor.extract_bi_metadata(entry) + assert entry.user == CorpUserUrn("john.doe") + assert extractor.report.num_queries_meta_extracted["looker"] == 1 + + def test_extract_no_metadata() -> None: - extractor = ToolMetaExtractor() + extractor = ToolMetaExtractor(report=ToolMetaExtractorReport()) query = """\ select * from LONG_TAIL_COMPANIONS.ADOPTION.PET_PROFILES LIMIT 100 @@ -53,3 +90,4 @@ def test_extract_no_metadata() -> None: assert not extractor.extract_bi_metadata(entry) assert extractor.report.num_queries_meta_extracted["mode"] == 0 + assert extractor.report.num_queries_meta_extracted["looker"] == 0 diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py 
b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py index 85c86f8d205d9a..5631ad2c69f949 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py @@ -37,7 +37,11 @@ def stateful_source(mock_datahub_graph: DataHubGraph) -> Iterable[SnowflakeV2Sou ), ) - with mock.patch("snowflake.connector.connect"): + with mock.patch( + "datahub.sql_parsing.sql_parsing_aggregator.ToolMetaExtractor.create", + ) as mock_checkpoint, mock.patch("snowflake.connector.connect"): + mock_checkpoint.return_value = mock.MagicMock() + yield SnowflakeV2Source(ctx=ctx, config=config) diff --git a/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl b/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl index 2f36eda9141abb..1a1dbea4359fbd 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl @@ -9,9 +9,13 @@ enum PlatformResourceType { /** * e.g. a Slack member resource, Looker user resource, etc. */ - USER_INFO, + USER_INFO, /** * e.g. a Slack channel */ CONVERSATION + /** + * e.g. Looker mapping of all user ids + */ + USER_ID_MAPPING } From e45f548910834dc5f2a61d0cd2168b69ec1172b2 Mon Sep 17 00:00:00 2001 From: skrydal Date: Thu, 19 Dec 2024 16:25:59 +0100 Subject: [PATCH 08/35] feat(ingest/iceberg): Improve iceberg connector (#12163) --- .../ingestion/source/iceberg/iceberg.py | 28 ++- .../source/iceberg/iceberg_common.py | 4 + metadata-ingestion/tests/unit/test_iceberg.py | 168 ++++++++++++++++-- 3 files changed, 189 insertions(+), 11 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py index 5931873f54236d..76f24bfd63d476 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py @@ -10,6 +10,7 @@ NoSuchNamespaceError, NoSuchPropertyException, NoSuchTableError, + ServerError, ) from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit from pyiceberg.table import Table @@ -145,6 +146,13 @@ def _get_datasets(self, catalog: Catalog) -> Iterable[Identifier]: self.report.report_no_listed_namespaces(len(namespaces)) tables_count = 0 for namespace in namespaces: + namespace_repr = ".".join(namespace) + if not self.config.namespace_pattern.allowed(namespace_repr): + LOGGER.info( + f"Namespace {namespace_repr} is not allowed by config pattern, skipping" + ) + self.report.report_dropped(f"{namespace_repr}.*") + continue try: tables = catalog.list_tables(namespace) tables_count += len(tables) @@ -181,6 +189,9 @@ def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]: if not self.config.table_pattern.allowed(dataset_name): # Dataset name is rejected by pattern, report as dropped. 
self.report.report_dropped(dataset_name) + LOGGER.debug( + f"Skipping table {dataset_name} due to not being allowed by the config pattern" + ) return try: if not hasattr(thread_local, "local_catalog"): @@ -219,6 +230,22 @@ def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]: LOGGER.warning( f"NoSuchTableError while processing table {dataset_path}, skipping it.", ) + except FileNotFoundError as e: + self.report.report_warning( + "file-not-found", + f"Encountered FileNotFoundError when trying to read manifest file for {dataset_name}. {e}", + ) + LOGGER.warning( + f"FileNotFoundError while processing table {dataset_path}, skipping it." + ) + except ServerError as e: + self.report.report_warning( + "iceberg-rest-server-error", + f"Iceberg Rest Catalog returned 500 status due to an unhandled exception for {dataset_name}. Exception: {e}", + ) + LOGGER.warning( + f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it." + ) except Exception as e: self.report.report_failure("general", f"Failed to create workunit: {e}") LOGGER.exception( @@ -269,7 +296,6 @@ def _create_iceberg_workunit( ] = table.current_snapshot().manifest_list dataset_properties = DatasetPropertiesClass( name=table.name()[-1], - tags=[], description=table.metadata.properties.get("comment", None), customProperties=custom_properties, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py index 98ad9e552d35c9..4a7f6bf4d60c1d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py @@ -68,6 +68,10 @@ class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin) default=AllowDenyPattern.allow_all(), description="Regex patterns for tables to filter in ingestion.", ) + namespace_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="Regex patterns for namespaces to filter in ingestion.", + ) user_ownership_property: Optional[str] = Field( default="owner", description="Iceberg table property to look for a `CorpUser` owner. Can only hold a single user value. 
If property has no value, no owner information will be emitted.", diff --git a/metadata-ingestion/tests/unit/test_iceberg.py b/metadata-ingestion/tests/unit/test_iceberg.py index b8a136586a2bf5..3afa26b35dfe9f 100644 --- a/metadata-ingestion/tests/unit/test_iceberg.py +++ b/metadata-ingestion/tests/unit/test_iceberg.py @@ -10,6 +10,8 @@ NoSuchIcebergTableError, NoSuchNamespaceError, NoSuchPropertyException, + NoSuchTableError, + ServerError, ) from pyiceberg.io.pyarrow import PyArrowFileIO from pyiceberg.partitioning import PartitionSpec @@ -39,6 +41,7 @@ UUIDType, ) +from datahub.configuration.common import AllowDenyPattern from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.iceberg.iceberg import ( @@ -62,12 +65,12 @@ ) -def with_iceberg_source(processing_threads: int = 1) -> IcebergSource: +def with_iceberg_source(processing_threads: int = 1, **kwargs: Any) -> IcebergSource: catalog = {"test": {"type": "rest"}} return IcebergSource( ctx=PipelineContext(run_id="iceberg-source-test"), config=IcebergSourceConfig( - catalog=catalog, processing_threads=processing_threads + catalog=catalog, processing_threads=processing_threads, **kwargs ), ) @@ -542,11 +545,11 @@ def __init__(self, tables: Dict[str, Dict[str, Callable[[], Table]]]): """ self.tables = tables - def list_namespaces(self) -> Iterable[str]: - return [*self.tables.keys()] + def list_namespaces(self) -> Iterable[Tuple[str]]: + return [*[(key,) for key in self.tables.keys()]] def list_tables(self, namespace: str) -> Iterable[Tuple[str, str]]: - return [(namespace, table) for table in self.tables[namespace].keys()] + return [(namespace[0], table) for table in self.tables[namespace[0]].keys()] def load_table(self, dataset_path: Tuple[str, str]) -> Table: return self.tables[dataset_path[0]][dataset_path[1]]() @@ -554,15 +557,15 @@ def load_table(self, dataset_path: Tuple[str, str]) -> Table: class MockCatalogExceptionListingTables(MockCatalog): def list_tables(self, namespace: str) -> Iterable[Tuple[str, str]]: - if namespace == "no_such_namespace": + if namespace == ("no_such_namespace",): raise NoSuchNamespaceError() - if namespace == "generic_exception": + if namespace == ("generic_exception",): raise Exception() return super().list_tables(namespace) class MockCatalogExceptionListingNamespaces(MockCatalog): - def list_namespaces(self) -> Iterable[str]: + def list_namespaces(self) -> Iterable[Tuple[str]]: raise Exception() @@ -814,15 +817,157 @@ def test_proper_run_with_multiple_namespaces() -> None: ) +def test_filtering() -> None: + source = with_iceberg_source( + processing_threads=1, + table_pattern=AllowDenyPattern(deny=[".*abcd.*"]), + namespace_pattern=AllowDenyPattern(allow=["namespace1"]), + ) + mock_catalog = MockCatalog( + { + "namespace1": { + "table_xyz": lambda: Table( + identifier=("namespace1", "table_xyz"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace1/table_xyz", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace1/table_xyz", + io=PyArrowFileIO(), + catalog=None, + ), + "JKLtable": lambda: Table( + identifier=("namespace1", "JKLtable"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace1/JKLtable", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace1/JKLtable", + io=PyArrowFileIO(), + catalog=None, + ), 
+ "table_abcd": lambda: Table( + identifier=("namespace1", "table_abcd"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace1/table_abcd", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace1/table_abcd", + io=PyArrowFileIO(), + catalog=None, + ), + "aaabcd": lambda: Table( + identifier=("namespace1", "aaabcd"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace1/aaabcd", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace1/aaabcd", + io=PyArrowFileIO(), + catalog=None, + ), + }, + "namespace2": { + "foo": lambda: Table( + identifier=("namespace2", "foo"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace2/foo", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace2/foo", + io=PyArrowFileIO(), + catalog=None, + ), + "bar": lambda: Table( + identifier=("namespace2", "bar"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace2/bar", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace2/bar", + io=PyArrowFileIO(), + catalog=None, + ), + }, + "namespace3": { + "sales": lambda: Table( + identifier=("namespace3", "sales"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace3/sales", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace3/sales", + io=PyArrowFileIO(), + catalog=None, + ), + "products": lambda: Table( + identifier=("namespace2", "bar"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace3/products", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace3/products", + io=PyArrowFileIO(), + catalog=None, + ), + }, + } + ) + with patch( + "datahub.ingestion.source.iceberg.iceberg.IcebergSourceConfig.get_catalog" + ) as get_catalog: + get_catalog.return_value = mock_catalog + wu: List[MetadataWorkUnit] = [*source.get_workunits_internal()] + assert len(wu) == 2 + urns = [] + for unit in wu: + assert isinstance(unit.metadata, MetadataChangeEvent) + assert isinstance(unit.metadata.proposedSnapshot, DatasetSnapshotClass) + urns.append(unit.metadata.proposedSnapshot.urn) + TestCase().assertCountEqual( + urns, + [ + "urn:li:dataset:(urn:li:dataPlatform:iceberg,namespace1.table_xyz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:iceberg,namespace1.JKLtable,PROD)", + ], + ) + assert source.report.tables_scanned == 2 + + def test_handle_expected_exceptions() -> None: source = with_iceberg_source(processing_threads=3) def _raise_no_such_property_exception(): raise NoSuchPropertyException() - def _raise_no_such_table_exception(): + def _raise_no_such_iceberg_table_exception(): raise NoSuchIcebergTableError() + def _raise_file_not_found_error(): + raise FileNotFoundError() + + def _raise_no_such_table_exception(): + raise NoSuchTableError() + + def _raise_server_error(): + raise ServerError() + mock_catalog = MockCatalog( { "namespaceA": { @@ -876,6 +1021,9 @@ def _raise_no_such_table_exception(): ), "table5": _raise_no_such_property_exception, "table6": _raise_no_such_table_exception, + "table7": _raise_file_not_found_error, + "table8": 
_raise_no_such_iceberg_table_exception, + "table9": _raise_server_error, } } ) @@ -899,7 +1047,7 @@ def _raise_no_such_table_exception(): "urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceA.table4,PROD)", ], ) - assert source.report.warnings.total_elements == 2 + assert source.report.warnings.total_elements == 5 assert source.report.failures.total_elements == 0 assert source.report.tables_scanned == 4 From 08605a95a78df3f2a47c42a1e595b01f52dcc5e5 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 19 Dec 2024 11:02:37 -0500 Subject: [PATCH 09/35] feat(python): split out temp wheel builds (#12157) --- .github/workflows/airflow-plugin.yml | 5 +- .github/workflows/dagster-plugin.yml | 8 +- .github/workflows/gx-plugin.yml | 8 +- .github/workflows/metadata-ingestion.yml | 9 +- .github/workflows/prefect-plugin.yml | 17 +-- .github/workflows/python-build-pages.yml | 64 ++++++++++ docs-website/build.gradle | 6 +- docs-website/generateDocsDir.ts | 24 ++-- metadata-ingestion/build.gradle | 4 +- python-build/.gitignore | 3 + python-build/build.gradle | 27 ++++ python-build/build_site.py | 150 +++++++++++++++++++++++ python-build/copy_wheels.py | 27 ++++ settings.gradle | 1 + 14 files changed, 304 insertions(+), 49 deletions(-) create mode 100644 .github/workflows/python-build-pages.yml create mode 100644 python-build/.gitignore create mode 100644 python-build/build.gradle create mode 100644 python-build/build_site.py create mode 100644 python-build/copy_wheels.py diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml index eefa02be4f1af8..26fcceb8aeab70 100644 --- a/.github/workflows/airflow-plugin.yml +++ b/.github/workflows/airflow-plugin.yml @@ -27,7 +27,6 @@ jobs: airflow-plugin: runs-on: ubuntu-latest env: - SPARK_VERSION: 3.0.3 DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: @@ -69,7 +68,7 @@ jobs: - name: pip freeze show list installed if: always() run: source metadata-ingestion-modules/airflow-plugin/venv/bin/activate && uv pip freeze - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: ${{ always() && matrix.python-version == '3.10' && matrix.extra_pip_requirements == 'apache-airflow>=2.7.0' }} with: name: Test Results (Airflow Plugin ${{ matrix.python-version}}) @@ -93,7 +92,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/dagster-plugin.yml b/.github/workflows/dagster-plugin.yml index bee1ec95e77747..d8a9cd7bfd6a35 100644 --- a/.github/workflows/dagster-plugin.yml +++ b/.github/workflows/dagster-plugin.yml @@ -27,7 +27,6 @@ jobs: dagster-plugin: runs-on: ubuntu-latest env: - SPARK_VERSION: 3.0.3 DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: @@ -44,7 +43,8 @@ jobs: with: distribution: "zulu" java-version: 17 - - uses: actions/checkout@v4 + - uses: gradle/actions/setup-gradle@v3 + - uses: acryldata/sane-checkout-action@v3 - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} @@ -56,7 +56,7 @@ jobs: - name: pip freeze show list installed if: always() run: source metadata-ingestion-modules/dagster-plugin/venv/bin/activate && uv pip freeze - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: ${{ always() && matrix.python-version == '3.10' && matrix.extraPythonRequirement == 'dagster>=1.3.3' }} with: name: Test Results (dagster Plugin ${{ matrix.python-version}}) @@ -79,7 +79,7 @@ jobs: runs-on: ubuntu-latest 
steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/gx-plugin.yml b/.github/workflows/gx-plugin.yml index 595438bd6e4a90..2fd814a0764858 100644 --- a/.github/workflows/gx-plugin.yml +++ b/.github/workflows/gx-plugin.yml @@ -27,7 +27,6 @@ jobs: gx-plugin: runs-on: ubuntu-latest env: - SPARK_VERSION: 3.0.3 DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: @@ -48,7 +47,8 @@ jobs: with: distribution: "zulu" java-version: 17 - - uses: actions/checkout@v4 + - uses: gradle/actions/setup-gradle@v3 + - uses: acryldata/sane-checkout-action@v3 - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} @@ -60,7 +60,7 @@ jobs: - name: pip freeze show list installed if: always() run: source metadata-ingestion-modules/gx-plugin/venv/bin/activate && uv pip freeze - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: ${{ always() && matrix.python-version == '3.11' && matrix.extraPythonRequirement == 'great-expectations~=0.17.0' }} with: name: Test Results (GX Plugin ${{ matrix.python-version}}) @@ -83,7 +83,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml index 49def2a863c565..ad00c6d1551d1d 100644 --- a/.github/workflows/metadata-ingestion.yml +++ b/.github/workflows/metadata-ingestion.yml @@ -28,7 +28,6 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 40 env: - SPARK_VERSION: 3.3.2 DATAHUB_TELEMETRY_ENABLED: false # TODO: Enable this once the test is fixed. # DATAHUB_LOOKML_GIT_TEST_SSH_KEY: ${{ secrets.DATAHUB_LOOKML_GIT_TEST_SSH_KEY }} @@ -84,9 +83,9 @@ jobs: df -hl docker image ls docker system df - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: - name: Test Results (metadata ingestion ${{ matrix.python-version }}) + name: Test Results (metadata ingestion ${{ matrix.python-version }} ${{ matrix.command }}) path: | **/build/reports/tests/test/** **/build/test-results/test/** @@ -100,14 +99,14 @@ jobs: directory: ./build/coverage-reports/ fail_ci_if_error: false flags: pytest-${{ matrix.command }} - name: pytest-${{ matrix.command }} + name: pytest-${{ matrix.python-version }}-${{ matrix.command }} verbose: true event-file: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/prefect-plugin.yml b/.github/workflows/prefect-plugin.yml index 3c75e8fe9a62ff..e4a70426f3a618 100644 --- a/.github/workflows/prefect-plugin.yml +++ b/.github/workflows/prefect-plugin.yml @@ -27,25 +27,20 @@ jobs: prefect-plugin: runs-on: ubuntu-latest env: - SPARK_VERSION: 3.0.3 DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: python-version: ["3.8", "3.9", "3.10"] - include: - - python-version: "3.8" - - python-version: "3.9" - - python-version: "3.10" fail-fast: false steps: - name: Set up JDK 17 - uses: actions/setup-java@v3 + uses: actions/setup-java@v4 with: distribution: "zulu" java-version: 17 - uses: gradle/actions/setup-gradle@v3 - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - uses: acryldata/sane-checkout-action@v3 + - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} cache: "pip" @@ -56,7 +51,7 @@ 
jobs: - name: pip freeze show list installed if: always() run: source metadata-ingestion-modules/prefect-plugin/venv/bin/activate && uv pip freeze - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: ${{ always() && matrix.python-version == '3.10'}} with: name: Test Results (Prefect Plugin ${{ matrix.python-version}}) @@ -72,7 +67,7 @@ jobs: token: ${{ secrets.CODECOV_TOKEN }} directory: ./build/coverage-reports/ fail_ci_if_error: false - flags: prefect,prefect-${{ matrix.extra_pip_extras }} + flags: prefect,prefect-${{ matrix.python-version }} name: pytest-prefect-${{ matrix.python-version }} verbose: true @@ -80,7 +75,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/python-build-pages.yml b/.github/workflows/python-build-pages.yml new file mode 100644 index 00000000000000..8971722c374fb7 --- /dev/null +++ b/.github/workflows/python-build-pages.yml @@ -0,0 +1,64 @@ +name: Python Build +on: + push: + branches: + - master + paths: + - ".github/workflows/python-build-pages.yml" + - "metadata-ingestion/**" + - "metadata-ingestion-modules/**" + - "metadata-models/**" + pull_request: + branches: + - "**" + paths: + - ".github/workflows/python-build-pages.yml" + - "metadata-ingestion/**" + - "metadata-ingestion-modules/**" + - "metadata-models/**" + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + deploy-pages: + runs-on: ubuntu-latest + if: ${{ vars.CLOUDFLARE_WHEELS_PROJECT_NAME != '' }} + + name: Python Wheels + permissions: + contents: read + pull-requests: read + deployments: write + steps: + - name: Set up JDK 17 + uses: actions/setup-java@v4 + with: + distribution: "zulu" + java-version: 17 + - uses: gradle/actions/setup-gradle@v3 + - uses: acryldata/sane-checkout-action@v3 + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + cache: "pip" + - uses: actions/cache@v4 + with: + path: | + ~/.cache/uv + key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }} + - name: Build Python wheel site + run: | + ./gradlew :python-build:buildSite + env: + GITHUB_TOKEN: ${{ github.token }} + - name: Publish + uses: cloudflare/pages-action@v1 + with: + apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }} + accountId: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }} + projectName: ${{ vars.CLOUDFLARE_WHEELS_PROJECT_NAME }} + workingDirectory: python-build + directory: site + gitHubToken: ${{ github.token }} diff --git a/docs-website/build.gradle b/docs-website/build.gradle index 1860b4a49ae23a..797863d2019fbd 100644 --- a/docs-website/build.gradle +++ b/docs-website/build.gradle @@ -83,11 +83,7 @@ task yarnInstall(type: YarnTask) { task yarnGenerate(type: YarnTask, dependsOn: [yarnInstall, generateGraphQLSchema, generateJsonSchema, ':metadata-ingestion:modelDocGen', ':metadata-ingestion:docGen', - ':metadata-ingestion:buildWheel', - ':metadata-ingestion-modules:airflow-plugin:buildWheel', - ':metadata-ingestion-modules:dagster-plugin:buildWheel', - ':metadata-ingestion-modules:prefect-plugin:buildWheel', - ':metadata-ingestion-modules:gx-plugin:buildWheel', + ':python-build:buildWheels', ]) { inputs.files(projectMdFiles) outputs.cacheIf { true } diff --git a/docs-website/generateDocsDir.ts b/docs-website/generateDocsDir.ts index 0f7e347da64eba..ad82a85f9e5672 100644 --- a/docs-website/generateDocsDir.ts +++ 
b/docs-website/generateDocsDir.ts @@ -573,26 +573,20 @@ function write_markdown_file( function copy_python_wheels(): void { // Copy the built wheel files to the static directory. - const wheel_dirs = [ - "../metadata-ingestion/dist", - "../metadata-ingestion-modules/airflow-plugin/dist", - "../metadata-ingestion-modules/dagster-plugin/dist", - "../metadata-ingestion-modules/prefect-plugin/dist", - "../metadata-ingestion-modules/gx-plugin/dist", - ]; + // Everything is copied to the python-build directory first, so + // we just need to copy from there. + const wheel_dir = "../python-build/wheels"; const wheel_output_directory = path.join(STATIC_DIRECTORY, "wheels"); fs.mkdirSync(wheel_output_directory, { recursive: true }); - for (const wheel_dir of wheel_dirs) { - const wheel_files = fs.readdirSync(wheel_dir); - for (const wheel_file of wheel_files) { - const src = path.join(wheel_dir, wheel_file); - const dest = path.join(wheel_output_directory, wheel_file); + const wheel_files = fs.readdirSync(wheel_dir); + for (const wheel_file of wheel_files) { + const src = path.join(wheel_dir, wheel_file); + const dest = path.join(wheel_output_directory, wheel_file); - // console.log(`Copying artifact ${src} to ${dest}...`); - fs.copyFileSync(src, dest); - } + // console.log(`Copying artifact ${src} to ${dest}...`); + fs.copyFileSync(src, dest); } } diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index 2c5d8e6c9646a8..fc1409fbed74e4 100644 --- a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -23,8 +23,8 @@ task environmentSetup(type: Exec, dependsOn: checkPythonVersion) { inputs.file file('setup.py') outputs.file(sentinel_file) commandLine 'bash', '-c', - "${python_executable} -m venv ${venv_name} && " + - "${venv_name}/bin/python -m pip install --upgrade pip uv wheel 'setuptools>=63.0.0' && " + + "${python_executable} -m venv ${venv_name} && set -x && " + + "${venv_name}/bin/python -m pip install --upgrade uv && " + "touch ${sentinel_file}" } diff --git a/python-build/.gitignore b/python-build/.gitignore new file mode 100644 index 00000000000000..d2de6dec258091 --- /dev/null +++ b/python-build/.gitignore @@ -0,0 +1,3 @@ + +/wheels +/site diff --git a/python-build/build.gradle b/python-build/build.gradle new file mode 100644 index 00000000000000..e90bffd46828ce --- /dev/null +++ b/python-build/build.gradle @@ -0,0 +1,27 @@ +plugins { + id 'base' +} + +ext { + python_executable = 'python3' +} + +task checkPythonVersion(type: Exec) { + commandLine python_executable, '-c', + 'import sys; sys.version_info >= (3, 8), f"Python version {sys.version_info} is too old"' +} + +task buildWheels(type: Exec, dependsOn: [ + checkPythonVersion, + ':metadata-ingestion:buildWheel', + ':metadata-ingestion-modules:airflow-plugin:buildWheel', + ':metadata-ingestion-modules:dagster-plugin:buildWheel', + ':metadata-ingestion-modules:prefect-plugin:buildWheel', + ':metadata-ingestion-modules:gx-plugin:buildWheel', +]) { + commandLine python_executable, "copy_wheels.py" +} + +task buildSite(type: Exec, dependsOn: [buildWheels]) { + commandLine python_executable, "build_site.py" +} diff --git a/python-build/build_site.py b/python-build/build_site.py new file mode 100644 index 00000000000000..73941eca9968ca --- /dev/null +++ b/python-build/build_site.py @@ -0,0 +1,150 @@ +import contextlib +import json +import os +import pathlib +import shutil +import subprocess +from datetime import datetime, timezone + +PYTHON_BUILD_DIR = pathlib.Path(__file__).parent +WHEEL_DIR = 
PYTHON_BUILD_DIR / "wheels" +SITE_OUTPUT_DIR = PYTHON_BUILD_DIR / "site" + +shutil.rmtree(SITE_OUTPUT_DIR, ignore_errors=True) +SITE_OUTPUT_DIR.mkdir(parents=True) + +SITE_ARTIFACT_WHEEL_DIR = SITE_OUTPUT_DIR / "artifacts" / "wheels" +SITE_ARTIFACT_WHEEL_DIR.mkdir(parents=True) +for wheel_file in WHEEL_DIR.glob("*"): + shutil.copy(wheel_file, SITE_ARTIFACT_WHEEL_DIR) + + +def package_name(wheel_file: pathlib.Path) -> str: + return wheel_file.name.split("-")[0].replace("_", "-") + + +# Get some extra context about the build +ts = datetime.now(timezone.utc).isoformat() +context_info: dict = { + "timestamp": ts, +} + +# Get branch info. +with contextlib.suppress(Exception): + if branch_info := os.getenv("GITHUB_HEAD_REF"): + pass + else: + branch_info = subprocess.check_output( + ["git", "branch", "--show-current"], text=True + ) + context_info["branch"] = branch_info.strip() + +# Get commit info. +with contextlib.suppress(Exception): + commit_info = subprocess.check_output( + ["git", "log", "-1", "--pretty=%H%n%B"], text=True + ) + commit_hash, commit_msg = commit_info.strip().split("\n", 1) + context_info["commit"] = { + "hash": commit_hash, + "message": commit_msg.strip(), + } + +# Get PR info. +with contextlib.suppress(Exception): + pr_info = "unknown" + if github_ref := os.getenv("GITHUB_REF"): + # e.g. GITHUB_REF=refs/pull/12157/merge + parts = github_ref.split("/") + if parts[1] == "pull": + pull_number = parts[2] + pr_info = json.loads( + subprocess.check_output( + ["gh", "pr", "view", pull_number, "--json", "title,number,url"], + text=True, + ) + ) + else: + # The `gh` CLI might be able to figure it out. + pr_info = json.loads( + subprocess.check_output( + ["gh", "pr", "view", "--json", "title,number,url"], text=True + ) + ) + context_info["pr"] = pr_info + + +newline = "\n" +(SITE_OUTPUT_DIR / "index.html").write_text( + f""" + + + DataHub Python Builds + + + + + + + + + + + +
+  <h1>DataHub Python Builds</h1>
+  <p>
+    These prebuilt wheel files can be used to install our Python packages as of a specific commit.
+  </p>
+
+  <h2>Build context</h2>
+  <p>
+    Built at {ts}.
+  </p>
+  <pre>{json.dumps(context_info, indent=2)}</pre>
+
+  <h2>Usage</h2>
+  <p>
+    Current base URL: unknown
+  </p>
+
+  <table>
+    <thead>
+      <tr>
+        <th>Package</th>
+        <th>Size</th>
+        <th>Install command</th>
+      </tr>
+    </thead>
+    <tbody>
+      {
+        newline.join(
+            f'''
+            <tr>
+              <td>{package_name(wheel_file)}</td>
+              <td>{wheel_file.stat().st_size / 1024 / 1024:.3f} MB</td>
+              <td>uv pip install '{package_name(wheel_file)} @ <base-url>/artifacts/wheels/{wheel_file.name}'</td>
+            </tr>
+            '''
+            for wheel_file in sorted(WHEEL_DIR.glob("*.whl"))
+        )
+      }
+    </tbody>
+  </table>
+ + + +""" +) + +print("DataHub Python wheel site built in", SITE_OUTPUT_DIR) diff --git a/python-build/copy_wheels.py b/python-build/copy_wheels.py new file mode 100644 index 00000000000000..b66662cbfe9914 --- /dev/null +++ b/python-build/copy_wheels.py @@ -0,0 +1,27 @@ +import pathlib +import shutil + +PYTHON_BUILD_DIR = pathlib.Path(__file__).parent +ROOT_DIR = PYTHON_BUILD_DIR.parent +WHEEL_OUTPUT_DIR = PYTHON_BUILD_DIR / "wheels" + +# These should line up with the build.gradle file. +wheel_dirs = [ + ROOT_DIR / "metadata-ingestion/dist", + ROOT_DIR / "metadata-ingestion-modules/airflow-plugin/dist", + ROOT_DIR / "metadata-ingestion-modules/dagster-plugin/dist", + ROOT_DIR / "metadata-ingestion-modules/prefect-plugin/dist", + ROOT_DIR / "metadata-ingestion-modules/gx-plugin/dist", +] + +# Delete and recreate the output directory. +if WHEEL_OUTPUT_DIR.exists(): + shutil.rmtree(WHEEL_OUTPUT_DIR) +WHEEL_OUTPUT_DIR.mkdir(parents=True) + +# Copy things over. +for wheel_dir in wheel_dirs: + for wheel_file in wheel_dir.glob("*"): + shutil.copy(wheel_file, WHEEL_OUTPUT_DIR) + +print("Copied wheels to", WHEEL_OUTPUT_DIR) diff --git a/settings.gradle b/settings.gradle index 8756df31c1ac6f..b0c2c707d566c0 100644 --- a/settings.gradle +++ b/settings.gradle @@ -64,6 +64,7 @@ include 'metadata-ingestion-modules:airflow-plugin' include 'metadata-ingestion-modules:gx-plugin' include 'metadata-ingestion-modules:dagster-plugin' include 'metadata-ingestion-modules:prefect-plugin' +include 'python-build' include 'smoke-test' include 'metadata-auth:auth-api' include 'metadata-service:schema-registry-api' From 89acda66d0d56d01a2645d9c8cced7c593b65e99 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Thu, 19 Dec 2024 10:18:30 -0600 Subject: [PATCH 10/35] docs(release): v0.3.7.7 (#12091) --- docs/managed-datahub/release-notes/v_0_3_7.md | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/docs/managed-datahub/release-notes/v_0_3_7.md b/docs/managed-datahub/release-notes/v_0_3_7.md index be3a2d97514efa..75f5ac21224c27 100644 --- a/docs/managed-datahub/release-notes/v_0_3_7.md +++ b/docs/managed-datahub/release-notes/v_0_3_7.md @@ -13,12 +13,43 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies ## Known Issues +### v0.3.7.7 + * Postgres regression, non-functional when using postgres + ### v0.3.7.3 * Search page fails to render when filters are applied with a query which returns zero results. 
## Release Changelog --- +### v0.3.7.8 + +- [Postgres] Fix regression from MySQL fix in v0.3.7.7 + +### v0.3.7.7 + +- [UI] Fix bug showing upstream lineage dbt source leaves +- [UI] Show column-level lineage through transformational home node +- [UI] Browse nodes titles expand to full width of panel +- [UI] Data product preview cards display correctly +- [UI] Fix elasticsearch usage sort field names +- [UI] Add structured property display settings feature +- [Executor] Fix false errors on cli ingestions +- [Search] Schema field boost reduced +- [Search] Search usage ranking null_fill fix +- [Search] Single term with underscores by default no longer considered quoted +- [Metadata Tests] Metadata Test shutdown actions flush +- [Metadata Tests] Add deduplicate logic for MCP batches +- [Metadata Tests] Prevent mutation of systemMetadata in patch batches +- [MAE Consumer] Fix graph edge on container delete exception +- [Notifications] Filter out system ingestion source notifications +- [MySQL] Fix index gap lock deadlock +- [API] DataJobInputOutput finegrained lineage fix + +### v0.3.7.6 + +- [UI] fix(automations): white screen automations with dbt sync + ### v0.3.7.5 - [GMS] Fix upstream lineage patching when path contained encoded slash From 9031b49b2345f79db5504f80432af1cd8a77a5e5 Mon Sep 17 00:00:00 2001 From: John Joyce Date: Thu, 19 Dec 2024 09:07:59 -0800 Subject: [PATCH 11/35] fix(docs): Add improvements in examples for PATCH documentation (#12165) Co-authored-by: John Joyce Co-authored-by: John Joyce --- docs/advanced/patch.md | 110 +++++++++++++----- docs/api/tutorials/custom-properties.md | 4 +- .../dataset_add_custom_properties_patch.py | 19 +++ .../dataset_add_glossary_term_patch.py | 22 ++++ .../library/dataset_add_owner_patch.py | 24 ++++ .../library/dataset_add_properties.py | 44 ------- ...aset_add_remove_custom_properties_patch.py | 19 +++ .../library/dataset_add_remove_properties.py | 46 -------- .../dataset_add_structured_properties.py | 24 ---- ...dataset_add_structured_properties_patch.py | 23 ++++ .../examples/library/dataset_add_tag_patch.py | 22 ++++ .../dataset_add_upstream_lineage_patch.py | 62 ++++++++++ .../dataset_field_add_glossary_term_patch.py | 26 +++++ .../library/dataset_field_add_tag_patch.py | 24 ++++ 14 files changed, 321 insertions(+), 148 deletions(-) create mode 100644 metadata-ingestion/examples/library/dataset_add_custom_properties_patch.py create mode 100644 metadata-ingestion/examples/library/dataset_add_glossary_term_patch.py create mode 100644 metadata-ingestion/examples/library/dataset_add_owner_patch.py delete mode 100644 metadata-ingestion/examples/library/dataset_add_properties.py create mode 100644 metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py delete mode 100644 metadata-ingestion/examples/library/dataset_add_remove_properties.py delete mode 100644 metadata-ingestion/examples/library/dataset_add_structured_properties.py create mode 100644 metadata-ingestion/examples/library/dataset_add_structured_properties_patch.py create mode 100644 metadata-ingestion/examples/library/dataset_add_tag_patch.py create mode 100644 metadata-ingestion/examples/library/dataset_add_upstream_lineage_patch.py create mode 100644 metadata-ingestion/examples/library/dataset_field_add_glossary_term_patch.py create mode 100644 metadata-ingestion/examples/library/dataset_field_add_tag_patch.py diff --git a/docs/advanced/patch.md b/docs/advanced/patch.md index 601d0556593136..24e8c68a9168dd 100644 --- a/docs/advanced/patch.md +++ 
b/docs/advanced/patch.md @@ -1,69 +1,120 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# But First, Semantics: Upsert versus Patch +# Emitting Patch Updates to DataHub ## Why Would You Use Patch -By default, most of the SDK tutorials and API-s involve applying full upserts at the aspect level. This means that typically, when you want to change one field within an aspect without modifying others, you need to do a read-modify-write to not overwrite existing fields. -To support these scenarios, DataHub supports PATCH based operations so that targeted changes to single fields or values within arrays of fields are possible without impacting other existing metadata. +By default, most of the SDK tutorials and APIs involve applying full upserts at the aspect level, e.g. replacing the aspect entirely. +This means that when you want to change even a single field within an aspect without modifying others, you need to do a read-modify-write to avoid overwriting existing fields. +To support these scenarios, DataHub supports `PATCH` operations to perform targeted changes for individual fields or values within arrays of fields are possible without impacting other existing metadata. :::note -Currently, PATCH support is only available for a selected set of aspects, so before pinning your hopes on using PATCH as a way to make modifications to aspect values, confirm whether your aspect supports PATCH semantics. The complete list of Aspects that are supported are maintained [here](https://github.com/datahub-project/datahub/blob/9588440549f3d99965085e97b214a7dabc181ed2/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/AspectTemplateEngine.java#L24). In the near future, we do have plans to automatically support PATCH semantics for aspects by default. +Currently, PATCH support is only available for a selected set of aspects, so before pinning your hopes on using PATCH as a way to make modifications to aspect values, confirm whether your aspect supports PATCH semantics. The complete list of Aspects that are supported are maintained [here](https://github.com/datahub-project/datahub/blob/9588440549f3d99965085e97b214a7dabc181ed2/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/AspectTemplateEngine.java#L24). ::: -## How To Use Patch +## How To Use Patches -Examples for using Patch are sprinkled throughout the API guides. Here's how to find the appropriate classes for the language for your choice. - - + -The Java Patch builders are aspect-oriented and located in the [datahub-client](https://github.com/datahub-project/datahub/tree/master/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch) module under the `datahub.client.patch` namespace. +The Python Patch builders are entity-oriented and located in the [metadata-ingestion](https://github.com/datahub-project/datahub/tree/9588440549f3d99965085e97b214a7dabc181ed2/metadata-ingestion/src/datahub/specific) module and located in the `datahub.specific` module. 
+Patch builder helper classes exist for -Here are a few illustrative examples using the Java Patch builders: +- [Datasets](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/dataset.py) +- [Charts](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/chart.py) +- [Dashboards](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/dashboard.py) +- [Data Jobs (Tasks)](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/datajob.py) +- [Data Products](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/dataproduct.py) +And we are gladly accepting contributions for Containers, Data Flows (Pipelines), Tags, Glossary Terms, Domains, and ML Models. -### Add Custom Properties +### Add & Remove Owners for Dataset -```java -{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DatasetCustomPropertiesAdd.java show_path_as_comment }} +To add & remove specific owners for a dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_owner_patch.py show_path_as_comment }} ``` -### Add and Remove Custom Properties +### Add & Remove Tags for Dataset -```java -{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DatasetCustomPropertiesAddRemove.java show_path_as_comment }} +To add & remove specific tags for a dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_tag_patch.py show_path_as_comment }} ``` -### Add Data Job Lineage +And for a specific schema field within the Dataset: -```java -{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DataJobLineageAdd.java show_path_as_comment }} +```python +{{ inline /metadata-ingestion/examples/library/dataset_field_add_tag_patch.py show_path_as_comment }} ``` - - +### Add & Remove Glossary Terms for Dataset + +To add & remove specific glossary terms for a dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_glossary_term_patch.py show_path_as_comment }} +``` + +And for a specific schema field within the Dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_field_add_glossary_term_patch.py show_path_as_comment }} +``` + +### Add & Remove Structured Properties for Dataset -The Python Patch builders are entity-oriented and located in the [metadata-ingestion](https://github.com/datahub-project/datahub/tree/9588440549f3d99965085e97b214a7dabc181ed2/metadata-ingestion/src/datahub/specific) module and located in the `datahub.specific` module. 
+To add & remove structured properties for a dataset: -Here are a few illustrative examples using the Python Patch builders: +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_structured_properties_patch.py show_path_as_comment }} +``` -### Add Properties to Dataset +### Add & Remove Upstream Lineage for Dataset + +To add & remove a lineage edge connecting a dataset to it's upstream or input at both the dataset and schema field level: ```python -{{ inline /metadata-ingestion/examples/library/dataset_add_properties.py show_path_as_comment }} +{{ inline /metadata-ingestion/examples/library/dataset_add_upstream_lineage_patch.py show_path_as_comment }} +``` + +### Add & Remove Read-Only Custom Properties for Dataset + +To add & remove specific custom properties for a dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py show_path_as_comment }} +``` + + + + +The Java Patch builders are aspect-oriented and located in the [datahub-client](https://github.com/datahub-project/datahub/tree/master/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch) module under the `datahub.client.patch` namespace. + +### Add & Remove Read-Only Custom Properties + +```java +{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DatasetCustomPropertiesAddRemove.java show_path_as_comment }} +``` + +### Add Data Job Lineage + +```java +{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DataJobLineageAdd.java show_path_as_comment }} ``` -## How Patch works +## Advanced: How Patch works To understand how patching works, it's important to understand a bit about our [models](../what/aspect.md). Entities are comprised of Aspects which can be reasoned about as JSON representations of the object models. To be able to patch these we utilize [JsonPatch](https://jsonpatch.com/). The components of a JSON Patch are the path, operation, and value. @@ -73,9 +124,6 @@ which can be reasoned about as JSON representations of the object models. To be The JSON path refers to a value within the schema. This can be a single field or can be an entire object reference depending on what the path is. For our patches we are primarily targeting single fields or even single array elements within a field. To be able to target array elements by id, we go through a translation process of the schema to transform arrays into maps. This allows a path to reference a particular array element by key rather than by index, for example a specific tag urn being added to a dataset. -This is important to note that for some fields in our schema that are arrays which do not necessarily restrict uniqueness, this puts a uniqueness constraint on the key. -The key for objects stored in arrays is determined manually by examining the schema and a long term goal is to make these keys annotation driven to reduce the amount of code needed to support -additional aspects to be patched. There is a generic patch endpoint, but it requires any array field keys to be specified at request time, putting a lot of burden on the API user. 
#### Examples @@ -87,8 +135,7 @@ Breakdown: * `/upstreams` -> References the upstreams field of the UpstreamLineage aspect, this is an array of Upstream objects where the key is the Urn * `/urn:...` -> The dataset to be targeted by the operation - -A patch path for targeting a fine grained lineage upstream: +A patch path for targeting a fine-grained lineage upstream: `/fineGrainedLineages/TRANSFORM/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD),foo)/urn:li:query:queryId/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created_upstream,PROD),bar)` @@ -118,7 +165,6 @@ using adds, but generally the most useful use case for patch is to add elements Remove operations require the path specified to be present, or an error will be thrown, otherwise they operate as one would expect. The specified path will be removed from the aspect. - ### Value Value is the actual information that will be stored at a path. If the path references an object then this will include the JSON key value pairs for that object. diff --git a/docs/api/tutorials/custom-properties.md b/docs/api/tutorials/custom-properties.md index fe0d7e62dcde83..86b1b2c0c54da6 100644 --- a/docs/api/tutorials/custom-properties.md +++ b/docs/api/tutorials/custom-properties.md @@ -74,7 +74,7 @@ The following code adds custom properties `cluster_name` and `retention_time` to ```python -{{ inline /metadata-ingestion/examples/library/dataset_add_properties.py show_path_as_comment }} +{{ inline /metadata-ingestion/examples/library/dataset_add_custom_properties_patch.py show_path_as_comment }} ``` @@ -128,7 +128,7 @@ The following code shows you how can add and remove custom properties in the sam ```python -{{ inline /metadata-ingestion/examples/library/dataset_add_remove_properties.py show_path_as_comment }} +{{ inline /metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py show_path_as_comment }} ``` diff --git a/metadata-ingestion/examples/library/dataset_add_custom_properties_patch.py b/metadata-ingestion/examples/library/dataset_add_custom_properties_patch.py new file mode 100644 index 00000000000000..7231461fea322d --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_custom_properties_patch.py @@ -0,0 +1,19 @@ +from datahub.emitter.mce_builder import make_dataset_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") + +# Create Dataset Patch to Add Custom Properties +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_custom_property("cluster_name", "datahubproject.acryl.io") +patch_builder.add_custom_property("retention_time", "2 years") +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_glossary_term_patch.py b/metadata-ingestion/examples/library/dataset_add_glossary_term_patch.py new file mode 100644 index 00000000000000..d0b9a866fde615 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_glossary_term_patch.py @@ -0,0 +1,22 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_term_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from 
datahub.metadata.schema_classes import GlossaryTermAssociationClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch to Add + Remove Term for 'profile_id' column +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_term(GlossaryTermAssociationClass(make_term_urn("term-to-add-id"))) +patch_builder.remove_term(make_term_urn("term-to-remove-id")) +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_owner_patch.py b/metadata-ingestion/examples/library/dataset_add_owner_patch.py new file mode 100644 index 00000000000000..8d3130c09c4bbf --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_owner_patch.py @@ -0,0 +1,24 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_group_urn, make_user_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import OwnerClass, OwnershipTypeClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch to Add + Remove Owners +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_owner( + OwnerClass(make_user_urn("user-to-add-id"), OwnershipTypeClass.TECHNICAL_OWNER) +) +patch_builder.remove_owner(make_group_urn("group-to-remove-id")) +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_properties.py b/metadata-ingestion/examples/library/dataset_add_properties.py deleted file mode 100644 index b72aac5b828002..00000000000000 --- a/metadata-ingestion/examples/library/dataset_add_properties.py +++ /dev/null @@ -1,44 +0,0 @@ -import logging -from typing import Union - -from datahub.configuration.kafka import KafkaProducerConnectionConfig -from datahub.emitter.kafka_emitter import DatahubKafkaEmitter, KafkaEmitterConfig -from datahub.emitter.mce_builder import make_dataset_urn -from datahub.emitter.rest_emitter import DataHubRestEmitter -from datahub.specific.dataset import DatasetPatchBuilder - -log = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - - -# Get an emitter, either REST or Kafka, this example shows you both -def get_emitter() -> Union[DataHubRestEmitter, DatahubKafkaEmitter]: - USE_REST_EMITTER = True - if USE_REST_EMITTER: - gms_endpoint = "http://localhost:8080" - return DataHubRestEmitter(gms_server=gms_endpoint) - else: - kafka_server = "localhost:9092" - schema_registry_url = "http://localhost:8081" - return DatahubKafkaEmitter( - config=KafkaEmitterConfig( - connection=KafkaProducerConnectionConfig( - bootstrap=kafka_server, schema_registry_url=schema_registry_url - ) - ) - ) - - -dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") - -with get_emitter() as emitter: - for patch_mcp in ( - DatasetPatchBuilder(dataset_urn) - .add_custom_property("cluster_name", "datahubproject.acryl.io") - 
.add_custom_property("retention_time", "2 years") - .build() - ): - emitter.emit(patch_mcp) - - -log.info(f"Added cluster_name, retention_time properties to dataset {dataset_urn}") diff --git a/metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py b/metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py new file mode 100644 index 00000000000000..c1db9c91d13ec0 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py @@ -0,0 +1,19 @@ +from datahub.emitter.mce_builder import make_dataset_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") + +# Create Dataset Patch to Add + Remove Custom Properties +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_custom_property("cluster_name", "datahubproject.acryl.io") +patch_builder.remove_custom_property("retention_time") +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_remove_properties.py b/metadata-ingestion/examples/library/dataset_add_remove_properties.py deleted file mode 100644 index 7109c0264f9713..00000000000000 --- a/metadata-ingestion/examples/library/dataset_add_remove_properties.py +++ /dev/null @@ -1,46 +0,0 @@ -import logging -from typing import Union - -from datahub.configuration.kafka import KafkaProducerConnectionConfig -from datahub.emitter.kafka_emitter import DatahubKafkaEmitter, KafkaEmitterConfig -from datahub.emitter.mce_builder import make_dataset_urn -from datahub.emitter.rest_emitter import DataHubRestEmitter -from datahub.specific.dataset import DatasetPatchBuilder - -log = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - - -# Get an emitter, either REST or Kafka, this example shows you both -def get_emitter() -> Union[DataHubRestEmitter, DatahubKafkaEmitter]: - USE_REST_EMITTER = True - if USE_REST_EMITTER: - gms_endpoint = "http://localhost:8080" - return DataHubRestEmitter(gms_server=gms_endpoint) - else: - kafka_server = "localhost:9092" - schema_registry_url = "http://localhost:8081" - return DatahubKafkaEmitter( - config=KafkaEmitterConfig( - connection=KafkaProducerConnectionConfig( - bootstrap=kafka_server, schema_registry_url=schema_registry_url - ) - ) - ) - - -dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") - -with get_emitter() as emitter: - for patch_mcp in ( - DatasetPatchBuilder(dataset_urn) - .add_custom_property("cluster_name", "datahubproject.acryl.io") - .remove_custom_property("retention_time") - .build() - ): - emitter.emit(patch_mcp) - - -log.info( - f"Added cluster_name property, removed retention_time property from dataset {dataset_urn}" -) diff --git a/metadata-ingestion/examples/library/dataset_add_structured_properties.py b/metadata-ingestion/examples/library/dataset_add_structured_properties.py deleted file mode 100644 index fc2c3793405927..00000000000000 --- a/metadata-ingestion/examples/library/dataset_add_structured_properties.py +++ /dev/null @@ -1,24 +0,0 @@ -import logging - -from datahub.emitter.mce_builder import make_dataset_urn -from datahub.emitter.rest_emitter import 
DataHubRestEmitter -from datahub.specific.dataset import DatasetPatchBuilder - -log = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - -# Create rest emitter -rest_emitter = DataHubRestEmitter(gms_server="http://localhost:8080") - -dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") - - -for patch_mcp in ( - DatasetPatchBuilder(dataset_urn) - .add_structured_property("io.acryl.dataManagement.replicationSLA", 12) - .build() -): - rest_emitter.emit(patch_mcp) - - -log.info(f"Added cluster_name, retention_time properties to dataset {dataset_urn}") diff --git a/metadata-ingestion/examples/library/dataset_add_structured_properties_patch.py b/metadata-ingestion/examples/library/dataset_add_structured_properties_patch.py new file mode 100644 index 00000000000000..ef72ed58a4b82f --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_structured_properties_patch.py @@ -0,0 +1,23 @@ +from datahub.emitter.mce_builder import make_dataset_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") + +# Create Dataset Patch to Add and Remove Structured Properties +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_structured_property( + "urn:li:structuredProperty:retentionTimeInDays", 12 +) +patch_builder.remove_structured_property( + "urn:li:structuredProperty:customClassification" +) +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_tag_patch.py b/metadata-ingestion/examples/library/dataset_add_tag_patch.py new file mode 100644 index 00000000000000..0bc644d6865f63 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_tag_patch.py @@ -0,0 +1,22 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import TagAssociationClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_tag(TagAssociationClass(make_tag_urn("tag-to-add-id"))) +patch_builder.remove_tag("urn:li:tag:tag-to-remove-id") +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_upstream_lineage_patch.py b/metadata-ingestion/examples/library/dataset_add_upstream_lineage_patch.py new file mode 100644 index 00000000000000..0b4e5e39bf627e --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_upstream_lineage_patch.py @@ -0,0 +1,62 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_schema_field_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import ( + DatasetLineageTypeClass, + FineGrainedLineageClass, + 
FineGrainedLineageUpstreamTypeClass, + UpstreamClass, +) +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) +upstream_to_remove_urn = make_dataset_urn( + platform="s3", name="fct_users_old", env="PROD" +) +upstream_to_add_urn = make_dataset_urn(platform="s3", name="fct_users_new", env="PROD") + +# Create Dataset Patch to Add & Remove Upstream Lineage Edges +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.remove_upstream_lineage(upstream_to_remove_urn) +patch_builder.add_upstream_lineage( + UpstreamClass(upstream_to_add_urn, DatasetLineageTypeClass.TRANSFORMED) +) + +# ...And also include schema field lineage +upstream_field_to_add_urn = make_schema_field_urn(upstream_to_add_urn, "profile_id") +downstream_field_to_add_urn = make_schema_field_urn(dataset_urn, "profile_id") + +patch_builder.add_fine_grained_upstream_lineage( + FineGrainedLineageClass( + FineGrainedLineageUpstreamTypeClass.FIELD_SET, + FineGrainedLineageUpstreamTypeClass.FIELD_SET, + [upstream_field_to_add_urn], + [downstream_field_to_add_urn], + ) +) + +upstream_field_to_remove_urn = make_schema_field_urn( + upstream_to_remove_urn, "profile_id" +) +downstream_field_to_remove_urn = make_schema_field_urn(dataset_urn, "profile_id") + +patch_builder.remove_fine_grained_upstream_lineage( + FineGrainedLineageClass( + FineGrainedLineageUpstreamTypeClass.FIELD_SET, + FineGrainedLineageUpstreamTypeClass.FIELD_SET, + [upstream_field_to_remove_urn], + [downstream_field_to_remove_urn], + ) +) + +patch_mcps = patch_builder.build() + + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_field_add_glossary_term_patch.py b/metadata-ingestion/examples/library/dataset_field_add_glossary_term_patch.py new file mode 100644 index 00000000000000..3f8da2c143c924 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_field_add_glossary_term_patch.py @@ -0,0 +1,26 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_term_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import GlossaryTermAssociationClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch to Add + Remove Term for 'profile_id' column +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.for_field("profile_id").add_term( + GlossaryTermAssociationClass(make_term_urn("term-to-add-id")) +) +patch_builder.for_field("profile_id").remove_term( + "urn:li:glossaryTerm:term-to-remove-id" +) +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_field_add_tag_patch.py b/metadata-ingestion/examples/library/dataset_field_add_tag_patch.py new file mode 100644 index 00000000000000..3075cac5320ae9 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_field_add_tag_patch.py @@ -0,0 +1,24 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn +from 
datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import TagAssociationClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch to Add + Remove Tag for 'profile_id' column +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.for_field("profile_id").add_tag( + TagAssociationClass(make_tag_urn("tag-to-add-id")) +) +patch_builder.for_field("profile_id").remove_tag("urn:li:tag:tag-to-remove-id") +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) From b7bb5ca7ee3e0e80c5f8ca1843e67671f779f27d Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Thu, 19 Dec 2024 10:20:06 -0800 Subject: [PATCH 12/35] feat(graphql/ml): Add custom properties to ml entities (#12152) --- .../types/mappers/EmbeddedModelMapper.java | 12 +++++++++++ .../mlmodel/mappers/MLFeatureMapper.java | 12 +++++++---- .../mappers/MLFeaturePropertiesMapper.java | 20 +++++++++++++------ .../mlmodel/mappers/MLFeatureTableMapper.java | 10 +++++----- .../MLFeatureTablePropertiesMapper.java | 18 ++++++++++------- .../mlmodel/mappers/MLModelGroupMapper.java | 11 ++++++---- .../mappers/MLModelGroupPropertiesMapper.java | 19 ++++++++++++------ .../mappers/MLModelPropertiesMapper.java | 12 ++++++----- .../mlmodel/mappers/MLPrimaryKeyMapper.java | 15 ++++++++------ .../mappers/MLPrimaryKeyPropertiesMapper.java | 19 ++++++++++++------ .../src/main/resources/entity.graphql | 12 ++++++++--- 11 files changed, 108 insertions(+), 52 deletions(-) create mode 100644 datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/EmbeddedModelMapper.java diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/EmbeddedModelMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/EmbeddedModelMapper.java new file mode 100644 index 00000000000000..62e7c90ab9b0e0 --- /dev/null +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/EmbeddedModelMapper.java @@ -0,0 +1,12 @@ +package com.linkedin.datahub.graphql.types.mappers; + +import com.linkedin.common.urn.Urn; +import com.linkedin.datahub.graphql.QueryContext; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +/** Made for models that are embedded in other models and thus do not encode their own URN. 
*/ +public interface EmbeddedModelMapper { + O apply( + @Nullable final QueryContext context, @Nonnull final I input, @Nonnull final Urn entityUrn); +} diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureMapper.java index d5eb1a15624dc3..74076fd2f4ee9f 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureMapper.java @@ -75,7 +75,8 @@ public MLFeature apply( mlFeature.setOwnership( OwnershipMapper.map(context, new Ownership(dataMap), entityUrn))); mappingHelper.mapToResult( - context, ML_FEATURE_PROPERTIES_ASPECT_NAME, MLFeatureMapper::mapMLFeatureProperties); + ML_FEATURE_PROPERTIES_ASPECT_NAME, + (entity, dataMap) -> mapMLFeatureProperties(context, entity, dataMap, entityUrn)); mappingHelper.mapToResult( INSTITUTIONAL_MEMORY_ASPECT_NAME, (mlFeature, dataMap) -> @@ -138,10 +139,13 @@ private static void mapMLFeatureKey(@Nonnull MLFeature mlFeature, @Nonnull DataM private static void mapMLFeatureProperties( @Nullable final QueryContext context, @Nonnull MLFeature mlFeature, - @Nonnull DataMap dataMap) { + @Nonnull DataMap dataMap, + @Nonnull Urn entityUrn) { MLFeatureProperties featureProperties = new MLFeatureProperties(dataMap); - mlFeature.setFeatureProperties(MLFeaturePropertiesMapper.map(context, featureProperties)); - mlFeature.setProperties(MLFeaturePropertiesMapper.map(context, featureProperties)); + com.linkedin.datahub.graphql.generated.MLFeatureProperties graphqlProperties = + MLFeaturePropertiesMapper.map(context, featureProperties, entityUrn); + mlFeature.setFeatureProperties(graphqlProperties); + mlFeature.setProperties(graphqlProperties); mlFeature.setDescription(featureProperties.getDescription()); if (featureProperties.getDataType() != null) { mlFeature.setDataType(MLFeatureDataType.valueOf(featureProperties.getDataType().toString())); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeaturePropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeaturePropertiesMapper.java index 92d090275867da..08ac3a1b5f138f 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeaturePropertiesMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeaturePropertiesMapper.java @@ -1,29 +1,34 @@ package com.linkedin.datahub.graphql.types.mlmodel.mappers; +import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.Dataset; import com.linkedin.datahub.graphql.generated.MLFeatureDataType; import com.linkedin.datahub.graphql.generated.MLFeatureProperties; -import com.linkedin.datahub.graphql.types.mappers.ModelMapper; +import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; +import com.linkedin.datahub.graphql.types.mappers.EmbeddedModelMapper; import java.util.stream.Collectors; +import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.NonNull; public class MLFeaturePropertiesMapper - implements ModelMapper { + implements EmbeddedModelMapper< + com.linkedin.ml.metadata.MLFeatureProperties, MLFeatureProperties> { public static final 
MLFeaturePropertiesMapper INSTANCE = new MLFeaturePropertiesMapper(); public static MLFeatureProperties map( @Nullable QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLFeatureProperties mlFeatureProperties) { - return INSTANCE.apply(context, mlFeatureProperties); + @Nonnull final com.linkedin.ml.metadata.MLFeatureProperties mlFeatureProperties, + @Nonnull Urn entityUrn) { + return INSTANCE.apply(context, mlFeatureProperties, entityUrn); } @Override public MLFeatureProperties apply( @Nullable QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLFeatureProperties mlFeatureProperties) { + @Nonnull final com.linkedin.ml.metadata.MLFeatureProperties mlFeatureProperties, + @Nonnull Urn entityUrn) { final MLFeatureProperties result = new MLFeatureProperties(); result.setDescription(mlFeatureProperties.getDescription()); @@ -45,6 +50,9 @@ public MLFeatureProperties apply( .collect(Collectors.toList())); } + result.setCustomProperties( + CustomPropertiesMapper.map(mlFeatureProperties.getCustomProperties(), entityUrn)); + return result; } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTableMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTableMapper.java index 51d3004d97a619..65bc8e84f7bbb5 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTableMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTableMapper.java @@ -76,7 +76,7 @@ public MLFeatureTable apply( mappingHelper.mapToResult(ML_FEATURE_TABLE_KEY_ASPECT_NAME, this::mapMLFeatureTableKey); mappingHelper.mapToResult( ML_FEATURE_TABLE_PROPERTIES_ASPECT_NAME, - (entity, dataMap) -> this.mapMLFeatureTableProperties(context, entity, dataMap, entityUrn)); + (entity, dataMap) -> mapMLFeatureTableProperties(context, entity, dataMap, entityUrn)); mappingHelper.mapToResult( INSTITUTIONAL_MEMORY_ASPECT_NAME, (mlFeatureTable, dataMap) -> @@ -146,10 +146,10 @@ private static void mapMLFeatureTableProperties( @Nonnull DataMap dataMap, Urn entityUrn) { MLFeatureTableProperties featureTableProperties = new MLFeatureTableProperties(dataMap); - mlFeatureTable.setFeatureTableProperties( - MLFeatureTablePropertiesMapper.map(context, featureTableProperties, entityUrn)); - mlFeatureTable.setProperties( - MLFeatureTablePropertiesMapper.map(context, featureTableProperties, entityUrn)); + com.linkedin.datahub.graphql.generated.MLFeatureTableProperties graphqlProperties = + MLFeatureTablePropertiesMapper.map(context, featureTableProperties, entityUrn); + mlFeatureTable.setFeatureTableProperties(graphqlProperties); + mlFeatureTable.setProperties(graphqlProperties); mlFeatureTable.setDescription(featureTableProperties.getDescription()); } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTablePropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTablePropertiesMapper.java index d9fed13ed0d0be..3c054cb6a9a5b2 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTablePropertiesMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTablePropertiesMapper.java @@ -8,26 +8,30 @@ import com.linkedin.datahub.graphql.generated.MLFeatureTableProperties; import 
com.linkedin.datahub.graphql.generated.MLPrimaryKey; import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; +import com.linkedin.datahub.graphql.types.mappers.EmbeddedModelMapper; import java.util.stream.Collectors; +import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.NonNull; -public class MLFeatureTablePropertiesMapper { +public class MLFeatureTablePropertiesMapper + implements EmbeddedModelMapper< + com.linkedin.ml.metadata.MLFeatureTableProperties, MLFeatureTableProperties> { public static final MLFeatureTablePropertiesMapper INSTANCE = new MLFeatureTablePropertiesMapper(); public static MLFeatureTableProperties map( @Nullable final QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLFeatureTableProperties mlFeatureTableProperties, - Urn entityUrn) { + @Nonnull final com.linkedin.ml.metadata.MLFeatureTableProperties mlFeatureTableProperties, + @Nonnull Urn entityUrn) { return INSTANCE.apply(context, mlFeatureTableProperties, entityUrn); } - public static MLFeatureTableProperties apply( + @Override + public MLFeatureTableProperties apply( @Nullable final QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLFeatureTableProperties mlFeatureTableProperties, - Urn entityUrn) { + @Nonnull final com.linkedin.ml.metadata.MLFeatureTableProperties mlFeatureTableProperties, + @Nonnull Urn entityUrn) { final MLFeatureTableProperties result = new MLFeatureTableProperties(); result.setDescription(mlFeatureTableProperties.getDescription()); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupMapper.java index 6e3da1c1533926..9009972a47616d 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupMapper.java @@ -75,9 +75,8 @@ public MLModelGroup apply( mappingHelper.mapToResult( ML_MODEL_GROUP_KEY_ASPECT_NAME, MLModelGroupMapper::mapToMLModelGroupKey); mappingHelper.mapToResult( - context, ML_MODEL_GROUP_PROPERTIES_ASPECT_NAME, - MLModelGroupMapper::mapToMLModelGroupProperties); + (entity, dataMap) -> mapToMLModelGroupProperties(context, entity, dataMap, entityUrn)); mappingHelper.mapToResult( STATUS_ASPECT_NAME, (mlModelGroup, dataMap) -> @@ -136,9 +135,13 @@ private static void mapToMLModelGroupKey(MLModelGroup mlModelGroup, DataMap data } private static void mapToMLModelGroupProperties( - @Nullable final QueryContext context, MLModelGroup mlModelGroup, DataMap dataMap) { + @Nullable final QueryContext context, + MLModelGroup mlModelGroup, + DataMap dataMap, + @Nonnull Urn entityUrn) { MLModelGroupProperties modelGroupProperties = new MLModelGroupProperties(dataMap); - mlModelGroup.setProperties(MLModelGroupPropertiesMapper.map(context, modelGroupProperties)); + mlModelGroup.setProperties( + MLModelGroupPropertiesMapper.map(context, modelGroupProperties, entityUrn)); if (modelGroupProperties.getDescription() != null) { mlModelGroup.setDescription(modelGroupProperties.getDescription()); } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapper.java index 9f1918f9ec4893..a6cfded9865d90 100644 
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapper.java @@ -1,27 +1,31 @@ package com.linkedin.datahub.graphql.types.mlmodel.mappers; +import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.MLModelGroupProperties; -import com.linkedin.datahub.graphql.types.mappers.ModelMapper; +import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; +import com.linkedin.datahub.graphql.types.mappers.EmbeddedModelMapper; +import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.NonNull; public class MLModelGroupPropertiesMapper - implements ModelMapper< + implements EmbeddedModelMapper< com.linkedin.ml.metadata.MLModelGroupProperties, MLModelGroupProperties> { public static final MLModelGroupPropertiesMapper INSTANCE = new MLModelGroupPropertiesMapper(); public static MLModelGroupProperties map( @Nullable QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLModelGroupProperties mlModelGroupProperties) { - return INSTANCE.apply(context, mlModelGroupProperties); + @Nonnull final com.linkedin.ml.metadata.MLModelGroupProperties mlModelGroupProperties, + @Nonnull Urn entityUrn) { + return INSTANCE.apply(context, mlModelGroupProperties, entityUrn); } @Override public MLModelGroupProperties apply( @Nullable QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLModelGroupProperties mlModelGroupProperties) { + @Nonnull final com.linkedin.ml.metadata.MLModelGroupProperties mlModelGroupProperties, + @Nonnull Urn entityUrn) { final MLModelGroupProperties result = new MLModelGroupProperties(); result.setDescription(mlModelGroupProperties.getDescription()); @@ -30,6 +34,9 @@ public MLModelGroupProperties apply( } result.setCreatedAt(mlModelGroupProperties.getCreatedAt()); + result.setCustomProperties( + CustomPropertiesMapper.map(mlModelGroupProperties.getCustomProperties(), entityUrn)); + return result; } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java index a89904b3ab915c..265005c2caa9ee 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java @@ -7,25 +7,27 @@ import com.linkedin.datahub.graphql.generated.MLModelGroup; import com.linkedin.datahub.graphql.generated.MLModelProperties; import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; +import com.linkedin.datahub.graphql.types.mappers.EmbeddedModelMapper; import java.util.stream.Collectors; +import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.NonNull; -public class MLModelPropertiesMapper { +public class MLModelPropertiesMapper + implements EmbeddedModelMapper { public static final MLModelPropertiesMapper INSTANCE = new MLModelPropertiesMapper(); public static MLModelProperties map( @Nullable final QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLModelProperties mlModelProperties, + @Nonnull final com.linkedin.ml.metadata.MLModelProperties mlModelProperties, Urn 
entityUrn) { return INSTANCE.apply(context, mlModelProperties, entityUrn); } public MLModelProperties apply( @Nullable final QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLModelProperties mlModelProperties, - Urn entityUrn) { + @Nonnull final com.linkedin.ml.metadata.MLModelProperties mlModelProperties, + @Nonnull Urn entityUrn) { final MLModelProperties result = new MLModelProperties(); result.setDate(mlModelProperties.getDate()); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyMapper.java index c446c892cb2231..d48d93ede9c1ab 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyMapper.java @@ -74,9 +74,8 @@ public MLPrimaryKey apply( mappingHelper.mapToResult( ML_PRIMARY_KEY_KEY_ASPECT_NAME, MLPrimaryKeyMapper::mapMLPrimaryKeyKey); mappingHelper.mapToResult( - context, ML_PRIMARY_KEY_PROPERTIES_ASPECT_NAME, - MLPrimaryKeyMapper::mapMLPrimaryKeyProperties); + (entity, dataMap) -> mapMLPrimaryKeyProperties(context, entity, dataMap, entityUrn)); mappingHelper.mapToResult( INSTITUTIONAL_MEMORY_ASPECT_NAME, (mlPrimaryKey, dataMap) -> @@ -132,11 +131,15 @@ private static void mapMLPrimaryKeyKey(MLPrimaryKey mlPrimaryKey, DataMap dataMa } private static void mapMLPrimaryKeyProperties( - @Nullable final QueryContext context, MLPrimaryKey mlPrimaryKey, DataMap dataMap) { + @Nullable final QueryContext context, + MLPrimaryKey mlPrimaryKey, + DataMap dataMap, + @Nonnull Urn entityUrn) { MLPrimaryKeyProperties primaryKeyProperties = new MLPrimaryKeyProperties(dataMap); - mlPrimaryKey.setPrimaryKeyProperties( - MLPrimaryKeyPropertiesMapper.map(context, primaryKeyProperties)); - mlPrimaryKey.setProperties(MLPrimaryKeyPropertiesMapper.map(context, primaryKeyProperties)); + com.linkedin.datahub.graphql.generated.MLPrimaryKeyProperties graphqlProperties = + MLPrimaryKeyPropertiesMapper.map(context, primaryKeyProperties, entityUrn); + mlPrimaryKey.setPrimaryKeyProperties(graphqlProperties); + mlPrimaryKey.setProperties(graphqlProperties); mlPrimaryKey.setDescription(primaryKeyProperties.getDescription()); if (primaryKeyProperties.getDataType() != null) { mlPrimaryKey.setDataType( diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyPropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyPropertiesMapper.java index 09e41fe7ee4e8e..0bbe8f53f32713 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyPropertiesMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyPropertiesMapper.java @@ -1,30 +1,34 @@ package com.linkedin.datahub.graphql.types.mlmodel.mappers; +import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.Dataset; import com.linkedin.datahub.graphql.generated.MLFeatureDataType; import com.linkedin.datahub.graphql.generated.MLPrimaryKeyProperties; -import com.linkedin.datahub.graphql.types.mappers.ModelMapper; +import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; +import 
com.linkedin.datahub.graphql.types.mappers.EmbeddedModelMapper; import java.util.stream.Collectors; +import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.NonNull; public class MLPrimaryKeyPropertiesMapper - implements ModelMapper< + implements EmbeddedModelMapper< com.linkedin.ml.metadata.MLPrimaryKeyProperties, MLPrimaryKeyProperties> { public static final MLPrimaryKeyPropertiesMapper INSTANCE = new MLPrimaryKeyPropertiesMapper(); public static MLPrimaryKeyProperties map( @Nullable QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLPrimaryKeyProperties mlPrimaryKeyProperties) { - return INSTANCE.apply(context, mlPrimaryKeyProperties); + @Nonnull final com.linkedin.ml.metadata.MLPrimaryKeyProperties mlPrimaryKeyProperties, + @Nonnull Urn entityUrn) { + return INSTANCE.apply(context, mlPrimaryKeyProperties, entityUrn); } @Override public MLPrimaryKeyProperties apply( @Nullable QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLPrimaryKeyProperties mlPrimaryKeyProperties) { + @Nonnull final com.linkedin.ml.metadata.MLPrimaryKeyProperties mlPrimaryKeyProperties, + @Nonnull Urn entityUrn) { final MLPrimaryKeyProperties result = new MLPrimaryKeyProperties(); result.setDescription(mlPrimaryKeyProperties.getDescription()); @@ -45,6 +49,9 @@ public MLPrimaryKeyProperties apply( }) .collect(Collectors.toList())); + result.setCustomProperties( + CustomPropertiesMapper.map(mlPrimaryKeyProperties.getCustomProperties(), entityUrn)); + return result; } } diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index 049527e5d77e3b..926cd256a5c5a4 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -9829,11 +9829,13 @@ type MLModelGroup implements EntityWithRelationships & Entity & BrowsableEntity type MLModelGroupProperties { -description: String + description: String createdAt: Long version: VersionTag + + customProperties: [CustomPropertiesEntry!] } """ @@ -10028,6 +10030,8 @@ type MLFeatureProperties { version: VersionTag sources: [Dataset] + + customProperties: [CustomPropertiesEntry!] } """ @@ -10164,13 +10168,15 @@ type MLPrimaryKey implements EntityWithRelationships & Entity { type MLPrimaryKeyProperties { -description: String + description: String dataType: MLFeatureDataType version: VersionTag sources: [Dataset] + + customProperties: [CustomPropertiesEntry!] } """ @@ -10347,7 +10353,7 @@ type MLModelGroupEditableProperties { type MLFeatureTableProperties { -description: String + description: String mlFeatures: [MLFeature] From 9762c46702dc4492d09a5810544dfa7922266fb1 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Thu, 19 Dec 2024 12:41:44 -0600 Subject: [PATCH 13/35] chore(bump): ingestion-base & actions (#12171) --- docker/datahub-ingestion-base/build.gradle | 2 +- docker/datahub-ingestion/build.gradle | 2 +- docker/profiles/docker-compose.actions.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/datahub-ingestion-base/build.gradle b/docker/datahub-ingestion-base/build.gradle index ef482de9256a33..f19faa227ca612 100644 --- a/docker/datahub-ingestion-base/build.gradle +++ b/docker/datahub-ingestion-base/build.gradle @@ -12,7 +12,7 @@ ext { docker_target = project.getProperties().getOrDefault("dockerTarget", "slim") docker_version = "${version}${docker_target == 'slim' ? 
'-slim' : ''}" - revision = 7 // increment to trigger rebuild + revision = 8 // increment to trigger rebuild } docker { diff --git a/docker/datahub-ingestion/build.gradle b/docker/datahub-ingestion/build.gradle index 113a6dcf0a1bd4..b236a53c288f7f 100644 --- a/docker/datahub-ingestion/build.gradle +++ b/docker/datahub-ingestion/build.gradle @@ -12,7 +12,7 @@ ext { docker_target = project.getProperties().getOrDefault("dockerTarget", "slim") docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}" - revision = 8 // increment to trigger rebuild + revision = 9 // increment to trigger rebuild } dependencies { diff --git a/docker/profiles/docker-compose.actions.yml b/docker/profiles/docker-compose.actions.yml index c2985f42993267..459fffdd8acf3f 100644 --- a/docker/profiles/docker-compose.actions.yml +++ b/docker/profiles/docker-compose.actions.yml @@ -6,7 +6,7 @@ x-search-datastore-elasticsearch-env: &search-datastore-env x-datahub-actions-service: &datahub-actions-service hostname: actions - image: ${DATAHUB_ACTIONS_IMAGE:-${DATAHUB_ACTIONS_REPO:-acryldata}/datahub-actions}:${ACTIONS_VERSION:-v0.1.1} + image: ${DATAHUB_ACTIONS_IMAGE:-${DATAHUB_ACTIONS_REPO:-acryldata}/datahub-actions}:${ACTIONS_VERSION:-v0.1.6} env_file: - datahub-actions/env/docker.env - ${DATAHUB_LOCAL_COMMON_ENV:-empty.env} From 45ace13fe26a9ae20ed9fcdd7df04bb7c197d52a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Thu, 19 Dec 2024 20:20:42 +0100 Subject: [PATCH 14/35] feat(mssql): platform instance aspect for dataflow and datajob entities (#12180) --- .../ingestion/source/sql/mssql/job_models.py | 31 +- .../ingestion/source/sql/mssql/source.py | 14 + .../golden_mces_mssql_to_file.json | 756 ++++++++++++------ .../sql_server/source_files/mssql_to_file.yml | 1 + 4 files changed, 574 insertions(+), 228 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py index 5107a4e38f64de..d3941e7add0fd0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py @@ -1,11 +1,17 @@ from dataclasses import dataclass, field from typing import Dict, List, Optional, Union -from datahub.emitter.mce_builder import make_data_flow_urn, make_data_job_urn +from datahub.emitter.mce_builder import ( + make_data_flow_urn, + make_data_job_urn, + make_data_platform_urn, + make_dataplatform_instance_urn, +) from datahub.metadata.schema_classes import ( DataFlowInfoClass, DataJobInfoClass, DataJobInputOutputClass, + DataPlatformInstanceClass, ) @@ -204,6 +210,18 @@ def as_datajob_info_aspect(self) -> DataJobInfoClass: status=self.status, ) + @property + def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]: + if self.entity.flow.platform_instance: + return DataPlatformInstanceClass( + platform=make_data_platform_urn(self.entity.flow.orchestrator), + instance=make_dataplatform_instance_urn( + platform=self.entity.flow.orchestrator, + instance=self.entity.flow.platform_instance, + ), + ) + return None + @dataclass class MSSQLDataFlow: @@ -238,3 +256,14 @@ def as_dataflow_info_aspect(self) -> DataFlowInfoClass: customProperties=self.flow_properties, externalUrl=self.external_url, ) + + @property + def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]: + if self.entity.platform_instance: + return DataPlatformInstanceClass( + 
platform=make_data_platform_urn(self.entity.orchestrator), + instance=make_dataplatform_instance_urn( + self.entity.orchestrator, self.entity.platform_instance + ), + ) + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py index 414c1faaa1661a..9d8b67041998ce 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py @@ -639,6 +639,13 @@ def construct_job_workunits( aspect=data_job.as_datajob_info_aspect, ).as_workunit() + data_platform_instance_aspect = data_job.as_maybe_platform_instance_aspect + if data_platform_instance_aspect: + yield MetadataChangeProposalWrapper( + entityUrn=data_job.urn, + aspect=data_platform_instance_aspect, + ).as_workunit() + if include_lineage: yield MetadataChangeProposalWrapper( entityUrn=data_job.urn, @@ -654,6 +661,13 @@ def construct_flow_workunits( entityUrn=data_flow.urn, aspect=data_flow.as_dataflow_info_aspect, ).as_workunit() + + data_platform_instance_aspect = data_flow.as_maybe_platform_instance_aspect + if data_platform_instance_aspect: + yield MetadataChangeProposalWrapper( + entityUrn=data_flow.urn, + aspect=data_platform_instance_aspect, + ).as_workunit() # TODO: Add SubType when it appear def get_inspectors(self) -> Iterable[Inspector]: diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json index b67ebfb206883a..b36188405e7e11 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json @@ -1,13 +1,14 @@ [ { "entityType": "container", - "entityUrn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "entityUrn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData" }, @@ -23,7 +24,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "entityUrn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -39,12 +40,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "entityUrn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -55,7 +57,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "entityUrn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -73,12 +75,17 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "entityUrn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { - "path": [] + "path": [ + { + "id": 
"urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + ] } }, "systemMetadata": { @@ -89,7 +96,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", "changeType": "UPSERT", "aspectName": "dataFlowInfo", "aspect": { @@ -105,19 +112,36 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { "json": { "customProperties": { - "job_id": "c2d77890-83ba-435f-879b-1c77fa38dd47", + "job_id": "b8907be7-52f5-4df4-a870-f4fe0679ec45", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-05 16:44:43.910000", - "date_modified": "2024-12-05 16:44:44.043000", + "date_created": "2024-12-19 12:34:45.843000", + "date_modified": "2024-12-19 12:34:46.017000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -138,7 +162,24 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", "changeType": "UPSERT", "aspectName": "dataJobInputOutput", "aspect": { @@ -156,12 +197,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -172,13 +213,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_accessadmin" @@ -195,7 +237,7 @@ }, { "entityType": "container", - 
"entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -211,12 +253,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -227,7 +270,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -245,15 +288,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -266,12 +313,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:5d8a64d9bc388814ac06d9a4d7a3ad22", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -282,13 +329,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:5d8a64d9bc388814ac06d9a4d7a3ad22", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_backupoperator" @@ -305,7 +353,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:5d8a64d9bc388814ac06d9a4d7a3ad22", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -321,12 +369,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:5d8a64d9bc388814ac06d9a4d7a3ad22", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -337,7 +386,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:5d8a64d9bc388814ac06d9a4d7a3ad22", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -355,15 +404,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": 
"urn:li:container:5d8a64d9bc388814ac06d9a4d7a3ad22", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -376,12 +429,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:d5f6914a2b8e0dd461f1ad02e7b28c11", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -392,13 +445,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:d5f6914a2b8e0dd461f1ad02e7b28c11", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_datareader" @@ -415,7 +469,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:d5f6914a2b8e0dd461f1ad02e7b28c11", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -431,12 +485,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:d5f6914a2b8e0dd461f1ad02e7b28c11", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -447,7 +502,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:d5f6914a2b8e0dd461f1ad02e7b28c11", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -465,15 +520,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:d5f6914a2b8e0dd461f1ad02e7b28c11", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -486,12 +545,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:e3f86c86f3794233740cad99cba0b854", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -502,13 +561,14 @@ }, { "entityType": "container", - "entityUrn": 
"urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:e3f86c86f3794233740cad99cba0b854", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_datawriter" @@ -525,7 +585,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:e3f86c86f3794233740cad99cba0b854", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -541,12 +601,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:e3f86c86f3794233740cad99cba0b854", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -557,7 +618,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:e3f86c86f3794233740cad99cba0b854", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -575,15 +636,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:e3f86c86f3794233740cad99cba0b854", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -596,12 +661,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:c978c9ed6c196412685945ad89f8fbd6", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -612,13 +677,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:c978c9ed6c196412685945ad89f8fbd6", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_ddladmin" @@ -635,7 +701,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:c978c9ed6c196412685945ad89f8fbd6", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -651,12 +717,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:c978c9ed6c196412685945ad89f8fbd6", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ 
-667,7 +734,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:c978c9ed6c196412685945ad89f8fbd6", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -685,15 +752,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:c978c9ed6c196412685945ad89f8fbd6", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -706,12 +777,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:17749025f27ce9ebd6febcaa6a49d715", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -722,13 +793,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:17749025f27ce9ebd6febcaa6a49d715", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_denydatareader" @@ -745,7 +817,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:17749025f27ce9ebd6febcaa6a49d715", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -761,12 +833,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:17749025f27ce9ebd6febcaa6a49d715", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -777,7 +850,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:17749025f27ce9ebd6febcaa6a49d715", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -795,15 +868,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:17749025f27ce9ebd6febcaa6a49d715", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -816,12 +893,12 @@ }, { "entityType": "container", - "entityUrn": 
"urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:63c0518620c06ef7af76019fea52b862", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -832,13 +909,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:63c0518620c06ef7af76019fea52b862", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_denydatawriter" @@ -855,7 +933,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:63c0518620c06ef7af76019fea52b862", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -871,12 +949,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:63c0518620c06ef7af76019fea52b862", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -887,7 +966,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:63c0518620c06ef7af76019fea52b862", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -905,15 +984,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:63c0518620c06ef7af76019fea52b862", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -926,12 +1009,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:c6e96aed010f9205f809c1ce9a530003", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -942,13 +1025,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:c6e96aed010f9205f809c1ce9a530003", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_owner" @@ -965,7 +1049,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:c6e96aed010f9205f809c1ce9a530003", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -981,12 +1065,13 @@ }, { "entityType": 
"container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:c6e96aed010f9205f809c1ce9a530003", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -997,7 +1082,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:c6e96aed010f9205f809c1ce9a530003", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1015,15 +1100,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:c6e96aed010f9205f809c1ce9a530003", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -1036,12 +1125,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:895216bb602fb0002beac82d96507acf", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -1052,13 +1141,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:895216bb602fb0002beac82d96507acf", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_securityadmin" @@ -1075,7 +1165,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:895216bb602fb0002beac82d96507acf", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1091,12 +1181,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:895216bb602fb0002beac82d96507acf", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -1107,7 +1198,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:895216bb602fb0002beac82d96507acf", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1125,15 +1216,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:895216bb602fb0002beac82d96507acf", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": 
"urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -1146,12 +1241,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", + "entityUrn": "urn:li:container:92899b29bb814fdeb1186eb99139073f", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -1162,13 +1257,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", + "entityUrn": "urn:li:container:92899b29bb814fdeb1186eb99139073f", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "dbo" @@ -1185,7 +1281,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", + "entityUrn": "urn:li:container:92899b29bb814fdeb1186eb99139073f", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1201,12 +1297,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", + "entityUrn": "urn:li:container:92899b29bb814fdeb1186eb99139073f", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -1217,7 +1314,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", + "entityUrn": "urn:li:container:92899b29bb814fdeb1186eb99139073f", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1235,15 +1332,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", + "entityUrn": "urn:li:container:92899b29bb814fdeb1186eb99139073f", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -1256,12 +1357,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.dbo.Products,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.dbo.Products,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1" + "container": "urn:li:container:92899b29bb814fdeb1186eb99139073f" } }, "systemMetadata": { @@ -1273,7 +1374,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.dbo.Products,PROD)", + "urn": 
"urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.dbo.Products,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1345,7 +1446,24 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.dbo.Products,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.dbo.Products,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.dbo.Products,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1363,19 +1481,23 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.dbo.Products,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.dbo.Products,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" }, { - "id": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", - "urn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1" + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + }, + { + "id": "urn:li:container:92899b29bb814fdeb1186eb99139073f", + "urn": "urn:li:container:92899b29bb814fdeb1186eb99139073f" } ] } @@ -1388,12 +1510,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "entityUrn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -1404,13 +1526,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "entityUrn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "Foo" @@ -1427,7 +1550,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "entityUrn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1443,12 +1566,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "entityUrn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -1459,7 +1583,7 @@ }, { "entityType": "container", - "entityUrn": 
"urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "entityUrn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1477,15 +1601,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "entityUrn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -1498,12 +1626,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.age_dist,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "container": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } }, "systemMetadata": { @@ -1515,7 +1643,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.age_dist,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1587,7 +1715,24 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.age_dist,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1605,19 +1750,23 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.age_dist,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" }, { - "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + }, + { + "id": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", + "urn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } ] } @@ -1630,12 +1779,12 @@ }, { "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Items,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "container": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } }, "systemMetadata": { @@ -1647,7 +1796,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Items,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1720,7 +1869,24 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Items,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Items,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1738,19 +1904,23 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Items,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" }, { - "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "id": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", + "urn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } ] } @@ -1763,12 +1933,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Persons,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "container": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } }, "systemMetadata": { @@ -1780,7 +1950,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Persons,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1877,7 +2047,24 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Persons,PROD)", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1895,19 +2082,23 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Persons,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" }, { - "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "id": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", + "urn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } ] } @@ -1920,12 +2111,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.SalesReason,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "container": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } }, "systemMetadata": { @@ -1937,7 +2128,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.SalesReason,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -2012,12 +2203,12 @@ { "name": "FK_TempSales_SalesReason", "foreignFields": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Persons,PROD),ID)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD),ID)" ], "sourceFields": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD),TempID)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.SalesReason,PROD),TempID)" ], - "foreignDataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Persons,PROD)" + "foreignDataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD)" } ] } @@ -2033,7 +2224,24 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.SalesReason,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": 
"urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.SalesReason,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2051,19 +2259,23 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.SalesReason,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" }, { - "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "id": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", + "urn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } ] } @@ -2076,12 +2288,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "container": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } }, "systemMetadata": { @@ -2093,7 +2305,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -2103,8 +2315,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" }, "name": "PersonsView", "tags": [] @@ -2192,7 +2404,24 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2210,7 +2439,7 @@ }, { "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", "changeType": "UPSERT", "aspectName": "viewProperties", "aspect": { @@ -2228,19 +2457,23 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" }, { - "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + }, + { + "id": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", + "urn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } ] } @@ -2253,7 +2486,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", "changeType": "UPSERT", "aspectName": "dataFlowInfo", "aspect": { @@ -2269,9 +2502,26 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { @@ -2282,8 +2532,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-05 16:44:43.800000", - "date_modified": "2024-12-05 16:44:43.800000" + "date_created": "2024-12-19 12:34:45.660000", + "date_modified": "2024-12-19 12:34:45.660000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2300,7 +2550,24 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { @@ -2310,8 +2577,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-05 16:44:43.803000", - "date_modified": "2024-12-05 16:44:43.803000" + "date_created": "2024-12-19 12:34:45.667000", + "date_modified": "2024-12-19 12:34:45.667000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2326,14 +2593,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -2344,13 +2628,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "guest" @@ -2367,7 +2652,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2383,12 +2668,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -2399,7 +2685,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", "changeType": "UPSERT", 
"aspectName": "subTypes", "aspect": { @@ -2417,15 +2703,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -2438,12 +2728,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:63c0319e212536168ec5b7dce2b7da2f", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -2454,13 +2744,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:63c0319e212536168ec5b7dce2b7da2f", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "INFORMATION_SCHEMA" @@ -2477,7 +2768,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:63c0319e212536168ec5b7dce2b7da2f", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2493,12 +2784,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:63c0319e212536168ec5b7dce2b7da2f", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -2509,7 +2801,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:63c0319e212536168ec5b7dce2b7da2f", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2527,15 +2819,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:63c0319e212536168ec5b7dce2b7da2f", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -2548,12 +2844,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:b0e2ef63fa03ab69f77b60844124ec97", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": 
"urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -2564,13 +2860,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:b0e2ef63fa03ab69f77b60844124ec97", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "sys" @@ -2587,7 +2884,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:b0e2ef63fa03ab69f77b60844124ec97", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2603,12 +2900,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:b0e2ef63fa03ab69f77b60844124ec97", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -2619,7 +2917,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:b0e2ef63fa03ab69f77b60844124ec97", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2637,15 +2935,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:b0e2ef63fa03ab69f77b60844124ec97", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -2658,7 +2960,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -2669,7 +2971,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.demodata.foo.persons,PROD)", "type": "VIEW" } ] @@ -2683,7 +2985,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2699,7 +3001,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2715,7 +3017,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2731,7 +3033,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2747,7 +3049,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/sql_server/source_files/mssql_to_file.yml b/metadata-ingestion/tests/integration/sql_server/source_files/mssql_to_file.yml index 40bef3ff104a39..e003ec39cd5282 100644 --- a/metadata-ingestion/tests/integration/sql_server/source_files/mssql_to_file.yml +++ b/metadata-ingestion/tests/integration/sql_server/source_files/mssql_to_file.yml @@ -7,6 +7,7 @@ source: password: test!Password database: DemoData host_port: localhost:21433 + platform_instance: my-instance # use_odbc: True # uri_args: # driver: "ODBC Driver 17 for SQL Server" From acb76cd97c8fc104b5c26a438db862a8d5e87705 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Thu, 19 Dec 2024 20:26:58 +0100 Subject: [PATCH 15/35] fix(tableau): prevents warning in case of site admin creator role (#12175) --- .../src/datahub/ingestion/source/tableau/tableau.py | 2 +- .../datahub/ingestion/source/tableau/tableau_constant.py | 4 +++- .../ingestion/source/tableau/tableau_server_wrapper.py | 8 ++++++-- .../ingestion/source/tableau/tableau_validation.py | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 6cc2220d90fd93..7838e5fa256b85 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -645,7 +645,7 @@ def report_user_role(report: TableauSourceReport, server: Server) -> None: # the site-role might be different on another site logged_in_user: UserInfo = UserInfo.from_server(server=server) - if not logged_in_user.is_site_administrator_explorer(): + if not logged_in_user.has_site_administrator_explorer_privileges(): report.warning( title=title, message=message, diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py index ea0878143ef354..d69312f803021a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py @@ -82,4 +82,6 @@ SITE = "Site" IS_UNSUPPORTED_CUSTOM_SQL = "isUnsupportedCustomSql" SITE_PERMISSION = "sitePermission" -SITE_ROLE = "SiteAdministratorExplorer" +ROLE_SITE_ADMIN_EXPLORER = "SiteAdministratorExplorer" +ROLE_SITE_ADMIN_CREATOR = "SiteAdministratorCreator" +ROLE_SERVER_ADMIN = "ServerAdministrator" diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py 
b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py index f309622d12b91b..482140a227511a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py @@ -11,8 +11,12 @@ class UserInfo: site_role: str site_id: str - def is_site_administrator_explorer(self): - return self.site_role == c.SITE_ROLE + def has_site_administrator_explorer_privileges(self): + return self.site_role in [ + c.ROLE_SITE_ADMIN_EXPLORER, + c.ROLE_SITE_ADMIN_CREATOR, + c.ROLE_SERVER_ADMIN, + ] @staticmethod def from_server(server: Server) -> "UserInfo": diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py index 4a703faf6091b3..4ec0e5ef01d3c6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py @@ -28,7 +28,7 @@ def check_user_role( try: # TODO: Add check for `Enable Derived Permissions` - if not logged_in_user.is_site_administrator_explorer(): + if not logged_in_user.has_site_administrator_explorer_privileges(): capability_dict[c.SITE_PERMISSION] = CapabilityReport( capable=False, failure_reason=f"{failure_reason} Their current role is {logged_in_user.site_role}.", From eceb799e634aa19340dbfe9da51714311f401996 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Fri, 20 Dec 2024 08:37:21 +0100 Subject: [PATCH 16/35] fix(tableau): restart server object when reauthenticating (#12182) Co-authored-by: Harshal Sheth --- .../src/datahub/ingestion/source/tableau/tableau.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 7838e5fa256b85..fadcb8ff8f3966 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -896,10 +896,9 @@ def dataset_browse_prefix(self) -> str: return f"/{self.config.env.lower()}{self.no_env_browse_prefix}" def _re_authenticate(self): - tableau_auth: Union[ - TableauAuth, PersonalAccessTokenAuth - ] = self.config.get_tableau_auth(self.site_id) - self.server.auth.sign_in(tableau_auth) + # Sign-in again may not be enough because Tableau sometimes caches invalid sessions + # so we need to recreate the Tableau Server object + self.server = self.config.make_tableau_client(self.site_id) @property def site_content_url(self) -> Optional[str]: From 66df362c0f7f10f5f0230054977410c3f1eb688a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Fri, 20 Dec 2024 09:57:53 +0100 Subject: [PATCH 17/35] fix(dagster): support dagster v1.9.6 (#12189) --- .../src/datahub_dagster_plugin/client/dagster_generator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py index 2fdd0a41edf6cb..a87f490f2d947e 100644 --- a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py +++ b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py @@ -522,7 
+522,7 @@ def generate_datajob( # Also, add datahub inputs/outputs if present in input/output metatdata. for input_def_snap in op_def_snap.input_def_snaps: job_property_bag[f"input.{input_def_snap.name}"] = str( - input_def_snap._asdict() + input_def_snap.__dict__ ) if Constant.DATAHUB_INPUTS in input_def_snap.metadata: datajob.inlets.extend( @@ -533,7 +533,7 @@ def generate_datajob( for output_def_snap in op_def_snap.output_def_snaps: job_property_bag[f"output_{output_def_snap.name}"] = str( - output_def_snap._asdict() + output_def_snap.__dict__ ) if ( Constant.DATAHUB_OUTPUTS in output_def_snap.metadata From 42d4254cdcc13b10e4955bfabff83bf09e56c0dd Mon Sep 17 00:00:00 2001 From: kevinkarchacryl Date: Fri, 20 Dec 2024 04:30:59 -0500 Subject: [PATCH 18/35] fix(graphql): add suspended to corpuserstatus (#12185) --- datahub-graphql-core/src/main/resources/entity.graphql | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index 926cd256a5c5a4..e086273068ee53 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -3838,6 +3838,11 @@ enum CorpUserStatus { A User that has been provisioned and logged in """ ACTIVE + + """ + A user that has been suspended + """ + SUSPENDED } union ResolvedActor = CorpUser | CorpGroup From f4f9bd3bca62beb15741493b11003642cd5a6889 Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Fri, 20 Dec 2024 17:45:43 +0530 Subject: [PATCH 19/35] =?UTF-8?q?feat(ingest/snowflake):=20include=20exter?= =?UTF-8?q?nal=20table=20ddl=20lineage=20for=20queries=E2=80=A6=20(#12179)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../source/snowflake/snowflake_lineage_v2.py | 55 ++----------------- .../source/snowflake/snowflake_queries.py | 3 - .../source/snowflake/snowflake_schema_gen.py | 54 +++++++++++++++++- .../source/snowflake/snowflake_v2.py | 51 ++++++++--------- .../source_report/ingestion_stage.py | 1 + 5 files changed, 80 insertions(+), 84 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index c769c6705ac3f6..69f28a0e6e595a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -265,64 +265,17 @@ def _populate_external_upstreams(self, discovered_tables: List[str]) -> None: with PerfTimer() as timer: self.report.num_external_table_edges_scanned = 0 - for ( - known_lineage_mapping - ) in self._populate_external_lineage_from_copy_history(discovered_tables): - self.sql_aggregator.add(known_lineage_mapping) - logger.info( - "Done populating external lineage from copy history. " - f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far." - ) - - for ( - known_lineage_mapping - ) in self._populate_external_lineage_from_show_query(discovered_tables): - self.sql_aggregator.add(known_lineage_mapping) - - logger.info( - "Done populating external lineage from show external tables. " - f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far." 
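The dagster compatibility fix above swaps input_def_snap._asdict() for input_def_snap.__dict__, presumably because the snapshot classes in newer dagster releases are no longer NamedTuples. A version-tolerant sketch of that serialization step (a hypothetical helper, not part of the plugin):

    def snap_to_str(snap) -> str:
        """Stringify a dagster snapshot object across dagster versions."""
        # Older snapshots were NamedTuples and exposed _asdict(); newer ones
        # are plain objects, so fall back to their attribute dict.
        if hasattr(snap, "_asdict"):
            return str(snap._asdict())
        return str(vars(snap))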
- ) + for entry in self._get_copy_history_lineage(discovered_tables): + self.sql_aggregator.add(entry) + logger.info("Done populating external lineage from copy history. ") self.report.external_lineage_queries_secs = timer.elapsed_seconds() - # Handles the case for explicitly created external tables. - # NOTE: Snowflake does not log this information to the access_history table. - def _populate_external_lineage_from_show_query( - self, discovered_tables: List[str] - ) -> Iterable[KnownLineageMapping]: - external_tables_query: str = SnowflakeQuery.show_external_tables() - try: - for db_row in self.connection.query(external_tables_query): - key = self.identifiers.get_dataset_identifier( - db_row["name"], db_row["schema_name"], db_row["database_name"] - ) - - if key not in discovered_tables: - continue - if db_row["location"].startswith("s3://"): - yield KnownLineageMapping( - upstream_urn=make_s3_urn_for_lineage( - db_row["location"], self.config.env - ), - downstream_urn=self.identifiers.gen_dataset_urn(key), - ) - self.report.num_external_table_edges_scanned += 1 - - self.report.num_external_table_edges_scanned += 1 - except Exception as e: - logger.debug(e, exc_info=e) - self.structured_reporter.warning( - "Error populating external table lineage from Snowflake", - exc=e, - ) - self.report_status(EXTERNAL_LINEAGE, False) - # Handles the case where a table is populated from an external stage/s3 location via copy. # Eg: copy into category_english from @external_s3_stage; # Eg: copy into category_english from 's3://acryl-snow-demo-olist/olist_raw_data/category_english'credentials=(aws_key_id='...' aws_secret_key='...') pattern='.*.csv'; # NOTE: Snowflake does not log this information to the access_history table. - def _populate_external_lineage_from_copy_history( + def _get_copy_history_lineage( self, discovered_tables: List[str] ) -> Iterable[KnownLineageMapping]: query: str = SnowflakeQuery.copy_lineage_history( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 2d2bdc50467c64..174aad0bddd4a8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -247,9 +247,6 @@ def get_workunits_internal( for entry in self.fetch_copy_history(): queries.append(entry) - # TODO: Add "show external tables" lineage to the main schema extractor. - # Because it's not a time-based thing, it doesn't really make sense in the snowflake-queries extractor. 
- with self.report.query_log_fetch_timer: for entry in self.fetch_query_log(): queries.append(entry) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index bc64693b6a1084..4b72b09fafe2dd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -16,6 +16,7 @@ ClassificationHandler, classification_workunit_processor, ) +from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage from datahub.ingestion.source.common.subtypes import ( DatasetContainerSubTypes, DatasetSubTypes, @@ -35,6 +36,7 @@ ) from datahub.ingestion.source.snowflake.snowflake_data_reader import SnowflakeDataReader from datahub.ingestion.source.snowflake.snowflake_profiler import SnowflakeProfiler +from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report from datahub.ingestion.source.snowflake.snowflake_schema import ( SCHEMA_PARALLELISM, @@ -65,6 +67,7 @@ get_domain_wu, ) from datahub.ingestion.source_report.ingestion_stage import ( + EXTERNAL_TABLE_DDL_LINEAGE, METADATA_EXTRACTION, PROFILING, ) @@ -96,7 +99,10 @@ TimeType, ) from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties -from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator +from datahub.sql_parsing.sql_parsing_aggregator import ( + KnownLineageMapping, + SqlParsingAggregator, +) from datahub.utilities.registries.domain_registry import DomainRegistry from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor @@ -180,7 +186,8 @@ def __init__( # These are populated as side-effects of get_workunits_internal. self.databases: List[SnowflakeDatabase] = [] - self.aggregator: Optional[SqlParsingAggregator] = aggregator + + self.aggregator = aggregator def get_connection(self) -> SnowflakeConnection: return self.connection @@ -212,6 +219,19 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION) yield from self._process_database(snowflake_db) + self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE) + discovered_tables: List[str] = [ + self.identifiers.get_dataset_identifier( + table_name, schema.name, db.name + ) + for db in self.databases + for schema in db.schemas + for table_name in schema.tables + ] + if self.aggregator: + for entry in self._external_tables_ddl_lineage(discovered_tables): + self.aggregator.add(entry) + except SnowflakePermissionError as e: self.structured_reporter.failure( GENERIC_PERMISSION_ERROR_KEY, @@ -1082,3 +1102,33 @@ def get_fk_constraints_for_table( # Access to table but none of its constraints - is this possible ? return constraints.get(table_name, []) + + # Handles the case for explicitly created external tables. + # NOTE: Snowflake does not log this information to the access_history table. 
+ def _external_tables_ddl_lineage( + self, discovered_tables: List[str] + ) -> Iterable[KnownLineageMapping]: + external_tables_query: str = SnowflakeQuery.show_external_tables() + try: + for db_row in self.connection.query(external_tables_query): + key = self.identifiers.get_dataset_identifier( + db_row["name"], db_row["schema_name"], db_row["database_name"] + ) + + if key not in discovered_tables: + continue + if db_row["location"].startswith("s3://"): + yield KnownLineageMapping( + upstream_urn=make_s3_urn_for_lineage( + db_row["location"], self.config.env + ), + downstream_urn=self.identifiers.gen_dataset_urn(key), + ) + self.report.num_external_table_edges_scanned += 1 + + self.report.num_external_table_edges_scanned += 1 + except Exception as e: + self.structured_reporter.warning( + "External table ddl lineage extraction failed", + exc=e, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index e5883dd0349a3a..884e6c49f5b62a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -161,35 +161,32 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): # For database, schema, tables, views, etc self.data_dictionary = SnowflakeDataDictionary(connection=self.connection) self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None - self.aggregator: Optional[SqlParsingAggregator] = None - - if self.config.use_queries_v2 or self.config.include_table_lineage: - self.aggregator = self._exit_stack.enter_context( - SqlParsingAggregator( - platform=self.identifiers.platform, - platform_instance=self.config.platform_instance, - env=self.config.env, - graph=self.ctx.graph, - eager_graph_load=( - # If we're ingestion schema metadata for tables/views, then we will populate - # schemas into the resolver as we go. We only need to do a bulk fetch - # if we're not ingesting schema metadata as part of ingestion. - not ( - self.config.include_technical_schema - and self.config.include_tables - and self.config.include_views - ) - and not self.config.lazy_schema_resolver - ), - generate_usage_statistics=False, - generate_operations=False, - format_queries=self.config.format_sql_queries, - ) + + self.aggregator: SqlParsingAggregator = self._exit_stack.enter_context( + SqlParsingAggregator( + platform=self.identifiers.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + graph=self.ctx.graph, + eager_graph_load=( + # If we're ingestion schema metadata for tables/views, then we will populate + # schemas into the resolver as we go. We only need to do a bulk fetch + # if we're not ingesting schema metadata as part of ingestion. 
+ not ( + self.config.include_technical_schema + and self.config.include_tables + and self.config.include_views + ) + and not self.config.lazy_schema_resolver + ), + generate_usage_statistics=False, + generate_operations=False, + format_queries=self.config.format_sql_queries, ) - self.report.sql_aggregator = self.aggregator.report + ) + self.report.sql_aggregator = self.aggregator.report if self.config.include_table_lineage: - assert self.aggregator is not None redundant_lineage_run_skip_handler: Optional[ RedundantLineageRunSkipHandler ] = None @@ -487,8 +484,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: databases = schema_extractor.databases - # TODO: The checkpoint state for stale entity detection can be committed here. - if self.config.shares: yield from SnowflakeSharesHandler( self.config, self.report diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py index 4308b405e46e37..92407eaae6e901 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py @@ -14,6 +14,7 @@ USAGE_EXTRACTION_INGESTION = "Usage Extraction Ingestion" USAGE_EXTRACTION_OPERATIONAL_STATS = "Usage Extraction Operational Stats" USAGE_EXTRACTION_USAGE_AGGREGATION = "Usage Extraction Usage Aggregation" +EXTERNAL_TABLE_DDL_LINEAGE = "External table DDL Lineage" QUERIES_EXTRACTION = "Queries Extraction" PROFILING = "Profiling" From 157013949e32dc664eb85127ca3b3c78c936e88f Mon Sep 17 00:00:00 2001 From: deepgarg-visa <149145061+deepgarg-visa@users.noreply.github.com> Date: Fri, 20 Dec 2024 21:42:10 +0530 Subject: [PATCH 20/35] fix(gms): Change names of charts in Analytics (#12192) --- .../datahub/graphql/analytics/resolver/GetChartsResolver.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java index 197ac87c1e22d8..d9b8008d46286a 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java @@ -125,7 +125,7 @@ private AnalyticsChart getTopUsersChart(OperationContext opContext) { final DateRange trailingMonthDateRange = dateUtil.getTrailingMonthDateRange(); final List columns = ImmutableList.of("Name", "Title", "Email"); - final String topUsersTitle = "Top Users"; + final String topUsersTitle = "Top Users (Last 30 Days)"; final List topUserRows = _analyticsService.getTopNTableChart( _analyticsService.getUsageIndexName(), @@ -198,7 +198,7 @@ private Row buildNewUsersRow(@Nonnull final SearchEntity entity) { private AnalyticsChart getNewUsersChart(OperationContext opContext) { try { final List columns = ImmutableList.of("Name", "Title", "Email"); - final String newUsersTitle = "New Users"; + final String newUsersTitle = "Active Users (Last 30 Days)"; final SearchResult result = searchForNewUsers(opContext); final List newUserRows = new ArrayList<>(); for (SearchEntity entity : result.getEntities()) { From e52a4deba8a6d436093257437cb3ae5d6148e4f8 Mon Sep 17 00:00:00 2001 From: skrydal Date: Fri, 20 Dec 2024 17:41:18 +0100 Subject: [PATCH 21/35] fix(ingest/databricks): Fix profiling (#12060) --- 
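The profiling fix below guards against the GMS hard payload limit by trimming oversized datasetProfile and schemaMetadata aspects before emission. The size estimate it relies on can be sketched in a few lines, using only names that appear in the diff itself (INGEST_MAX_PAYLOAD_BYTES, pre_json_transform, and the generated classes' to_obj()):

    import json

    from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
    from datahub.emitter.serialization_helper import pre_json_transform

    def aspect_json_size(aspect) -> int:
        # Approximate serialized size, the same way the new processor sizes schema fields.
        return len(json.dumps(pre_json_transform(aspect.to_obj())))

    # A source could then trim sample values or drop fields whenever
    # aspect_json_size(profile) > INGEST_MAX_PAYLOAD_BYTES.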
.../src/datahub/emitter/rest_emitter.py | 17 +- .../auto_ensure_aspect_size.py | 96 +++++ .../datahub/ingestion/source/unity/source.py | 4 + .../source_helpers/test_ensure_aspect_size.py | 346 ++++++++++++++++++ 4 files changed, 462 insertions(+), 1 deletion(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py create mode 100644 metadata-ingestion/tests/unit/api/source_helpers/test_ensure_aspect_size.py diff --git a/metadata-ingestion/src/datahub/emitter/rest_emitter.py b/metadata-ingestion/src/datahub/emitter/rest_emitter.py index e2bc14925ad383..675717b5ec4829 100644 --- a/metadata-ingestion/src/datahub/emitter/rest_emitter.py +++ b/metadata-ingestion/src/datahub/emitter/rest_emitter.py @@ -291,6 +291,7 @@ def emit_mcps( mcps: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]], async_flag: Optional[bool] = None, ) -> int: + logger.debug("Attempting to emit batch mcps") url = f"{self._gms_server}/aspects?action=ingestProposalBatch" for mcp in mcps: ensure_has_system_metadata(mcp) @@ -303,15 +304,22 @@ def emit_mcps( current_chunk_size = INGEST_MAX_PAYLOAD_BYTES for mcp_obj in mcp_objs: mcp_obj_size = len(json.dumps(mcp_obj)) + logger.debug( + f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}" + ) if ( mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH ): + logger.debug("Decided to create new chunk") mcp_obj_chunks.append([]) current_chunk_size = 0 mcp_obj_chunks[-1].append(mcp_obj) current_chunk_size += mcp_obj_size + logger.debug( + f"Decided to send {len(mcps)} mcps in {len(mcp_obj_chunks)} chunks" + ) for mcp_obj_chunk in mcp_obj_chunks: # TODO: We're calling json.dumps on each MCP object twice, once to estimate @@ -338,8 +346,15 @@ def emit_usage(self, usageStats: UsageAggregation) -> None: def _emit_generic(self, url: str, payload: str) -> None: curl_command = make_curl_command(self._session, "POST", url, payload) + payload_size = len(payload) + if payload_size > INGEST_MAX_PAYLOAD_BYTES: + # since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail + logger.warning( + f"Apparent payload size exceeded {INGEST_MAX_PAYLOAD_BYTES}, might fail with an exception due to the size" + ) logger.debug( - "Attempting to emit to DataHub GMS; using curl equivalent to:\n%s", + "Attempting to emit aspect (size: %s) to DataHub GMS; using curl equivalent to:\n%s", + payload_size, curl_command, ) try: diff --git a/metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py b/metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py new file mode 100644 index 00000000000000..559f0b77f59dfa --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py @@ -0,0 +1,96 @@ +import json +import logging +from typing import Iterable, List + +from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES +from datahub.emitter.serialization_helper import pre_json_transform +from datahub.ingestion.api.source import SourceReport +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.metadata.schema_classes import ( + DatasetProfileClass, + SchemaFieldClass, + SchemaMetadataClass, +) + +logger = logging.getLogger(__name__) + + +class EnsureAspectSizeProcessor: + def __init__( + self, report: 
SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES + ): + self.report = report + self.payload_constraint = payload_constraint + + def ensure_dataset_profile_size( + self, dataset_urn: str, profile: DatasetProfileClass + ) -> None: + """ + This is quite arbitrary approach to ensuring dataset profile aspect does not exceed allowed size, might be adjusted + in the future + """ + sample_fields_size = 0 + if profile.fieldProfiles: + logger.debug(f"Length of field profiles: {len(profile.fieldProfiles)}") + for field in profile.fieldProfiles: + if field.sampleValues: + values_len = 0 + for value in field.sampleValues: + if value: + values_len += len(value) + logger.debug( + f"Field {field.fieldPath} has {len(field.sampleValues)} sample values, taking total bytes {values_len}" + ) + if sample_fields_size + values_len > self.payload_constraint: + field.sampleValues = [] + self.report.warning( + title="Dataset profile truncated due to size constraint", + message="Dataset profile contained too much data and would have caused ingestion to fail", + context=f"Sample values for field {field.fieldPath} were removed from dataset profile for {dataset_urn} due to aspect size constraints", + ) + else: + sample_fields_size += values_len + else: + logger.debug(f"Field {field.fieldPath} has no sample values") + + def ensure_schema_metadata_size( + self, dataset_urn: str, schema: SchemaMetadataClass + ) -> None: + """ + This is quite arbitrary approach to ensuring schema metadata aspect does not exceed allowed size, might be adjusted + in the future + """ + total_fields_size = 0 + logger.debug(f"Amount of schema fields: {len(schema.fields)}") + accepted_fields: List[SchemaFieldClass] = [] + for field in schema.fields: + field_size = len(json.dumps(pre_json_transform(field.to_obj()))) + logger.debug(f"Field {field.fieldPath} takes total {field_size}") + if total_fields_size + field_size < self.payload_constraint: + accepted_fields.append(field) + total_fields_size += field_size + else: + self.report.warning( + title="Schema truncated due to size constraint", + message="Dataset schema contained too much data and would have caused ingestion to fail", + context=f"Field {field.fieldPath} was removed from schema for {dataset_urn} due to aspect size constraints", + ) + + schema.fields = accepted_fields + + def ensure_aspect_size( + self, + stream: Iterable[MetadataWorkUnit], + ) -> Iterable[MetadataWorkUnit]: + """ + We have hard limitation of aspect size being 16 MB. Some aspects can exceed that value causing an exception + on GMS side and failure of the entire ingestion. This processor will attempt to trim suspected aspects. 
+ """ + for wu in stream: + logger.debug(f"Ensuring size of workunit: {wu.id}") + + if schema := wu.get_aspect_of_type(SchemaMetadataClass): + self.ensure_schema_metadata_size(wu.get_urn(), schema) + elif profile := wu.get_aspect_of_type(DatasetProfileClass): + self.ensure_dataset_profile_size(wu.get_urn(), profile) + yield wu diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py index 9d9a746580f939..7bfa7fdb28aaf8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py @@ -26,6 +26,9 @@ gen_containers, ) from datahub.emitter.sql_parsing_builder import SqlParsingBuilder +from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import ( + EnsureAspectSizeProcessor, +) from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( SupportStatus, @@ -260,6 +263,7 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: StaleEntityRemovalHandler.create( self, self.config, self.ctx ).workunit_processor, + EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size, ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: diff --git a/metadata-ingestion/tests/unit/api/source_helpers/test_ensure_aspect_size.py b/metadata-ingestion/tests/unit/api/source_helpers/test_ensure_aspect_size.py new file mode 100644 index 00000000000000..bdf1e0a2e0e860 --- /dev/null +++ b/metadata-ingestion/tests/unit/api/source_helpers/test_ensure_aspect_size.py @@ -0,0 +1,346 @@ +import json +import time +from unittest.mock import patch + +import pytest +from freezegun.api import freeze_time + +from datahub.emitter.aspect import JSON_CONTENT_TYPE +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES +from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import ( + EnsureAspectSizeProcessor, +) +from datahub.ingestion.api.source import SourceReport +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent +from datahub.metadata.schema_classes import ( + ChangeTypeClass, + DatasetFieldProfileClass, + DatasetProfileClass, + DatasetSnapshotClass, + GenericAspectClass, + MetadataChangeProposalClass, + NumberTypeClass, + OtherSchemaClass, + SchemaFieldClass, + SchemaFieldDataTypeClass, + SchemaMetadataClass, + StatusClass, + StringTypeClass, + SubTypesClass, +) + + +@pytest.fixture +def processor(): + return EnsureAspectSizeProcessor(SourceReport()) + + +def too_big_schema_metadata() -> SchemaMetadataClass: + fields = [ + SchemaFieldClass( + "aaaa", + nativeDataType="int", + type=SchemaFieldDataTypeClass(type=NumberTypeClass()), + ), + SchemaFieldClass( + "bbbb", + nativeDataType="string", + type=SchemaFieldDataTypeClass(type=StringTypeClass()), + ), + SchemaFieldClass( + "cccc", + nativeDataType="int", + type=SchemaFieldDataTypeClass(type=NumberTypeClass()), + ), + ] + # simple int type field takes ~160 bytes in JSON representation, below is to assure we exceed the threshold + for f_no in range(1000): + fields.append( + SchemaFieldClass( + fieldPath=f"t{f_no}", + nativeDataType="int", + type=SchemaFieldDataTypeClass(type=NumberTypeClass()), + description=20000 * "a", + ) + ) + + # adding small field to check whether it will still be present in the output + fields.append( + SchemaFieldClass( 
+ "dddd", + nativeDataType="int", + type=SchemaFieldDataTypeClass(type=NumberTypeClass()), + ) + ) + return SchemaMetadataClass( + schemaName="abcdef", + version=1, + platform="s3", + hash="ABCDE1234567890", + platformSchema=OtherSchemaClass(rawSchema="aaa"), + fields=fields, + ) + + +def proper_schema_metadata() -> SchemaMetadataClass: + fields = [ + SchemaFieldClass( + "aaaa", + nativeDataType="int", + type=SchemaFieldDataTypeClass(type=NumberTypeClass()), + ), + SchemaFieldClass( + "bbbb", + nativeDataType="string", + type=SchemaFieldDataTypeClass(type=StringTypeClass()), + ), + SchemaFieldClass( + "cccc", + nativeDataType="int", + type=SchemaFieldDataTypeClass(type=NumberTypeClass()), + ), + ] + return SchemaMetadataClass( + schemaName="abcdef", + version=1, + platform="s3", + hash="ABCDE1234567890", + platformSchema=OtherSchemaClass(rawSchema="aaa"), + fields=fields, + ) + + +def proper_dataset_profile() -> DatasetProfileClass: + sample_values = [ + "23483295", + "234234", + "324234", + "12123", + "3150314", + "19231", + "211", + "93498", + "12837", + "73847", + "12434", + "33466", + "98785", + "4546", + "4547", + "342", + "11", + "34", + "444", + "38576", + ] + field_profiles = [ + DatasetFieldProfileClass(fieldPath="a", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="b", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="c", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="d", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="e", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="f", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="g", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="h", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="i", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="j", sampleValues=sample_values), + ] + return DatasetProfileClass( + timestampMillis=int(time.time()) * 1000, fieldProfiles=field_profiles + ) + + +@freeze_time("2023-01-02 00:00:00") +def test_ensure_size_of_proper_dataset_profile(processor): + profile = proper_dataset_profile() + orig_repr = json.dumps(profile.to_obj()) + processor.ensure_dataset_profile_size( + "urn:li:dataset:(s3, dummy_dataset, DEV)", profile + ) + assert orig_repr == json.dumps( + profile.to_obj() + ), "Aspect was modified in case where workunit processor should have been no-op" + + +@freeze_time("2023-01-02 00:00:00") +def test_ensure_size_of_too_big_schema_metadata(processor): + schema = too_big_schema_metadata() + assert len(schema.fields) == 1004 + + processor.ensure_schema_metadata_size( + "urn:li:dataset:(s3, dummy_dataset, DEV)", schema + ) + assert len(schema.fields) < 1004, "Schema has not been properly truncated" + assert schema.fields[-1].fieldPath == "dddd", "Small field was not added at the end" + # +100kb is completely arbitrary, but we are truncating the aspect based on schema fields size only, not total taken + # by other parameters of the aspect - it is reasonable approach though - schema fields is the only field in schema + # metadata which can be expected to grow out of control + assert ( + len(json.dumps(schema.to_obj())) < INGEST_MAX_PAYLOAD_BYTES + 100000 + ), "Aspect exceeded acceptable size" + + +@freeze_time("2023-01-02 00:00:00") +def test_ensure_size_of_proper_schema_metadata(processor): + schema = proper_schema_metadata() + orig_repr = json.dumps(schema.to_obj()) + processor.ensure_schema_metadata_size( + "urn:li:dataset:(s3, dummy_dataset, 
DEV)", schema + ) + assert orig_repr == json.dumps( + schema.to_obj() + ), "Aspect was modified in case where workunit processor should have been no-op" + + +@freeze_time("2023-01-02 00:00:00") +def test_ensure_size_of_too_big_dataset_profile(processor): + profile = proper_dataset_profile() + big_field = DatasetFieldProfileClass( + fieldPath="big", + sampleValues=20 * [(int(INGEST_MAX_PAYLOAD_BYTES / 20) - 10) * "a"], + ) + assert profile.fieldProfiles + profile.fieldProfiles.insert(4, big_field) + processor.ensure_dataset_profile_size( + "urn:li:dataset:(s3, dummy_dataset, DEV)", profile + ) + + expected_profile = proper_dataset_profile() + reduced_field = DatasetFieldProfileClass( + fieldPath="big", + sampleValues=[], + ) + assert expected_profile.fieldProfiles + expected_profile.fieldProfiles.insert(4, reduced_field) + assert json.dumps(profile.to_obj()) == json.dumps( + expected_profile.to_obj() + ), "Field 'big' was not properly removed from aspect due to its size" + + +@freeze_time("2023-01-02 00:00:00") +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_schema_metadata_size" +) +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_dataset_profile_size" +) +def test_wu_processor_triggered_by_data_profile_aspect( + ensure_dataset_profile_size_mock, ensure_schema_metadata_size_mock, processor +): + ret = [ # noqa: F841 + *processor.ensure_aspect_size( + [ + MetadataChangeProposalWrapper( + entityUrn="urn:li:dataset:(urn:li:dataPlatform:s3, dummy_name, DEV)", + aspect=proper_dataset_profile(), + ).as_workunit() + ] + ) + ] + ensure_dataset_profile_size_mock.assert_called_once() + ensure_schema_metadata_size_mock.assert_not_called() + + +@freeze_time("2023-01-02 00:00:00") +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_schema_metadata_size" +) +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_dataset_profile_size" +) +def test_wu_processor_triggered_by_data_profile_aspect_mcpc( + ensure_dataset_profile_size_mock, ensure_schema_metadata_size_mock, processor +): + profile_aspect = proper_dataset_profile() + mcpc = MetadataWorkUnit( + id="test", + mcp_raw=MetadataChangeProposalClass( + entityType="dataset", + changeType=ChangeTypeClass.UPSERT, + entityUrn="urn:li:dataset:(urn:li:dataPlatform:s3, dummy_name, DEV)", + aspectName=DatasetProfileClass.ASPECT_NAME, + aspect=GenericAspectClass( + value=json.dumps(profile_aspect.to_obj()).encode(), + contentType=JSON_CONTENT_TYPE, + ), + ), + ) + ret = [*processor.ensure_aspect_size([mcpc])] # noqa: F841 + ensure_dataset_profile_size_mock.assert_called_once() + ensure_schema_metadata_size_mock.assert_not_called() + + +@freeze_time("2023-01-02 00:00:00") +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_schema_metadata_size" +) +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_dataset_profile_size" +) +def test_wu_processor_triggered_by_data_profile_aspect_mce( + ensure_dataset_profile_size_mock, ensure_schema_metadata_size_mock, processor +): + snapshot = DatasetSnapshotClass( + urn="urn:li:dataset:(urn:li:dataPlatform:s3, dummy_name, DEV)", + aspects=[proper_schema_metadata()], + ) + mce = MetadataWorkUnit( + id="test", mce=MetadataChangeEvent(proposedSnapshot=snapshot) + ) + ret = 
[*processor.ensure_aspect_size([mce])] # noqa: F841 + ensure_schema_metadata_size_mock.assert_called_once() + ensure_dataset_profile_size_mock.assert_not_called() + + +@freeze_time("2023-01-02 00:00:00") +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_schema_metadata_size" +) +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_dataset_profile_size" +) +def test_wu_processor_triggered_by_schema_metadata_aspect( + ensure_dataset_profile_size_mock, ensure_schema_metadata_size_mock, processor +): + ret = [ # noqa: F841 + *processor.ensure_aspect_size( + [ + MetadataChangeProposalWrapper( + entityUrn="urn:li:dataset:(urn:li:dataPlatform:s3, dummy_name, DEV)", + aspect=proper_schema_metadata(), + ).as_workunit() + ] + ) + ] + ensure_schema_metadata_size_mock.assert_called_once() + ensure_dataset_profile_size_mock.assert_not_called() + + +@freeze_time("2023-01-02 00:00:00") +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_schema_metadata_size" +) +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_dataset_profile_size" +) +def test_wu_processor_not_triggered_by_unhandled_aspects( + ensure_dataset_profile_size_mock, ensure_schema_metadata_size_mock, processor +): + ret = [ # noqa: F841 + *processor.ensure_aspect_size( + [ + MetadataChangeProposalWrapper( + entityUrn="urn:li:dataset:(urn:li:dataPlatform:s3, dummy_name, DEV)", + aspect=StatusClass(removed=False), + ).as_workunit(), + MetadataChangeProposalWrapper( + entityUrn="urn:li:dataset:(urn:li:dataPlatform:s3, dummy_name, DEV)", + aspect=SubTypesClass(typeNames=["table"]), + ).as_workunit(), + ] + ) + ] + ensure_schema_metadata_size_mock.assert_not_called() + ensure_dataset_profile_size_mock.assert_not_called() From 98c056d569d4e5f2fa031a5a3ac8f3009ee49567 Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Sat, 21 Dec 2024 00:36:57 +0530 Subject: [PATCH 22/35] refactor(ingest/tableau): mark the `fetch_size` configuration as deprecated (#12126) --- .../ingestion/source/tableau/tableau.py | 18 +++++++++++------- .../integration/tableau/test_tableau_ingest.py | 1 + 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index fadcb8ff8f3966..984cf9357199d6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -49,6 +49,7 @@ DatasetSourceConfigMixin, ) from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated +from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import ( ContainerKey, @@ -380,11 +381,6 @@ class TableauConfig( description="[advanced] Number of metadata objects (e.g. CustomSQLTable, PublishedDatasource, etc) to query at a time using the Tableau API.", ) - fetch_size: int = Field( - default=250, - description="Specifies the number of records to retrieve in each batch during a query execution.", - ) - # We've found that even with a small workbook page size (e.g. 
10), the Tableau API often # returns warnings like this: # { @@ -499,6 +495,10 @@ class TableauConfig( "This can only be used with ingest_tags enabled as it will overwrite tags entered from the UI.", ) + _fetch_size = pydantic_removed_field( + "fetch_size", + ) + # pre = True because we want to take some decision before pydantic initialize the configuration to default values @root_validator(pre=True) def projects_backward_compatibility(cls, values: Dict) -> Dict: @@ -1147,7 +1147,7 @@ def get_connection_object_page( connection_type: str, query_filter: str, current_cursor: Optional[str], - fetch_size: int = 250, + fetch_size: int, retry_on_auth_error: bool = True, retries_remaining: Optional[int] = None, ) -> Tuple[dict, Optional[str], int]: @@ -1344,7 +1344,11 @@ def get_connection_objects( connection_type=connection_type, query_filter=filter_, current_cursor=current_cursor, - fetch_size=self.config.fetch_size, + # `filter_page` contains metadata object IDs (e.g., Project IDs, Field IDs, Sheet IDs, etc.). + # The number of IDs is always less than or equal to page_size. + # If the IDs are primary keys, the number of metadata objects to load matches the number of records to return. + # In our case, mostly, the IDs are primary key, therefore, fetch_size is set equal to page_size. + fetch_size=page_size, ) yield from connection_objects.get(c.NODES) or [] diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 4b2ac96931b950..fa00eaef9ccabb 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -1324,6 +1324,7 @@ def test_permission_warning(pytestconfig, tmp_path, mock_datahub_graph): query_filter=mock.MagicMock(), current_cursor=None, retries_remaining=1, + fetch_size=10, ) warnings = list(reporter.warnings) From 3c3d0322fe9608ccf7cbaadfd83f6f7f0e7afeff Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Sat, 21 Dec 2024 01:27:34 +0530 Subject: [PATCH 23/35] test(ingest/tableau): add test for extract_project_hierarchy scenario (#12079) --- .../tableau/test_tableau_ingest.py | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index fa00eaef9ccabb..c3a8880bf20a09 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -27,6 +27,7 @@ from datahub.ingestion.source.tableau import tableau_constant as c from datahub.ingestion.source.tableau.tableau import ( TableauConfig, + TableauProject, TableauSiteSource, TableauSource, TableauSourceReport, @@ -1342,6 +1343,82 @@ def test_permission_warning(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) +@pytest.mark.parametrize( + "extract_project_hierarchy, allowed_projects", + [ + (True, ["project1", "project4", "project3"]), + (False, ["project1", "project4"]), + ], +) +def test_extract_project_hierarchy(extract_project_hierarchy, allowed_projects): + context = PipelineContext(run_id="0", pipeline_name="test_tableau") + + config_dict = config_source_default.copy() + + del config_dict["stateful_ingestion"] + del config_dict["projects"] + + config_dict["project_pattern"] = { + "allow": ["project1", "project4"], + "deny": ["project2"], + } + + 
config_dict["extract_project_hierarchy"] = extract_project_hierarchy + + config = TableauConfig.parse_obj(config_dict) + + site_source = TableauSiteSource( + config=config, + ctx=context, + platform="tableau", + site=SiteItem(name="Site 1", content_url="site1"), + site_id="site1", + report=TableauSourceReport(), + server=Server("https://test-tableau-server.com"), + ) + + all_project_map: Dict[str, TableauProject] = { + "p1": TableauProject( + id="1", + name="project1", + path=[], + parent_id=None, + parent_name=None, + description=None, + ), + "p2": TableauProject( + id="2", + name="project2", + path=[], + parent_id="1", + parent_name="project1", + description=None, + ), + "p3": TableauProject( + id="3", + name="project3", + path=[], + parent_id="1", + parent_name="project1", + description=None, + ), + "p4": TableauProject( + id="4", + name="project4", + path=[], + parent_id=None, + parent_name=None, + description=None, + ), + } + + site_source._init_tableau_project_registry(all_project_map) + + assert allowed_projects == [ + project.name for project in site_source.tableau_project_registry.values() + ] + + @pytest.mark.integration def test_connection_report_test(requests_mock): server_info_response = """ From 667fa8fccec40037c55ec1c99a35777dbc0e5eaf Mon Sep 17 00:00:00 2001 From: "nicholas.fwang" Date: Sat, 21 Dec 2024 04:59:44 +0900 Subject: [PATCH 24/35] docs(structured properties): fix entityTypes in creating structured property (#12187) --- docs/api/tutorials/structured-properties.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api/tutorials/structured-properties.md b/docs/api/tutorials/structured-properties.md index 95c89424e9ca7a..2caa015e206595 100644 --- a/docs/api/tutorials/structured-properties.md +++ b/docs/api/tutorials/structured-properties.md @@ -73,7 +73,7 @@ mutation createStructuredProperty { {numberValue: 365, description:"Use this for non-sensitive data that can be retained for longer"} ], cardinality: SINGLE, - entityTypes: ["urn:li:entityType:dataset", "urn:li:entityType:dataFlow"], + entityTypes: ["urn:li:entityType:datahub.dataset", "urn:li:entityType:datahub.dataFlow"], } ) { urn From 327c6f911ada269d8ad9554bceed8aaf16568295 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Fri, 20 Dec 2024 15:59:07 -0600 Subject: [PATCH 25/35] chore(bump): bump alpine and dockerize (#12184) --- .../docker-custom-build-and-push/action.yml | 3 +- .github/workflows/docker-postgres-setup.yml | 2 +- .github/workflows/docker-unified.yml | 46 +++++++++---------- docker/datahub-gms/Dockerfile | 4 +- docker/datahub-mae-consumer/Dockerfile | 4 +- docker/datahub-mce-consumer/Dockerfile | 4 +- docker/datahub-upgrade/Dockerfile | 4 +- docker/elasticsearch-setup/Dockerfile | 4 +- docker/mysql-setup/Dockerfile | 4 +- docker/postgres-setup/Dockerfile | 4 +- 10 files changed, 40 insertions(+), 39 deletions(-) diff --git a/.github/actions/docker-custom-build-and-push/action.yml b/.github/actions/docker-custom-build-and-push/action.yml index ccaff510c120aa..cc2c2bd86416d7 100644 --- a/.github/actions/docker-custom-build-and-push/action.yml +++ b/.github/actions/docker-custom-build-and-push/action.yml @@ -97,10 +97,11 @@ runs: cache-to: | type=inline - name: Upload image locally for testing (if not publishing) - uses: ishworkh/docker-image-artifact-upload@v1 + uses: ishworkh/container-image-artifact-upload@v2.0.0 if: ${{ inputs.publish != 'true' }} with: image: ${{ steps.single_tag.outputs.SINGLE_TAG }} + retention_days: "2" 
# Code for building multi-platform images and pushing to Docker Hub. - name: Set up QEMU diff --git a/.github/workflows/docker-postgres-setup.yml b/.github/workflows/docker-postgres-setup.yml index 956f3f7b1c3903..c028bfb55d48d5 100644 --- a/.github/workflows/docker-postgres-setup.yml +++ b/.github/workflows/docker-postgres-setup.yml @@ -52,7 +52,7 @@ jobs: with: images: | acryldata/datahub-postgres-setup - tags: ${{ needs.setup.outputs.tag }} + image_tag: ${{ needs.setup.outputs.tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} publish: ${{ needs.setup.outputs.publish == 'true' }} diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 49dd26e1cd27e3..16a2d29e9fd85e 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -186,7 +186,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_GMS_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -257,7 +257,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_MAE_CONSUMER_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -328,7 +328,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_MCE_CONSUMER_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -399,7 +399,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_UPGRADE_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -472,7 +472,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: actions/checkout@v4 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_FRONTEND_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -533,7 +533,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_KAFKA_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -594,7 +594,7 
@@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_MYSQL_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -655,7 +655,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_ELASTIC_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -727,7 +727,7 @@ jobs: - name: Check out the repo uses: acryldata/sane-checkout-action@v3 - name: Download Base Image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' && needs.setup.outputs.ingestion_base_change == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }} @@ -775,7 +775,7 @@ jobs: - name: Check out the repo uses: acryldata/sane-checkout-action@v3 - name: Download Base Image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' && needs.setup.outputs.ingestion_base_change == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }} @@ -836,7 +836,7 @@ jobs: if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish =='true' }} run: ./gradlew :metadata-ingestion:codegen - name: Download Base Image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' && needs.setup.outputs.ingestion_base_change == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }} @@ -883,7 +883,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image Slim Image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} @@ -937,7 +937,7 @@ jobs: if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} run: ./gradlew :metadata-ingestion:codegen - name: Download Base Image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' && 
needs.setup.outputs.ingestion_base_change == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }} @@ -982,7 +982,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image Full Image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.datahub_ingestion_full_build.outputs.needs_artifact_download == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.tag }} @@ -1079,47 +1079,47 @@ jobs: - name: Disk Check run: df -h . && docker images - name: Download GMS image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.gms_build.result == 'success' }} with: image: ${{ env.DATAHUB_GMS_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download Frontend image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.frontend_build.result == 'success' }} with: image: ${{ env.DATAHUB_FRONTEND_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download Kafka Setup image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.kafka_setup_build.result == 'success' }} with: image: ${{ env.DATAHUB_KAFKA_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download Mysql Setup image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.mysql_setup_build.result == 'success' }} with: image: ${{ env.DATAHUB_MYSQL_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download Elastic Setup image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.elasticsearch_setup_build.result == 'success' }} with: image: ${{ env.DATAHUB_ELASTIC_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download MCE Consumer image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.mce_consumer_build.result == 'success' }} with: image: ${{ env.DATAHUB_MCE_CONSUMER_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download MAE Consumer image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.mae_consumer_build.result == 'success' }} with: image: ${{ env.DATAHUB_MAE_CONSUMER_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download upgrade image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( 
needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.datahub_upgrade_build.result == 'success' }} with: image: ${{ env.DATAHUB_UPGRADE_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download datahub-ingestion-slim image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' && needs.datahub_ingestion_slim_build.result == 'success' }} with: image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} diff --git a/docker/datahub-gms/Dockerfile b/docker/datahub-gms/Dockerfile index b15bf3c6f9f17b..47b10535f8deea 100644 --- a/docker/datahub-gms/Dockerfile +++ b/docker/datahub-gms/Dockerfile @@ -6,12 +6,12 @@ ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine ARG GITHUB_REPO_URL=https://github.com ARG MAVEN_CENTRAL_REPO_URL=https://repo1.maven.org/maven2 -FROM golang:1-alpine3.20 AS binary +FROM golang:1-alpine3.21 AS binary # Re-declaring arg from above to make it available in this stage (will inherit default value) ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION=v0.6.1 +ENV DOCKERIZE_VERSION=v0.9.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk diff --git a/docker/datahub-mae-consumer/Dockerfile b/docker/datahub-mae-consumer/Dockerfile index 6edaa29ee1a8bb..74375072761d89 100644 --- a/docker/datahub-mae-consumer/Dockerfile +++ b/docker/datahub-mae-consumer/Dockerfile @@ -6,12 +6,12 @@ ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine ARG GITHUB_REPO_URL=https://github.com ARG MAVEN_CENTRAL_REPO_URL=https://repo1.maven.org/maven2 -FROM golang:1-alpine3.20 AS binary +FROM golang:1-alpine3.21 AS binary # Re-declaring arg from above to make it available in this stage (will inherit default value) ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION=v0.6.1 +ENV DOCKERIZE_VERSION=v0.9.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk diff --git a/docker/datahub-mce-consumer/Dockerfile b/docker/datahub-mce-consumer/Dockerfile index 1eb56633c561e6..3adef53cd06068 100644 --- a/docker/datahub-mce-consumer/Dockerfile +++ b/docker/datahub-mce-consumer/Dockerfile @@ -6,12 +6,12 @@ ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine ARG GITHUB_REPO_URL=https://github.com ARG MAVEN_CENTRAL_REPO_URL=https://repo1.maven.org/maven2 -FROM golang:1-alpine3.20 AS binary +FROM golang:1-alpine3.21 AS binary # Re-declaring arg from above to make it available in this stage (will inherit default value) ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION=v0.6.1 +ENV DOCKERIZE_VERSION=v0.9.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk diff --git a/docker/datahub-upgrade/Dockerfile b/docker/datahub-upgrade/Dockerfile index 3d59a903414b1a..a8ef4e8034fdd5 100644 --- a/docker/datahub-upgrade/Dockerfile +++ b/docker/datahub-upgrade/Dockerfile @@ -6,12 +6,12 @@ ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine ARG GITHUB_REPO_URL=https://github.com ARG MAVEN_CENTRAL_REPO_URL=https://repo1.maven.org/maven2 -FROM golang:1-alpine3.20 AS binary +FROM golang:1-alpine3.21 AS binary # Re-declaring arg from above to make it available in this stage (will inherit default value) ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION=v0.6.1 +ENV DOCKERIZE_VERSION=v0.9.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk diff --git a/docker/elasticsearch-setup/Dockerfile b/docker/elasticsearch-setup/Dockerfile index 
4e64dcbc1e452c..1a6fe5bee6c840 100644 --- a/docker/elasticsearch-setup/Dockerfile +++ b/docker/elasticsearch-setup/Dockerfile @@ -6,11 +6,11 @@ ARG APP_ENV=prod # Defining custom repo urls for use in enterprise environments. Re-used between stages below. ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine -FROM golang:1-alpine3.20 AS binary +FROM golang:1-alpine3.21 AS binary ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION=v0.6.1 +ENV DOCKERIZE_VERSION=v0.9.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk diff --git a/docker/mysql-setup/Dockerfile b/docker/mysql-setup/Dockerfile index b0ca45ad8f6f24..8a2d42bc233180 100644 --- a/docker/mysql-setup/Dockerfile +++ b/docker/mysql-setup/Dockerfile @@ -1,11 +1,11 @@ # Defining custom repo urls for use in enterprise environments. Re-used between stages below. ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine -FROM golang:1-alpine3.20 AS binary +FROM golang:1-alpine3.21 AS binary ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION=v0.6.1 +ENV DOCKERIZE_VERSION=v0.9.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk diff --git a/docker/postgres-setup/Dockerfile b/docker/postgres-setup/Dockerfile index e145456e807d4d..31e9687cea15e8 100644 --- a/docker/postgres-setup/Dockerfile +++ b/docker/postgres-setup/Dockerfile @@ -1,11 +1,11 @@ # Defining custom repo urls for use in enterprise environments. Re-used between stages below. ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine -FROM golang:1-alpine3.20 AS binary +FROM golang:1-alpine3.21 AS binary ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION=v0.6.1 +ENV DOCKERIZE_VERSION=v0.9.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk From f6c0cf34c075e078fe6cf3c2e18e6a8d711cc8db Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Fri, 20 Dec 2024 17:04:58 -0600 Subject: [PATCH 26/35] docs update: Update v_0_3_7.md (#12197) Co-authored-by: Chris Collins --- docs/managed-datahub/release-notes/v_0_3_7.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/managed-datahub/release-notes/v_0_3_7.md b/docs/managed-datahub/release-notes/v_0_3_7.md index 75f5ac21224c27..31302403ea9305 100644 --- a/docs/managed-datahub/release-notes/v_0_3_7.md +++ b/docs/managed-datahub/release-notes/v_0_3_7.md @@ -13,6 +13,12 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies ## Known Issues +### v0.3.7.8 + * Notes Feature + * Adding a Note to an entity will result in that note showing up in the Settings > Home Page list of announcements as well as the profile page of the entity. + * If more than 30 Notes are added to entities, there's a risk that home page announcements will not show up on the home page properly. + * Notes are only supported for Dataset and Column entities in this release. + ### v0.3.7.7 * Postgres regression, non-functional when using postgres @@ -24,7 +30,9 @@ If you are using an older CLI/SDK version, then please upgrade it. 
This applies ### v0.3.7.8 +- Helm Chart Requirement: 1.4.157+ - [Postgres] Fix regression from MySQL fix in v0.3.7.7 +- [UI] Fix editing post on entity profile page becomes announcement ### v0.3.7.7 From 8e9fc20fb6ec57b547c97d433ec5f85b8a3efe9a Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Fri, 20 Dec 2024 20:00:09 -0600 Subject: [PATCH 27/35] feat(gradle): add quickstartPgDebug option (#12195) --- docker/build.gradle | 262 ++++++++++++++++++++++---------------------- 1 file changed, 131 insertions(+), 131 deletions(-) diff --git a/docker/build.gradle b/docker/build.gradle index 25e3dc12036ef9..7b36c0d9acdcf0 100644 --- a/docker/build.gradle +++ b/docker/build.gradle @@ -18,24 +18,131 @@ ext { ':datahub-upgrade', ':metadata-service:war', ] - quickstart_modules = backend_profile_modules + [ - ':metadata-jobs:mce-consumer-job', - ':metadata-jobs:mae-consumer-job', - ':datahub-frontend' + + python_services_modules = [] + + // Common configuration for all tasks + common_config = [ + captureContainersOutput: true, + captureContainersOutputToFiles: project.file('build/container-logs') ] - debug_modules = quickstart_modules - [':metadata-jobs:mce-consumer-job', - ':metadata-jobs:mae-consumer-job'] - compose_args = ['-f', compose_base] - debug_reloadable = [ - 'datahub-gms-debug', - 'system-update-debug', - 'frontend-debug' + // declarative task configuration + quickstart_configs = [ + 'quickstart': [ + profile: 'quickstart-consumers', + modules: python_services_modules + backend_profile_modules + [ + ':datahub-frontend', + ':metadata-jobs:mce-consumer-job', + ':metadata-jobs:mae-consumer-job', + ] + ], + 'quickstartDebug': [ + profile: 'debug', + modules: python_services_modules + backend_profile_modules + [':datahub-frontend'], + isDebug: true + ], + 'quickstartPg': [ + profile: 'quickstart-postgres', + modules: (backend_profile_modules - [':docker:mysql-setup']) + [ + ':docker:postgres-setup', + ':datahub-frontend' + ] + ], + 'quickstartPgDebug': [ + profile: 'debug-postgres', + modules: python_services_modules + (backend_profile_modules - [':docker:mysql-setup']) + [ + ':docker:postgres-setup', + ':datahub-frontend' + ], + isDebug: true + ], + 'quickstartSlim': [ + profile: 'quickstart-backend', + modules: backend_profile_modules + [':docker:datahub-ingestion'], + additionalEnv: [ + 'DATAHUB_ACTIONS_IMAGE': 'acryldata/datahub-ingestion', + 'ACTIONS_VERSION': "v${version}-slim", + 'ACTIONS_EXTRA_PACKAGES': 'acryl-datahub-actions[executor] acryl-datahub-actions', + 'ACTIONS_CONFIG': 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml', + 'DATAHUB_LOCAL_COMMON_ENV': "${rootProject.project(':metadata-integration:java:spark-lineage-legacy').projectDir}/spark-smoke-test/smoke-gms.env" + ] + ], + 'quickstartStorage': [ + profile: 'quickstart-storage', + preserveVolumes: true + ] ] - // Postgres - pg_quickstart_modules = quickstart_modules - [':docker:mysql-setup'] + [':docker:postgres-setup'] +} + +// Register all quickstart tasks +quickstart_configs.each { taskName, config -> + tasks.register(taskName) +} + +// Dynamically create all quickstart tasks and configurations +dockerCompose { + // Configure default settings that apply to all configurations + useComposeFiles = [compose_base] + projectName = project_name + projectNamePrefix = '' + buildBeforeUp = false + buildBeforePull = false + stopContainers = false + removeVolumes = false + + quickstart_configs.each { taskName, config -> + "${taskName}" { + 
isRequiredBy(tasks.named(taskName)) + if (config.profile) { + composeAdditionalArgs = ['--profile', config.profile] + } + + // Common environment variables + environment.put 'DATAHUB_VERSION', config.isDebug ? + System.getenv("DATAHUB_VERSION") ?: "v${version}" : + "v${version}" + environment.put 'DATAHUB_TELEMETRY_ENABLED', 'false' + environment.put "METADATA_TESTS_ENABLED", "true" + environment.put "DATAHUB_REPO", "${docker_registry}" + + // Additional environment variables if specified + if (config.additionalEnv) { + config.additionalEnv.each { key, value -> + environment.put key, value + } + } + + useComposeFiles = [compose_base] + projectName = project_name + projectNamePrefix = '' + buildBeforeUp = false + buildBeforePull = false + stopContainers = false + removeVolumes = false + + // Apply common configuration + common_config.each { key, value -> + delegate."${key}" = value + } + + // Apply additional task-specific configuration if specified + if (config.additionalConfig) { + config.additionalConfig.each { key, value -> + delegate."${key}" = value + } + } + } + } +} - revision = 1 // increment to trigger rebuild +// Configure dependencies for ComposeUp tasks +quickstart_configs.each { taskName, config -> + if (config.modules) { + tasks.getByName("${taskName}ComposeUp").dependsOn( + config.modules.collect { it + ":${config.isDebug ? 'dockerTagDebug' : 'dockerTag'}" } + ) + } } tasks.register('minDockerCompose2.20', Exec) { @@ -43,18 +150,11 @@ tasks.register('minDockerCompose2.20', Exec) { args '-c', 'echo -e "$(docker compose version --short)\n2.20"|sort --version-sort --check=quiet --reverse' } -tasks.register('quickstart') {} -tasks.register('quickstartSlim') {} -tasks.register('quickstartDebug') {} -tasks.register('quickstartPg') {} -tasks.register('quickstartStorage') {} - tasks.register('quickstartNuke') { doFirst { - dockerCompose.quickstart.removeVolumes = true - dockerCompose.quickstartPg.removeVolumes = true - dockerCompose.quickstartSlim.removeVolumes = true - dockerCompose.quickstartDebug.removeVolumes = true + quickstart_configs.each { taskName, config -> + dockerCompose."${taskName}".removeVolumes = !config.preserveVolumes + } } finalizedBy(tasks.withType(ComposeDownForced)) } @@ -63,117 +163,17 @@ tasks.register('quickstartDown') { finalizedBy(tasks.withType(ComposeDownForced)) } -dockerCompose { - quickstart { - isRequiredBy(tasks.named('quickstart')) - composeAdditionalArgs = ['--profile', 'quickstart-consumers'] - - environment.put 'DATAHUB_VERSION', "v${version}" - environment.put 'DATAHUB_TELEMETRY_ENABLED', 'false' // disabled when built locally - - useComposeFiles = [compose_base] - projectName = project_name - projectNamePrefix = '' - buildBeforeUp = false - buildBeforePull = false - stopContainers = false - removeVolumes = false - captureContainersOutput = true - captureContainersOutputToFiles = project.file('build/container-logs') - } - - quickstartPg { - isRequiredBy(tasks.named('quickstartPg')) - composeAdditionalArgs = ['--profile', 'quickstart-postgres'] - - environment.put 'DATAHUB_VERSION', "v${version}" - environment.put 'DATAHUB_TELEMETRY_ENABLED', 'false' // disabled when built locally - - useComposeFiles = [compose_base] - projectName = project_name - projectNamePrefix = '' - buildBeforeUp = false - buildBeforePull = false - stopContainers = false - removeVolumes = false - } - - /** - * The smallest disk footprint required for Spark integration tests - * - * No frontend, mae, mce, or other services - */ - quickstartSlim { - 
isRequiredBy(tasks.named('quickstartSlim')) - composeAdditionalArgs = ['--profile', 'quickstart-backend'] - - environment.put 'DATAHUB_VERSION', "v${version}" - environment.put "DATAHUB_ACTIONS_IMAGE", "acryldata/datahub-ingestion" - environment.put "ACTIONS_VERSION", "v${version}-slim" - environment.put "ACTIONS_EXTRA_PACKAGES", 'acryl-datahub-actions[executor] acryl-datahub-actions' - environment.put "ACTIONS_CONFIG", 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml' - environment.put 'DATAHUB_TELEMETRY_ENABLED', 'false' // disabled when built locally - // disabled for spark-lineage smoke-test - environment.put 'DATAHUB_LOCAL_COMMON_ENV', "${rootProject.project(':metadata-integration:java:spark-lineage-legacy').projectDir}/spark-smoke-test/smoke-gms.env" - - useComposeFiles = [compose_base] - projectName = project_name - projectNamePrefix = '' - buildBeforeUp = false - buildBeforePull = false - stopContainers = false - removeVolumes = false - captureContainersOutput = true - captureContainersOutputToFiles = project.file('build/container-logs') - } - - quickstartDebug { - isRequiredBy(tasks.named('quickstartDebug')) - composeAdditionalArgs = ['--profile', 'debug'] - - if (System.getenv().containsKey("DATAHUB_VERSION")) { - environment.put 'DATAHUB_VERSION', System.getenv("DATAHUB_VERSION") - } - environment.put 'DATAHUB_TELEMETRY_ENABLED', 'false' // disabled when built locally - - useComposeFiles = [compose_base] - projectName = project_name - projectNamePrefix = '' - buildBeforeUp = false - buildBeforePull = false - stopContainers = false - removeVolumes = false - } - - quickstartStorage { - isRequiredBy(tasks.named('quickstartStorage')) - composeAdditionalArgs = ['--profile', 'quickstart-storage'] - - useComposeFiles = [compose_base] - projectName = project_name - projectNamePrefix = '' - buildBeforeUp = false - buildBeforePull = false - stopContainers = false - removeVolumes = false - } -} -tasks.getByName('quickstartComposeUp').dependsOn( - quickstart_modules.collect { it + ':dockerTag' }) -tasks.getByName('quickstartPgComposeUp').dependsOn( - pg_quickstart_modules.collect { it + ':dockerTag' }) -tasks.getByName('quickstartSlimComposeUp').dependsOn( - ([':docker:datahub-ingestion'] + backend_profile_modules) - .collect { it + ':dockerTag' }) -tasks.getByName('quickstartDebugComposeUp').dependsOn( - debug_modules.collect { it + ':dockerTagDebug' } -) tasks.withType(ComposeUp).configureEach { shouldRunAfter('quickstartNuke') dependsOn tasks.named("minDockerCompose2.20") } task debugReload(type: Exec) { - def cmd = ['docker compose -p datahub --profile debug'] + compose_args + ['restart'] + debug_reloadable + def cmd = ['docker compose -p datahub --profile debug'] + ['-f', compose_base] + [ + 'restart', + 'datahub-gms-debug', + 'system-update-debug', + 'frontend-debug' + ] commandLine 'bash', '-c', cmd.join(" ") -} +} \ No newline at end of file From 0b4d96e95c50c3db1fdf8cb65954e1f423c17310 Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Sat, 21 Dec 2024 12:07:53 +0530 Subject: [PATCH 28/35] fix(ingest/powerbi): support comments in m-query grammar (#12177) --- .../powerbi/powerbi-lexical-grammar.rule | 18 ++++++++-- .../integration/powerbi/test_m_parser.py | 36 +++++++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule 
b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule index 51a0dff288558f..f237e2503317f2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule @@ -21,6 +21,11 @@ // | empty_string // | empty_string "," argument_list // - Added sql_string in any_literal +// - Added WS_INLINE? in field expression +// Added to ignore any comments +// %ignore WS // Ignore whitespace +// %ignore CPP_COMMENT // Ignore single-line comments +// %ignore C_COMMENT // Ignore multi-line comments lexical_unit: lexical_elements? @@ -245,6 +250,8 @@ operator_or_punctuator: "," | "=>" | ".." | "..." + | "{{" + | "}}" document: section_document | expression_document @@ -275,6 +282,7 @@ expression: logical_or_expression | if_expression | error_raising_expression | error_handling_expression + | outer_expression logical_or_expression: logical_and_expression @@ -376,6 +384,8 @@ sql_content: /(?:[^\"\\]|\\[\"]|\"\"|\#\(lf\))+/ sql_string: "\"" sql_content "\"" +outer_expression: "{{" expression "}}" + argument_list: WS_INLINE? expression | WS_INLINE? expression WS_INLINE? "," WS_INLINE? argument_list | WS_INLINE? sql_string @@ -409,7 +419,7 @@ record_expression: "[" field_list? "]" field_list: field | field "," field_list -field: field_name WS_INLINE? "=" WS_INLINE? expression +field: WS_INLINE? field_name WS_INLINE? "=" WS_INLINE? expression field_name: generalized_identifier | quoted_identifier @@ -621,4 +631,8 @@ any_literal: record_literal %import common.DIGIT %import common.LF %import common.CR -%import common.ESCAPED_STRING \ No newline at end of file +%import common.ESCAPED_STRING + +%ignore WS // Ignore whitespace +%ignore CPP_COMMENT // Ignore single-line comments +%ignore C_COMMENT // Ignore multi-line comments \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 63821f9038a88c..832d00d9c54702 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -1171,3 +1171,39 @@ def test_m_query_timeout(mock_get_lark_parser): assert ( is_entry_present ), 'Warning message "M-Query Parsing Timeout" should be present in reporter' + + +def test_comments_in_m_query(): + q: str = 'let\n Source = Snowflake.Databases("xaa48144.snowflakecomputing.com", "COMPUTE_WH", [Role="ACCOUNTADMIN"]),\n SNOWFLAKE_SAMPLE_DATA_Database = Source{[Name="SNOWFLAKE_SAMPLE_DATA", Kind="Database"]}[Data],\n TPCDS_SF100TCL_Schema = SNOWFLAKE_SAMPLE_DATA_Database{[Name="TPCDS_SF100TCL", Kind="Schema"]}[Data],\n ITEM_Table = TPCDS_SF100TCL_Schema{[Name="ITEM", Kind="Table"]}[Data],\n \n // Group by I_BRAND and calculate the count\n BrandCountsTable = Table.Group(ITEM_Table, {"I_BRAND"}, {{"BrandCount", each Table.RowCount(_), Int64.Type}})\nin\n BrandCountsTable' + + table: powerbi_data_classes.Table = powerbi_data_classes.Table( + columns=[], + measures=[], + expression=q, + name="pet_price_index", + full_name="datalake.sandbox_pet.pet_price_index", + ) + + reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + parameters={ + "hostname": 
"xyz.databricks.com", + "http_path": "/sql/1.0/warehouses/abc", + "catalog": "cat", + "schema": "public", + }, + )[0].upstreams + + assert len(data_platform_tables) == 1 + assert ( + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpcds_sf100tcl.item,PROD)" + ) From 95b9d1b4c9687c3d505485aa600b5040a2549047 Mon Sep 17 00:00:00 2001 From: Jonny Dixon <45681293+acrylJonny@users.noreply.github.com> Date: Sat, 21 Dec 2024 06:38:59 +0000 Subject: [PATCH 29/35] feat(ingest/aws-common): improved instance profile support (#12139) for ec2, ecs, eks, lambda, beanstalk, app runner and cft roles --- .../ingestion/source/aws/aws_common.py | 258 ++++++++++++-- .../tests/unit/test_aws_common.py | 328 ++++++++++++++++++ 2 files changed, 559 insertions(+), 27 deletions(-) create mode 100644 metadata-ingestion/tests/unit/test_aws_common.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py b/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py index 161aed5bb59881..b76eb95def1ede 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py @@ -1,7 +1,12 @@ +import logging +import os from datetime import datetime, timedelta, timezone -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union +from enum import Enum +from http import HTTPStatus +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union import boto3 +import requests from boto3.session import Session from botocore.config import DEFAULT_TIMEOUT, Config from botocore.utils import fix_s3_host @@ -14,6 +19,8 @@ ) from datahub.configuration.source_common import EnvConfigMixin +logger = logging.getLogger(__name__) + if TYPE_CHECKING: from mypy_boto3_dynamodb import DynamoDBClient from mypy_boto3_glue import GlueClient @@ -22,6 +29,26 @@ from mypy_boto3_sts import STSClient +class AwsEnvironment(Enum): + EC2 = "EC2" + ECS = "ECS" + EKS = "EKS" + LAMBDA = "LAMBDA" + APP_RUNNER = "APP_RUNNER" + BEANSTALK = "ELASTIC_BEANSTALK" + CLOUD_FORMATION = "CLOUD_FORMATION" + UNKNOWN = "UNKNOWN" + + +class AwsServicePrincipal(Enum): + LAMBDA = "lambda.amazonaws.com" + EKS = "eks.amazonaws.com" + APP_RUNNER = "apprunner.amazonaws.com" + ECS = "ecs.amazonaws.com" + ELASTIC_BEANSTALK = "elasticbeanstalk.amazonaws.com" + EC2 = "ec2.amazonaws.com" + + class AwsAssumeRoleConfig(PermissiveConfigModel): # Using the PermissiveConfigModel to allow the user to pass additional arguments. 
@@ -34,6 +61,163 @@ class AwsAssumeRoleConfig(PermissiveConfigModel): ) +def get_instance_metadata_token() -> Optional[str]: + """Get IMDSv2 token""" + try: + response = requests.put( + "http://169.254.169.254/latest/api/token", + headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"}, + timeout=1, + ) + if response.status_code == HTTPStatus.OK: + return response.text + except requests.exceptions.RequestException: + logger.debug("Failed to get IMDSv2 token") + return None + + +def is_running_on_ec2() -> bool: + """Check if code is running on EC2 using IMDSv2""" + token = get_instance_metadata_token() + if not token: + return False + + try: + response = requests.get( + "http://169.254.169.254/latest/meta-data/instance-id", + headers={"X-aws-ec2-metadata-token": token}, + timeout=1, + ) + return response.status_code == HTTPStatus.OK + except requests.exceptions.RequestException: + return False + + +def detect_aws_environment() -> AwsEnvironment: + """ + Detect the AWS environment we're running in. + Order matters as some environments may have multiple indicators. + """ + # Check Lambda first as it's most specific + if os.getenv("AWS_LAMBDA_FUNCTION_NAME"): + if os.getenv("AWS_EXECUTION_ENV", "").startswith("CloudFormation"): + return AwsEnvironment.CLOUD_FORMATION + return AwsEnvironment.LAMBDA + + # Check EKS (IRSA) + if os.getenv("AWS_WEB_IDENTITY_TOKEN_FILE") and os.getenv("AWS_ROLE_ARN"): + return AwsEnvironment.EKS + + # Check App Runner + if os.getenv("AWS_APP_RUNNER_SERVICE_ID"): + return AwsEnvironment.APP_RUNNER + + # Check ECS + if os.getenv("ECS_CONTAINER_METADATA_URI_V4") or os.getenv( + "ECS_CONTAINER_METADATA_URI" + ): + return AwsEnvironment.ECS + + # Check Elastic Beanstalk + if os.getenv("ELASTIC_BEANSTALK_ENVIRONMENT_NAME"): + return AwsEnvironment.BEANSTALK + + if is_running_on_ec2(): + return AwsEnvironment.EC2 + + return AwsEnvironment.UNKNOWN + + +def get_instance_role_arn() -> Optional[str]: + """Get role ARN from EC2 instance metadata using IMDSv2""" + token = get_instance_metadata_token() + if not token: + return None + + try: + response = requests.get( + "http://169.254.169.254/latest/meta-data/iam/security-credentials/", + headers={"X-aws-ec2-metadata-token": token}, + timeout=1, + ) + if response.status_code == 200: + role_name = response.text.strip() + if role_name: + sts = boto3.client("sts") + identity = sts.get_caller_identity() + return identity.get("Arn") + except Exception as e: + logger.debug(f"Failed to get instance role ARN: {e}") + return None + + +def get_lambda_role_arn() -> Optional[str]: + """Get the Lambda function's role ARN""" + try: + function_name = os.getenv("AWS_LAMBDA_FUNCTION_NAME") + if not function_name: + return None + + lambda_client = boto3.client("lambda") + function_config = lambda_client.get_function_configuration( + FunctionName=function_name + ) + return function_config.get("Role") + except Exception as e: + logger.debug(f"Failed to get Lambda role ARN: {e}") + return None + + +def get_current_identity() -> Tuple[Optional[str], Optional[str]]: + """ + Get the current role ARN and source type based on the runtime environment. 
+ Returns (role_arn, credential_source) + """ + env = detect_aws_environment() + + if env == AwsEnvironment.LAMBDA: + role_arn = get_lambda_role_arn() + return role_arn, AwsServicePrincipal.LAMBDA.value + + elif env == AwsEnvironment.EKS: + role_arn = os.getenv("AWS_ROLE_ARN") + return role_arn, AwsServicePrincipal.EKS.value + + elif env == AwsEnvironment.APP_RUNNER: + try: + sts = boto3.client("sts") + identity = sts.get_caller_identity() + return identity.get("Arn"), AwsServicePrincipal.APP_RUNNER.value + except Exception as e: + logger.debug(f"Failed to get App Runner role: {e}") + + elif env == AwsEnvironment.ECS: + try: + metadata_uri = os.getenv("ECS_CONTAINER_METADATA_URI_V4") or os.getenv( + "ECS_CONTAINER_METADATA_URI" + ) + if metadata_uri: + response = requests.get(f"{metadata_uri}/task", timeout=1) + if response.status_code == HTTPStatus.OK: + task_metadata = response.json() + if "TaskARN" in task_metadata: + return ( + task_metadata.get("TaskARN"), + AwsServicePrincipal.ECS.value, + ) + except Exception as e: + logger.debug(f"Failed to get ECS task role: {e}") + + elif env == AwsEnvironment.BEANSTALK: + # Beanstalk uses EC2 instance metadata + return get_instance_role_arn(), AwsServicePrincipal.ELASTIC_BEANSTALK.value + + elif env == AwsEnvironment.EC2: + return get_instance_role_arn(), AwsServicePrincipal.EC2.value + + return None, None + + def assume_role( role: AwsAssumeRoleConfig, aws_region: Optional[str], @@ -95,7 +279,7 @@ class AwsConnectionConfig(ConfigModel): ) aws_profile: Optional[str] = Field( default=None, - description="Named AWS profile to use. Only used if access key / secret are unset. If not set the default will be used", + description="The [named profile](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-profiles.html) to use from AWS credentials. Falls back to default profile if not specified and no access keys provided. Profiles are configured in ~/.aws/credentials or ~/.aws/config.", ) aws_region: Optional[str] = Field(None, description="AWS region code.") @@ -145,6 +329,7 @@ def _normalized_aws_roles(self) -> List[AwsAssumeRoleConfig]: def get_session(self) -> Session: if self.aws_access_key_id and self.aws_secret_access_key: + # Explicit credentials take precedence session = Session( aws_access_key_id=self.aws_access_key_id, aws_secret_access_key=self.aws_secret_access_key, @@ -152,38 +337,57 @@ def get_session(self) -> Session: region_name=self.aws_region, ) elif self.aws_profile: + # Named profile is second priority session = Session( region_name=self.aws_region, profile_name=self.aws_profile ) else: - # Use boto3's credential autodetection. + # Use boto3's credential autodetection session = Session(region_name=self.aws_region) - if self._normalized_aws_roles(): - # Use existing session credentials to start the chain of role assumption. - current_credentials = session.get_credentials() - credentials = { - "AccessKeyId": current_credentials.access_key, - "SecretAccessKey": current_credentials.secret_key, - "SessionToken": current_credentials.token, - } - - for role in self._normalized_aws_roles(): - if self._should_refresh_credentials(): - credentials = assume_role( - role, - self.aws_region, - credentials=credentials, + target_roles = self._normalized_aws_roles() + if target_roles: + current_role_arn, credential_source = get_current_identity() + + # Only assume role if: + # 1. We're not in a known AWS environment with a role, or + # 2. 
We need to assume a different role than our current one + should_assume_role = current_role_arn is None or any( + role.RoleArn != current_role_arn for role in target_roles + ) + + if should_assume_role: + env = detect_aws_environment() + logger.debug(f"Assuming role(s) from {env.value} environment") + + current_credentials = session.get_credentials() + if current_credentials is None: + raise ValueError("No credentials available for role assumption") + + credentials = { + "AccessKeyId": current_credentials.access_key, + "SecretAccessKey": current_credentials.secret_key, + "SessionToken": current_credentials.token, + } + + for role in target_roles: + if self._should_refresh_credentials(): + credentials = assume_role( + role=role, + aws_region=self.aws_region, + credentials=credentials, + ) + if isinstance(credentials["Expiration"], datetime): + self._credentials_expiration = credentials["Expiration"] + + session = Session( + aws_access_key_id=credentials["AccessKeyId"], + aws_secret_access_key=credentials["SecretAccessKey"], + aws_session_token=credentials["SessionToken"], + region_name=self.aws_region, ) - if isinstance(credentials["Expiration"], datetime): - self._credentials_expiration = credentials["Expiration"] - - session = Session( - aws_access_key_id=credentials["AccessKeyId"], - aws_secret_access_key=credentials["SecretAccessKey"], - aws_session_token=credentials["SessionToken"], - region_name=self.aws_region, - ) + else: + logger.debug(f"Using existing role from {credential_source}") return session diff --git a/metadata-ingestion/tests/unit/test_aws_common.py b/metadata-ingestion/tests/unit/test_aws_common.py new file mode 100644 index 00000000000000..9291fb91134b1c --- /dev/null +++ b/metadata-ingestion/tests/unit/test_aws_common.py @@ -0,0 +1,328 @@ +import json +import os +from unittest.mock import MagicMock, patch + +import boto3 +import pytest +from moto import mock_iam, mock_lambda, mock_sts + +from datahub.ingestion.source.aws.aws_common import ( + AwsConnectionConfig, + AwsEnvironment, + detect_aws_environment, + get_current_identity, + get_instance_metadata_token, + get_instance_role_arn, + is_running_on_ec2, +) + + +@pytest.fixture +def mock_aws_config(): + return AwsConnectionConfig( + aws_access_key_id="test-key", + aws_secret_access_key="test-secret", + aws_region="us-east-1", + ) + + +class TestAwsCommon: + def test_environment_detection_no_environment(self): + """Test environment detection when no AWS environment is present""" + with patch.dict(os.environ, {}, clear=True): + assert detect_aws_environment() == AwsEnvironment.UNKNOWN + + def test_environment_detection_lambda(self): + """Test Lambda environment detection""" + with patch.dict(os.environ, {"AWS_LAMBDA_FUNCTION_NAME": "test-function"}): + assert detect_aws_environment() == AwsEnvironment.LAMBDA + + def test_environment_detection_lambda_cloudformation(self): + """Test CloudFormation Lambda environment detection""" + with patch.dict( + os.environ, + { + "AWS_LAMBDA_FUNCTION_NAME": "test-function", + "AWS_EXECUTION_ENV": "CloudFormation.xxx", + }, + ): + assert detect_aws_environment() == AwsEnvironment.CLOUD_FORMATION + + def test_environment_detection_eks(self): + """Test EKS environment detection""" + with patch.dict( + os.environ, + { + "AWS_WEB_IDENTITY_TOKEN_FILE": "/var/run/secrets/token", + "AWS_ROLE_ARN": "arn:aws:iam::123456789012:role/test-role", + }, + ): + assert detect_aws_environment() == AwsEnvironment.EKS + + def test_environment_detection_app_runner(self): + """Test App Runner environment 
detection""" + with patch.dict(os.environ, {"AWS_APP_RUNNER_SERVICE_ID": "service-id"}): + assert detect_aws_environment() == AwsEnvironment.APP_RUNNER + + def test_environment_detection_ecs(self): + """Test ECS environment detection""" + with patch.dict( + os.environ, {"ECS_CONTAINER_METADATA_URI_V4": "http://169.254.170.2/v4"} + ): + assert detect_aws_environment() == AwsEnvironment.ECS + + def test_environment_detection_beanstalk(self): + """Test Elastic Beanstalk environment detection""" + with patch.dict(os.environ, {"ELASTIC_BEANSTALK_ENVIRONMENT_NAME": "my-env"}): + assert detect_aws_environment() == AwsEnvironment.BEANSTALK + + @patch("requests.put") + def test_ec2_metadata_token(self, mock_put): + """Test EC2 metadata token retrieval""" + mock_put.return_value.status_code = 200 + mock_put.return_value.text = "token123" + + token = get_instance_metadata_token() + assert token == "token123" + + mock_put.assert_called_once_with( + "http://169.254.169.254/latest/api/token", + headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"}, + timeout=1, + ) + + @patch("requests.put") + def test_ec2_metadata_token_failure(self, mock_put): + """Test EC2 metadata token failure case""" + mock_put.return_value.status_code = 404 + + token = get_instance_metadata_token() + assert token is None + + @patch("requests.get") + @patch("requests.put") + def test_is_running_on_ec2(self, mock_put, mock_get): + """Test EC2 instance detection with IMDSv2""" + mock_put.return_value.status_code = 200 + mock_put.return_value.text = "token123" + mock_get.return_value.status_code = 200 + + assert is_running_on_ec2() is True + + mock_put.assert_called_once_with( + "http://169.254.169.254/latest/api/token", + headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"}, + timeout=1, + ) + mock_get.assert_called_once_with( + "http://169.254.169.254/latest/meta-data/instance-id", + headers={"X-aws-ec2-metadata-token": "token123"}, + timeout=1, + ) + + @patch("requests.get") + @patch("requests.put") + def test_is_running_on_ec2_failure(self, mock_put, mock_get): + """Test EC2 instance detection failure""" + mock_put.return_value.status_code = 404 + assert is_running_on_ec2() is False + + mock_put.return_value.status_code = 200 + mock_put.return_value.text = "token123" + mock_get.return_value.status_code = 404 + assert is_running_on_ec2() is False + + @mock_sts + @mock_lambda + @mock_iam + def test_get_current_identity_lambda(self): + """Test getting identity in Lambda environment""" + with patch.dict( + os.environ, + { + "AWS_LAMBDA_FUNCTION_NAME": "test-function", + "AWS_DEFAULT_REGION": "us-east-1", + }, + ): + # Create IAM role first with proper trust policy + iam_client = boto3.client("iam", region_name="us-east-1") + trust_policy = { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": {"Service": "lambda.amazonaws.com"}, + "Action": "sts:AssumeRole", + } + ], + } + iam_client.create_role( + RoleName="test-role", AssumeRolePolicyDocument=json.dumps(trust_policy) + ) + + lambda_client = boto3.client("lambda", region_name="us-east-1") + lambda_client.create_function( + FunctionName="test-function", + Runtime="python3.8", + Role="arn:aws:iam::123456789012:role/test-role", + Handler="index.handler", + Code={"ZipFile": b"def handler(event, context): pass"}, + ) + + role_arn, source = get_current_identity() + assert source == "lambda.amazonaws.com" + assert role_arn == "arn:aws:iam::123456789012:role/test-role" + + @patch("requests.get") + @patch("requests.put") + @mock_sts + def 
test_get_instance_role_arn_success(self, mock_put, mock_get): + """Test getting EC2 instance role ARN""" + mock_put.return_value.status_code = 200 + mock_put.return_value.text = "token123" + mock_get.return_value.status_code = 200 + mock_get.return_value.text = "test-role" + + with patch("boto3.client") as mock_boto: + mock_sts = MagicMock() + mock_sts.get_caller_identity.return_value = { + "Arn": "arn:aws:sts::123456789012:assumed-role/test-role/instance" + } + mock_boto.return_value = mock_sts + + role_arn = get_instance_role_arn() + assert ( + role_arn == "arn:aws:sts::123456789012:assumed-role/test-role/instance" + ) + + @mock_sts + def test_aws_connection_config_basic(self, mock_aws_config): + """Test basic AWS connection configuration""" + session = mock_aws_config.get_session() + creds = session.get_credentials() + assert creds.access_key == "test-key" + assert creds.secret_key == "test-secret" + + @mock_sts + def test_aws_connection_config_with_session_token(self): + """Test AWS connection with session token""" + config = AwsConnectionConfig( + aws_access_key_id="test-key", + aws_secret_access_key="test-secret", + aws_session_token="test-token", + aws_region="us-east-1", + ) + + session = config.get_session() + creds = session.get_credentials() + assert creds.token == "test-token" + + @mock_sts + def test_aws_connection_config_role_assumption(self): + """Test AWS connection with role assumption""" + config = AwsConnectionConfig( + aws_access_key_id="test-key", + aws_secret_access_key="test-secret", + aws_region="us-east-1", + aws_role="arn:aws:iam::123456789012:role/test-role", + ) + + with patch( + "datahub.ingestion.source.aws.aws_common.get_current_identity" + ) as mock_identity: + mock_identity.return_value = (None, None) + session = config.get_session() + creds = session.get_credentials() + assert creds is not None + + @mock_sts + def test_aws_connection_config_skip_role_assumption(self): + """Test AWS connection skipping role assumption when already in role""" + config = AwsConnectionConfig( + aws_region="us-east-1", + aws_role="arn:aws:iam::123456789012:role/current-role", + ) + + with patch( + "datahub.ingestion.source.aws.aws_common.get_current_identity" + ) as mock_identity: + mock_identity.return_value = ( + "arn:aws:iam::123456789012:role/current-role", + "ec2.amazonaws.com", + ) + session = config.get_session() + assert session is not None + + @mock_sts + def test_aws_connection_config_multiple_roles(self): + """Test AWS connection with multiple role assumption""" + config = AwsConnectionConfig( + aws_access_key_id="test-key", + aws_secret_access_key="test-secret", + aws_region="us-east-1", + aws_role=[ + "arn:aws:iam::123456789012:role/role1", + "arn:aws:iam::123456789012:role/role2", + ], + ) + + with patch( + "datahub.ingestion.source.aws.aws_common.get_current_identity" + ) as mock_identity: + mock_identity.return_value = (None, None) + session = config.get_session() + assert session is not None + + def test_aws_connection_config_validation_error(self): + """Test AWS connection validation""" + with patch.dict( + "os.environ", + { + "AWS_ACCESS_KEY_ID": "test-key", + # Deliberately missing AWS_SECRET_ACCESS_KEY + "AWS_DEFAULT_REGION": "us-east-1", + }, + clear=True, + ): + config = AwsConnectionConfig() # Let it pick up from environment + session = config.get_session() + with pytest.raises( + Exception, + match="Partial credentials found in env, missing: AWS_SECRET_ACCESS_KEY", + ): + session.get_credentials() + + @pytest.mark.parametrize( + 
"env_vars,expected_environment", + [ + ({}, AwsEnvironment.UNKNOWN), + ({"AWS_LAMBDA_FUNCTION_NAME": "test"}, AwsEnvironment.LAMBDA), + ( + { + "AWS_LAMBDA_FUNCTION_NAME": "test", + "AWS_EXECUTION_ENV": "CloudFormation", + }, + AwsEnvironment.CLOUD_FORMATION, + ), + ( + { + "AWS_WEB_IDENTITY_TOKEN_FILE": "/token", + "AWS_ROLE_ARN": "arn:aws:iam::123:role/test", + }, + AwsEnvironment.EKS, + ), + ({"AWS_APP_RUNNER_SERVICE_ID": "service-123"}, AwsEnvironment.APP_RUNNER), + ( + {"ECS_CONTAINER_METADATA_URI_V4": "http://169.254.170.2"}, + AwsEnvironment.ECS, + ), + ( + {"ELASTIC_BEANSTALK_ENVIRONMENT_NAME": "my-env"}, + AwsEnvironment.BEANSTALK, + ), + ], + ) + def test_environment_detection_parametrized(self, env_vars, expected_environment): + """Parametrized test for environment detection with different configurations""" + with patch.dict(os.environ, env_vars, clear=True): + assert detect_aws_environment() == expected_environment From 8350a4e03ac9a259bb21e295c173972fd74d5f6f Mon Sep 17 00:00:00 2001 From: Jonny Dixon <45681293+acrylJonny@users.noreply.github.com> Date: Sat, 21 Dec 2024 07:52:27 +0000 Subject: [PATCH 30/35] feat(ingest/hive): lineage from/to file storage (#11841) Co-authored-by: Aseem Bansal --- .../src/datahub/ingestion/source/sql/hive.py | 614 +++++++++++++++++- 1 file changed, 606 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py index 59f301baf40165..fad54fda453786 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py @@ -1,7 +1,10 @@ import json import logging import re -from typing import Any, Dict, Iterable, List, Optional, Union +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from urllib.parse import urlparse from pydantic.class_validators import validator from pydantic.fields import Field @@ -11,7 +14,12 @@ from pyhive.sqlalchemy_hive import HiveDate, HiveDecimal, HiveDialect, HiveTimestamp from sqlalchemy.engine.reflection import Inspector -from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mce_builder import ( + make_data_platform_urn, + make_dataplatform_instance_urn, + make_dataset_urn_with_platform_instance, + make_schema_field_urn, +) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.decorators import ( SourceCapability, @@ -29,14 +37,24 @@ TwoTierSQLAlchemyConfig, TwoTierSQLAlchemySource, ) -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( + DataPlatformInstanceClass, + DatasetLineageTypeClass, + DatasetPropertiesClass, DateTypeClass, + FineGrainedLineageClass, + FineGrainedLineageDownstreamTypeClass, + FineGrainedLineageUpstreamTypeClass, NullTypeClass, NumberTypeClass, - SchemaField, + OtherSchemaClass, + SchemaFieldClass, + SchemaMetadataClass, TimeTypeClass, + UpstreamClass, + UpstreamLineageClass, + ViewPropertiesClass, ) -from datahub.metadata.schema_classes import ViewPropertiesClass from datahub.utilities import config_clean from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column @@ -46,6 +64,511 @@ register_custom_type(HiveTimestamp, TimeTypeClass) register_custom_type(HiveDecimal, NumberTypeClass) + +class StoragePlatform(Enum): + """Enumeration of storage platforms supported for lineage""" + + S3 = 
"s3" + AZURE = "abs" + GCS = "gcs" + DBFS = "dbfs" + LOCAL = "file" + HDFS = "hdfs" + + +# Mapping of URL schemes to storage platforms +STORAGE_SCHEME_MAPPING = { + # S3 and derivatives + "s3": StoragePlatform.S3, + "s3a": StoragePlatform.S3, + "s3n": StoragePlatform.S3, + # Azure and derivatives + "abfs": StoragePlatform.AZURE, + "abfss": StoragePlatform.AZURE, + "adl": StoragePlatform.AZURE, + "adls": StoragePlatform.AZURE, + "wasb": StoragePlatform.AZURE, + "wasbs": StoragePlatform.AZURE, + # GCS and derivatives + "gs": StoragePlatform.GCS, + "gcs": StoragePlatform.GCS, + # DBFS + "dbfs": StoragePlatform.DBFS, + # Local filesystem + "file": StoragePlatform.LOCAL, + # HDFS + "hdfs": StoragePlatform.HDFS, +} + + +class StoragePathParser: + """Parser for storage paths with platform-specific logic""" + + @staticmethod + def parse_storage_location(location: str) -> Optional[Tuple[StoragePlatform, str]]: + """ + Parse a storage location into platform and normalized path. + + Args: + location: Storage location URI (e.g., s3://bucket/path, abfss://container@account.dfs.core.windows.net/path) + + Returns: + Tuple of (StoragePlatform, normalized_path) if valid, None if invalid + """ + + try: + # Handle special case for local files with no scheme + if location.startswith("/"): + return StoragePlatform.LOCAL, location + + # Parse the URI + parsed = urlparse(location) + scheme = parsed.scheme.lower() + + if not scheme: + return None + + # Look up the platform + platform = STORAGE_SCHEME_MAPPING.get(scheme) + if not platform: + return None + + # Get normalized path based on platform + if platform == StoragePlatform.S3: + # For S3, combine bucket and path + path = f"{parsed.netloc}/{parsed.path.lstrip('/')}" + + elif platform == StoragePlatform.AZURE: + if scheme in ("abfs", "abfss"): + # Format: abfss://container@account.dfs.core.windows.net/path + container = parsed.netloc.split("@")[0] + path = f"{container}/{parsed.path.lstrip('/')}" + else: + # Handle other Azure schemes + path = f"{parsed.netloc}/{parsed.path.lstrip('/')}" + + elif platform == StoragePlatform.GCS: + # For GCS, combine bucket and path + path = f"{parsed.netloc}/{parsed.path.lstrip('/')}" + + elif platform == StoragePlatform.DBFS: + # For DBFS, use path as-is + path = parsed.path.lstrip("/") + + elif platform == StoragePlatform.LOCAL: + # For local files, use full path + path = f"{parsed.netloc}/{parsed.path.lstrip('/')}" + + elif platform == StoragePlatform.HDFS: + # For HDFS, use full path + path = f"{parsed.netloc}/{parsed.path.lstrip('/')}" + + else: + return None + + # Clean up the path + path = path.rstrip("/") # Remove trailing slashes + path = re.sub(r"/+", "/", path) # Normalize multiple slashes + path = f"/{path}" + + return platform, path + + except Exception as exp: + logger.warning(f"Failed to parse storage location {location}: {exp}") + return None + + @staticmethod + def get_platform_name(platform: StoragePlatform) -> str: + """Get the platform name to use in URNs""" + + platform_names = { + StoragePlatform.S3: "s3", + StoragePlatform.AZURE: "adls", + StoragePlatform.GCS: "gcs", + StoragePlatform.DBFS: "dbfs", + StoragePlatform.LOCAL: "file", + StoragePlatform.HDFS: "hdfs", + } + return platform_names[platform] + + +class HiveStorageLineageConfig: + """Configuration for Hive storage lineage.""" + + def __init__( + self, + emit_storage_lineage: bool, + hive_storage_lineage_direction: str, + include_column_lineage: bool, + storage_platform_instance: Optional[str], + ): + if hive_storage_lineage_direction.lower() not 
in ["upstream", "downstream"]: + raise ValueError( + "hive_storage_lineage_direction must be either upstream or downstream" + ) + + self.emit_storage_lineage = emit_storage_lineage + self.hive_storage_lineage_direction = hive_storage_lineage_direction.lower() + self.include_column_lineage = include_column_lineage + self.storage_platform_instance = storage_platform_instance + + +@dataclass +class HiveStorageSourceReport: + """Report for tracking storage lineage statistics""" + + storage_locations_scanned: int = 0 + filtered_locations: List[str] = Field(default_factory=list) + failed_locations: List[str] = Field(default_factory=list) + + def report_location_scanned(self) -> None: + self.storage_locations_scanned += 1 + + def report_location_filtered(self, location: str) -> None: + self.filtered_locations.append(location) + + def report_location_failed(self, location: str) -> None: + self.failed_locations.append(location) + + +class HiveStorageLineage: + """Handles storage lineage for Hive tables""" + + def __init__( + self, + config: HiveStorageLineageConfig, + env: str, + convert_urns_to_lowercase: bool = False, + ): + self.config = config + self.env = env + self.convert_urns_to_lowercase = convert_urns_to_lowercase + self.report = HiveStorageSourceReport() + + def _make_dataset_platform_instance( + self, + platform: str, + instance: Optional[str], + ) -> DataPlatformInstanceClass: + """Create DataPlatformInstance aspect""" + + return DataPlatformInstanceClass( + platform=make_data_platform_urn(platform), + instance=make_dataplatform_instance_urn(platform, instance) + if instance + else None, + ) + + def _make_storage_dataset_urn( + self, + storage_location: str, + ) -> Optional[Tuple[str, str]]: + """ + Create storage dataset URN from location. + Returns tuple of (urn, platform) if successful, None otherwise. 
+ """ + + platform_instance = None + storage_info = StoragePathParser.parse_storage_location(storage_location) + if not storage_info: + logger.debug(f"Could not parse storage location: {storage_location}") + return None + + platform, path = storage_info + platform_name = StoragePathParser.get_platform_name(platform) + + if self.convert_urns_to_lowercase: + platform_name = platform_name.lower() + path = path.lower() + if self.config.storage_platform_instance: + platform_instance = self.config.storage_platform_instance.lower() + + try: + storage_urn = make_dataset_urn_with_platform_instance( + platform=platform_name, + name=path, + env=self.env, + platform_instance=platform_instance, + ) + return storage_urn, platform_name + except Exception as exp: + logger.error(f"Failed to create URN for {platform_name}:{path}: {exp}") + return None + + def _get_fine_grained_lineages( + self, + dataset_urn: str, + storage_urn: str, + dataset_schema: SchemaMetadataClass, + storage_schema: SchemaMetadataClass, + ) -> Iterable[FineGrainedLineageClass]: + """Generate column-level lineage between dataset and storage""" + + if not self.config.include_column_lineage: + return + + for dataset_field in dataset_schema.fields: + dataset_path = dataset_field.fieldPath + + # Find matching field in storage schema + matching_field = next( + (f for f in storage_schema.fields if f.fieldPath == dataset_path), + None, + ) + + if matching_field: + if self.config.hive_storage_lineage_direction == "upstream": + yield FineGrainedLineageClass( + upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, + upstreams=[ + make_schema_field_urn( + parent_urn=storage_urn, + field_path=matching_field.fieldPath, + ) + ], + downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, + downstreams=[ + make_schema_field_urn( + parent_urn=dataset_urn, + field_path=dataset_path, + ) + ], + ) + else: + yield FineGrainedLineageClass( + upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, + upstreams=[ + make_schema_field_urn( + parent_urn=dataset_urn, + field_path=dataset_path, + ) + ], + downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, + downstreams=[ + make_schema_field_urn( + parent_urn=storage_urn, + field_path=matching_field.fieldPath, + ) + ], + ) + + def _create_lineage_mcp( + self, + source_urn: str, + target_urn: str, + fine_grained_lineages: Optional[Iterable[FineGrainedLineageClass]] = None, + ) -> Iterable[MetadataWorkUnit]: + """Create lineage MCP between source and target datasets""" + + lineages_list = ( + list(fine_grained_lineages) if fine_grained_lineages is not None else None + ) + + upstream_lineage = UpstreamLineageClass( + upstreams=[ + UpstreamClass(dataset=source_urn, type=DatasetLineageTypeClass.COPY) + ], + fineGrainedLineages=lineages_list, + ) + + yield MetadataWorkUnit( + id=f"{source_urn}-{target_urn}-lineage", + mcp=MetadataChangeProposalWrapper( + entityUrn=target_urn, aspect=upstream_lineage + ), + ) + + def get_storage_dataset_mcp( + self, + storage_location: str, + platform_instance: Optional[str] = None, + schema_metadata: Optional[SchemaMetadataClass] = None, + ) -> Iterable[MetadataWorkUnit]: + """ + Generate MCPs for storage dataset if needed. + This creates the storage dataset entity in DataHub. 
+ """ + + storage_info = StoragePathParser.parse_storage_location( + storage_location, + ) + if not storage_info: + return + + platform, path = storage_info + platform_name = StoragePathParser.get_platform_name(platform) + + if self.convert_urns_to_lowercase: + platform_name = platform_name.lower() + path = path.lower() + if self.config.storage_platform_instance: + platform_instance = self.config.storage_platform_instance.lower() + + try: + storage_urn = make_dataset_urn_with_platform_instance( + platform=platform_name, + name=path, + env=self.env, + platform_instance=platform_instance, + ) + + # Dataset properties + props = DatasetPropertiesClass(name=path) + yield MetadataWorkUnit( + id=f"storage-{storage_urn}-props", + mcp=MetadataChangeProposalWrapper( + entityUrn=storage_urn, + aspect=props, + ), + ) + + # Platform instance + platform_instance_aspect = self._make_dataset_platform_instance( + platform=platform_name, + instance=platform_instance, + ) + yield MetadataWorkUnit( + id=f"storage-{storage_urn}-platform", + mcp=MetadataChangeProposalWrapper( + entityUrn=storage_urn, aspect=platform_instance_aspect + ), + ) + + # Schema if available + if schema_metadata: + storage_schema = SchemaMetadataClass( + schemaName=f"{platform.value}_schema", + platform=f"urn:li:dataPlatform:{platform.value}", + version=0, + fields=schema_metadata.fields, + hash="", + platformSchema=OtherSchemaClass(rawSchema=""), + ) + yield MetadataWorkUnit( + id=f"storage-{storage_urn}-schema", + mcp=MetadataChangeProposalWrapper( + entityUrn=storage_urn, aspect=storage_schema + ), + ) + + except Exception as e: + logger.error( + f"Failed to create storage dataset MCPs for {storage_location}: {e}" + ) + return + + def get_lineage_mcp( + self, + dataset_urn: str, + table: Dict[str, Any], + dataset_schema: Optional[SchemaMetadataClass] = None, + ) -> Iterable[MetadataWorkUnit]: + """ + Generate lineage MCP for a Hive table to its storage location. 
+ + Args: + dataset_urn: URN of the Hive dataset + table: Hive table dictionary containing metadata + dataset_schema: Optional schema metadata for the Hive dataset + + Returns: + MetadataWorkUnit containing the lineage MCP if successful + """ + + platform_instance = None + + if not self.config.emit_storage_lineage: + return + + # Get storage location from table + storage_location = table.get("StorageDescriptor", {}).get("Location") + if not storage_location: + return + + # Create storage dataset URN + storage_info = self._make_storage_dataset_urn(storage_location) + if not storage_info: + self.report.report_location_failed(storage_location) + return + + storage_urn, storage_platform = storage_info + self.report.report_location_scanned() + + if self.config.storage_platform_instance: + platform_instance = self.config.storage_platform_instance.lower() + + # Create storage dataset entity + yield from self.get_storage_dataset_mcp( + storage_location=storage_location, + platform_instance=platform_instance, + schema_metadata=dataset_schema, + ) + + # Get storage schema if available (implement based on storage system) + storage_schema = ( + self._get_storage_schema(storage_location, dataset_schema) + if dataset_schema + else None + ) + + # Generate fine-grained lineage if schemas available + fine_grained_lineages = ( + None + if not (dataset_schema and storage_schema) + else self._get_fine_grained_lineages( + dataset_urn, storage_urn, dataset_schema, storage_schema + ) + ) + + # Create lineage MCP + if self.config.hive_storage_lineage_direction == "upstream": + yield from self._create_lineage_mcp( + source_urn=storage_urn, + target_urn=dataset_urn, + fine_grained_lineages=fine_grained_lineages, + ) + else: + yield from self._create_lineage_mcp( + source_urn=dataset_urn, + target_urn=storage_urn, + fine_grained_lineages=fine_grained_lineages, + ) + + def _get_storage_schema( + self, + storage_location: str, + table_schema: Optional[SchemaMetadataClass] = None, + ) -> Optional[SchemaMetadataClass]: + """ + Get schema metadata for storage location. 
+ Currently supports: + - Delta tables + - Parquet files + - Spark tables + + Returns: + SchemaMetadataClass if schema can be inferred, None otherwise + """ + + if not table_schema: + return None + + storage_info = StoragePathParser.parse_storage_location(storage_location) + if not storage_info: + return None + + platform, _ = storage_info + + return SchemaMetadataClass( + schemaName=f"{platform.value}_schema", + platform=f"urn:li:dataPlatform:{platform.value}", + version=0, + fields=table_schema.fields, + hash="", + platformSchema=OtherSchemaClass(rawSchema=""), + ) + + try: from databricks_dbapi.sqlalchemy_dialects.hive import DatabricksPyhiveDialect from pyhive.sqlalchemy_hive import _type_map @@ -94,8 +617,8 @@ def dbapi_get_columns_patched(self, connection, table_name, schema=None, **kw): DatabricksPyhiveDialect.get_columns = dbapi_get_columns_patched except ModuleNotFoundError: pass -except Exception as e: - logger.warning(f"Failed to patch method due to {e}") +except Exception as exp: + logger.warning(f"Failed to patch method due to {exp}") @reflection.cache # type: ignore @@ -126,10 +649,48 @@ class HiveConfig(TwoTierSQLAlchemyConfig): # defaults scheme: str = Field(default="hive", hidden_from_docs=True) + # Overriding as table location lineage is richer implementation here than with include_table_location_lineage + include_table_location_lineage: bool = Field(default=False, hidden_from_docs=True) + + emit_storage_lineage: bool = Field( + default=False, + description="Whether to emit storage-to-Hive lineage", + ) + hive_storage_lineage_direction: str = Field( + default="upstream", + description="If 'upstream', storage is upstream to Hive. If 'downstream' storage is downstream to Hive", + ) + include_column_lineage: bool = Field( + default=True, + description="When enabled, column-level lineage will be extracted from storage", + ) + storage_platform_instance: Optional[str] = Field( + default=None, + description="Platform instance for the storage system", + ) + @validator("host_port") def clean_host_port(cls, v): return config_clean.remove_protocol(v) + @validator("hive_storage_lineage_direction") + def _validate_direction(cls, v: str) -> str: + """Validate the lineage direction.""" + if v.lower() not in ["upstream", "downstream"]: + raise ValueError( + "storage_lineage_direction must be either upstream or downstream" + ) + return v.lower() + + def get_storage_lineage_config(self) -> HiveStorageLineageConfig: + """Convert base config parameters to HiveStorageLineageConfig""" + return HiveStorageLineageConfig( + emit_storage_lineage=self.emit_storage_lineage, + hive_storage_lineage_direction=self.hive_storage_lineage_direction, + include_column_lineage=self.include_column_lineage, + storage_platform_instance=self.storage_platform_instance, + ) + @platform_name("Hive") @config_class(HiveConfig) @@ -151,12 +712,49 @@ class HiveSource(TwoTierSQLAlchemySource): def __init__(self, config, ctx): super().__init__(config, ctx, "hive") + self.storage_lineage = HiveStorageLineage( + config=config.get_storage_lineage_config(), + env=config.env, + convert_urns_to_lowercase=config.convert_urns_to_lowercase, + ) @classmethod def create(cls, config_dict, ctx): config = HiveConfig.parse_obj(config_dict) return cls(config, ctx) + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + """Generate workunits for tables and their storage lineage.""" + for wu in super().get_workunits_internal(): + yield wu + + if not isinstance(wu, MetadataWorkUnit): + continue + + # Get dataset URN and required 
aspects using workunit methods + try: + dataset_urn = wu.get_urn() + dataset_props = wu.get_aspect_of_type(DatasetPropertiesClass) + schema_metadata = wu.get_aspect_of_type(SchemaMetadataClass) + except Exception as exp: + logger.warning(f"Failed to process workunit {wu.id}: {exp}") + continue + + # Only proceed if we have the necessary properties + if dataset_props and dataset_props.customProperties: + table = { + "StorageDescriptor": { + "Location": dataset_props.customProperties.get("Location") + } + } + + if table.get("StorageDescriptor", {}).get("Location"): + yield from self.storage_lineage.get_lineage_mcp( + dataset_urn=dataset_urn, + table=table, + dataset_schema=schema_metadata, + ) + def get_schema_names(self, inspector): assert isinstance(self.config, HiveConfig) # This condition restricts the ingestion to the specified database. @@ -173,7 +771,7 @@ def get_schema_fields_for_column( pk_constraints: Optional[Dict[Any, Any]] = None, partition_keys: Optional[List[str]] = None, tags: Optional[List[str]] = None, - ) -> List[SchemaField]: + ) -> List[SchemaFieldClass]: fields = super().get_schema_fields_for_column( dataset_name, column, From 494c522405830aaec181bcd2d61b2cfe9a53f155 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Sun, 22 Dec 2024 13:21:41 +0100 Subject: [PATCH 31/35] fix(ingest/mssql): add container dataflow/ datajob entities (#12194) --- .../ingestion/source/sql/mssql/job_models.py | 26 +++ .../ingestion/source/sql/mssql/source.py | 10 + .../golden_mces_mssql_no_db_to_file.json | 207 ++++++++++++++++- .../golden_mces_mssql_no_db_with_filter.json | 162 ++++++++++++- .../golden_mces_mssql_to_file.json | 219 +++++++++++++++++- ...golden_mces_mssql_with_lower_case_urn.json | 207 ++++++++++++++++- 6 files changed, 795 insertions(+), 36 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py index d3941e7add0fd0..0cd62611519285 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py @@ -7,7 +7,9 @@ make_data_platform_urn, make_dataplatform_instance_urn, ) +from datahub.emitter.mcp_builder import DatabaseKey from datahub.metadata.schema_classes import ( + ContainerClass, DataFlowInfoClass, DataJobInfoClass, DataJobInputOutputClass, @@ -210,6 +212,18 @@ def as_datajob_info_aspect(self) -> DataJobInfoClass: status=self.status, ) + @property + def as_container_aspect(self) -> ContainerClass: + databaseKey = DatabaseKey( + platform=self.entity.flow.orchestrator, + instance=self.entity.flow.platform_instance + if self.entity.flow.platform_instance + else None, + env=self.entity.flow.env, + database=self.entity.flow.db, + ) + return ContainerClass(container=databaseKey.as_urn()) + @property def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]: if self.entity.flow.platform_instance: @@ -257,6 +271,18 @@ def as_dataflow_info_aspect(self) -> DataFlowInfoClass: externalUrl=self.external_url, ) + @property + def as_container_aspect(self) -> ContainerClass: + databaseKey = DatabaseKey( + platform=self.entity.orchestrator, + instance=self.entity.platform_instance + if self.entity.platform_instance + else None, + env=self.entity.env, + database=self.entity.db, + ) + return ContainerClass(container=databaseKey.as_urn()) + @property def as_maybe_platform_instance_aspect(self) -> 
Optional[DataPlatformInstanceClass]: if self.entity.platform_instance: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py index 9d8b67041998ce..547adcc8eccc9e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py @@ -639,6 +639,11 @@ def construct_job_workunits( aspect=data_job.as_datajob_info_aspect, ).as_workunit() + yield MetadataChangeProposalWrapper( + entityUrn=data_job.urn, + aspect=data_job.as_container_aspect, + ).as_workunit() + data_platform_instance_aspect = data_job.as_maybe_platform_instance_aspect if data_platform_instance_aspect: yield MetadataChangeProposalWrapper( @@ -662,6 +667,11 @@ def construct_flow_workunits( aspect=data_flow.as_dataflow_info_aspect, ).as_workunit() + yield MetadataChangeProposalWrapper( + entityUrn=data_flow.urn, + aspect=data_flow.as_container_aspect, + ).as_workunit() + data_platform_instance_aspect = data_flow.as_maybe_platform_instance_aspect if data_platform_instance_aspect: yield MetadataChangeProposalWrapper( diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json index 72dcda25c1296c..720ef0b3929453 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json @@ -105,6 +105,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -113,11 +150,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "c2d77890-83ba-435f-879b-1c77fa38dd47", + "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-05 16:44:43.910000", - "date_modified": "2024-12-05 16:44:44.043000", + "date_created": "2024-12-20 15:15:24.483000", + "date_modified": "2024-12-20 15:15:24.653000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -136,6 +173,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": 
"urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -154,6 +207,27 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -2103,8 +2177,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" }, "name": "PersonsView", "tags": [] @@ -2269,6 +2343,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2282,8 +2393,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-05 16:44:43.800000", - "date_modified": "2024-12-05 16:44:43.800000" + "date_created": "2024-12-20 15:15:24.290000", + "date_modified": "2024-12-20 15:15:24.290000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2298,6 +2409,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + 
"json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2310,8 +2458,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-05 16:44:43.803000", - "date_modified": "2024-12-05 16:44:43.803000" + "date_created": "2024-12-20 15:15:24.300000", + "date_modified": "2024-12-20 15:15:24.300000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2326,6 +2474,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", @@ -4427,8 +4612,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n" }, "name": "View1", "tags": [] diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json index 0df89ff1eb94d7..cf3abbfc62997a 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json @@ -105,6 +105,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": 
"dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -113,11 +150,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "c2d77890-83ba-435f-879b-1c77fa38dd47", + "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-05 16:44:43.910000", - "date_modified": "2024-12-05 16:44:44.043000", + "date_created": "2024-12-20 15:15:24.483000", + "date_modified": "2024-12-20 15:15:24.653000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -136,6 +173,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -154,6 +207,27 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -2103,8 +2177,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" }, "name": "PersonsView", "tags": [] @@ -2269,6 +2343,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": 
"no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2282,8 +2393,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-05 16:44:43.800000", - "date_modified": "2024-12-05 16:44:43.800000" + "date_created": "2024-12-20 15:15:24.290000", + "date_modified": "2024-12-20 15:15:24.290000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2298,6 +2409,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json index b36188405e7e11..c2289f954a36ee 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json @@ -112,6 +112,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", @@ -129,6 +145,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": 
"urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -137,11 +178,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "b8907be7-52f5-4df4-a870-f4fe0679ec45", + "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-19 12:34:45.843000", - "date_modified": "2024-12-19 12:34:46.017000", + "date_created": "2024-12-20 15:15:24.483000", + "date_modified": "2024-12-20 15:15:24.653000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -160,6 +201,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -195,6 +252,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", @@ -2502,6 +2584,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", @@ -2519,6 +2617,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": 
"urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2532,8 +2655,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-19 12:34:45.660000", - "date_modified": "2024-12-19 12:34:45.660000" + "date_created": "2024-12-20 15:15:24.290000", + "date_modified": "2024-12-20 15:15:24.290000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2548,6 +2671,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2565,6 +2704,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2577,8 +2741,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-19 12:34:45.667000", - "date_modified": "2024-12-19 12:34:45.667000" + "date_created": "2024-12-20 15:15:24.300000", + "date_modified": "2024-12-20 15:15:24.300000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2593,6 +2757,22 @@ "lastRunId": "no-run-id-provided" } 
}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2610,6 +2790,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json index ebcadcc11dcbfa..4db18dae27b7e9 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json @@ -105,6 +105,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -113,11 +150,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "4130c37d-146c-43da-a671-dd9a413a44b3", + "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-11-22 12:58:03.260000", - "date_modified": "2024-11-22 12:58:03.440000", + "date_created": "2024-12-20 15:15:24.483000", + "date_modified": "2024-12-20 15:15:24.653000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -136,6 +173,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + 
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -154,6 +207,27 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -2103,8 +2177,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" }, "name": "PersonsView", "tags": [] @@ -2269,6 +2343,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2282,8 +2393,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-11-22 12:58:03.137000", - "date_modified": "2024-11-22 12:58:03.137000" + "date_created": "2024-12-20 15:15:24.290000", + "date_modified": "2024-12-20 15:15:24.290000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2298,6 +2409,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + 
"entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2310,8 +2458,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-11-22 12:58:03.140000", - "date_modified": "2024-11-22 12:58:03.140000" + "date_created": "2024-12-20 15:15:24.300000", + "date_modified": "2024-12-20 15:15:24.300000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2326,6 +2474,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", @@ -4427,8 +4612,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n" }, "name": "View1", "tags": [] From ff262bc65e7ab3e067f51a412cfb40db6e726fea Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Sun, 22 Dec 2024 18:24:18 +0530 Subject: [PATCH 32/35] Revert "fix(mssql): adds missing containers for dataflow and datajob entities, required for browse paths v2 generation" (#12201) --- .../ingestion/source/sql/mssql/job_models.py | 26 --- 
.../ingestion/source/sql/mssql/source.py | 10 - .../golden_mces_mssql_no_db_to_file.json | 207 +---------------- .../golden_mces_mssql_no_db_with_filter.json | 162 +------------ .../golden_mces_mssql_to_file.json | 219 +----------------- ...golden_mces_mssql_with_lower_case_urn.json | 207 +---------------- 6 files changed, 36 insertions(+), 795 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py index 0cd62611519285..d3941e7add0fd0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py @@ -7,9 +7,7 @@ make_data_platform_urn, make_dataplatform_instance_urn, ) -from datahub.emitter.mcp_builder import DatabaseKey from datahub.metadata.schema_classes import ( - ContainerClass, DataFlowInfoClass, DataJobInfoClass, DataJobInputOutputClass, @@ -212,18 +210,6 @@ def as_datajob_info_aspect(self) -> DataJobInfoClass: status=self.status, ) - @property - def as_container_aspect(self) -> ContainerClass: - databaseKey = DatabaseKey( - platform=self.entity.flow.orchestrator, - instance=self.entity.flow.platform_instance - if self.entity.flow.platform_instance - else None, - env=self.entity.flow.env, - database=self.entity.flow.db, - ) - return ContainerClass(container=databaseKey.as_urn()) - @property def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]: if self.entity.flow.platform_instance: @@ -271,18 +257,6 @@ def as_dataflow_info_aspect(self) -> DataFlowInfoClass: externalUrl=self.external_url, ) - @property - def as_container_aspect(self) -> ContainerClass: - databaseKey = DatabaseKey( - platform=self.entity.orchestrator, - instance=self.entity.platform_instance - if self.entity.platform_instance - else None, - env=self.entity.env, - database=self.entity.db, - ) - return ContainerClass(container=databaseKey.as_urn()) - @property def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]: if self.entity.platform_instance: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py index 547adcc8eccc9e..9d8b67041998ce 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py @@ -639,11 +639,6 @@ def construct_job_workunits( aspect=data_job.as_datajob_info_aspect, ).as_workunit() - yield MetadataChangeProposalWrapper( - entityUrn=data_job.urn, - aspect=data_job.as_container_aspect, - ).as_workunit() - data_platform_instance_aspect = data_job.as_maybe_platform_instance_aspect if data_platform_instance_aspect: yield MetadataChangeProposalWrapper( @@ -667,11 +662,6 @@ def construct_flow_workunits( aspect=data_flow.as_dataflow_info_aspect, ).as_workunit() - yield MetadataChangeProposalWrapper( - entityUrn=data_flow.urn, - aspect=data_flow.as_container_aspect, - ).as_workunit() - data_platform_instance_aspect = data_flow.as_maybe_platform_instance_aspect if data_platform_instance_aspect: yield MetadataChangeProposalWrapper( diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json index 720ef0b3929453..72dcda25c1296c 100644 --- 
a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json @@ -105,43 +105,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -150,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", + "job_id": "c2d77890-83ba-435f-879b-1c77fa38dd47", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-20 15:15:24.483000", - "date_modified": "2024-12-20 15:15:24.653000", + "date_created": "2024-12-05 16:44:43.910000", + "date_modified": "2024-12-05 16:44:44.043000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -173,22 +136,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -207,27 +154,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -2177,8 +2103,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "is_view": "True", - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", + "is_view": "True" }, "name": "PersonsView", "tags": [] @@ -2343,43 +2269,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": 
"urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2393,8 +2282,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-20 15:15:24.290000", - "date_modified": "2024-12-20 15:15:24.290000" + "date_created": "2024-12-05 16:44:43.800000", + "date_modified": "2024-12-05 16:44:43.800000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2409,43 +2298,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2458,8 +2310,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-20 15:15:24.300000", - "date_modified": "2024-12-20 15:15:24.300000" + "date_created": "2024-12-05 16:44:43.803000", + "date_modified": 
"2024-12-05 16:44:43.803000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2474,43 +2326,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", @@ -4612,8 +4427,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "is_view": "True", - "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n" + "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", + "is_view": "True" }, "name": "View1", "tags": [] diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json index cf3abbfc62997a..0df89ff1eb94d7 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json @@ -105,43 +105,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -150,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", + "job_id": "c2d77890-83ba-435f-879b-1c77fa38dd47", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-20 15:15:24.483000", - "date_modified": "2024-12-20 15:15:24.653000", + "date_created": "2024-12-05 16:44:43.910000", + "date_modified": "2024-12-05 16:44:44.043000", "step_id": 
"1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -173,22 +136,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -207,27 +154,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -2177,8 +2103,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "is_view": "True", - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", + "is_view": "True" }, "name": "PersonsView", "tags": [] @@ -2343,43 +2269,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2393,8 +2282,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-20 15:15:24.290000", - "date_modified": "2024-12-20 15:15:24.290000" + "date_created": "2024-12-05 16:44:43.800000", + "date_modified": "2024-12-05 16:44:43.800000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2409,43 +2298,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": 
"urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json index c2289f954a36ee..b36188405e7e11 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json @@ -112,22 +112,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", @@ -145,31 +129,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" - }, - { - "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", - "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -178,11 +137,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", + "job_id": "b8907be7-52f5-4df4-a870-f4fe0679ec45", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-20 15:15:24.483000", - "date_modified": "2024-12-20 15:15:24.653000", + "date_created": "2024-12-19 12:34:45.843000", + "date_modified": "2024-12-19 12:34:46.017000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -201,22 +160,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - }, - 
"systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -252,31 +195,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" - }, - { - "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", - "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", @@ -2584,22 +2502,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", @@ -2617,31 +2519,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" - }, - { - "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", - "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2655,8 +2532,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-20 15:15:24.290000", - "date_modified": "2024-12-20 15:15:24.290000" + "date_created": "2024-12-19 12:34:45.660000", + "date_modified": "2024-12-19 12:34:45.660000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2671,22 +2548,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2704,31 +2565,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" - }, - { - "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", - "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2741,8 +2577,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-20 15:15:24.300000", - "date_modified": "2024-12-20 15:15:24.300000" + "date_created": "2024-12-19 12:34:45.667000", + "date_modified": "2024-12-19 12:34:45.667000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2757,22 +2593,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2790,31 +2610,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" - }, - { - "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", - "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": 
"urn:li:container:5631370915311469374ef3cb5f0ebbf0", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json index 4db18dae27b7e9..ebcadcc11dcbfa 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json @@ -105,43 +105,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -150,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", + "job_id": "4130c37d-146c-43da-a671-dd9a413a44b3", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-20 15:15:24.483000", - "date_modified": "2024-12-20 15:15:24.653000", + "date_created": "2024-11-22 12:58:03.260000", + "date_modified": "2024-11-22 12:58:03.440000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -173,22 +136,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -207,27 +154,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -2177,8 +2103,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "is_view": "True", - "view_definition": "CREATE VIEW 
Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", + "is_view": "True" }, "name": "PersonsView", "tags": [] @@ -2343,43 +2269,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2393,8 +2282,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-20 15:15:24.290000", - "date_modified": "2024-12-20 15:15:24.290000" + "date_created": "2024-11-22 12:58:03.137000", + "date_modified": "2024-11-22 12:58:03.137000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2409,43 +2298,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2458,8 +2310,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE 
DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-20 15:15:24.300000", - "date_modified": "2024-12-20 15:15:24.300000" + "date_created": "2024-11-22 12:58:03.140000", + "date_modified": "2024-11-22 12:58:03.140000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2474,43 +2326,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", @@ -4612,8 +4427,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "is_view": "True", - "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n" + "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", + "is_view": "True" }, "name": "View1", "tags": [] From 73dce9e4180d7beef1ea6c9a7c9eeedbc551d18a Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Sun, 22 Dec 2024 10:28:19 -0600 Subject: [PATCH 33/35] =?UTF-8?q?chore(bump):=20bump=20node=20version=20lo?= =?UTF-8?q?ng=20term=20support=20release=20(build=20time=20=E2=80=A6=20(#1?= =?UTF-8?q?2199)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/build-and-test.yml | 2 +- .github/workflows/docker-unified.yml | 2 +- datahub-web-react/build.gradle | 3 +-- datahub-web-react/package.json | 2 +- docs-website/build.gradle | 2 +- smoke-test/build.gradle | 2 +- 6 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 1b10fe6e74372b..98071b536a336a 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -110,7 +110,7 @@ jobs: run: | ./gradlew :datahub-frontend:build :datahub-web-react:build --parallel env: - NODE_OPTIONS: "--max-old-space-size=3072" + NODE_OPTIONS: "--max-old-space-size=4096" - name: Gradle compile (jdk8) for legacy Spark if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }} run: | diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 16a2d29e9fd85e..03a9b3afc3bc58 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -446,7 +446,7 @@ jobs: ./gradlew :datahub-frontend:dist -x test -x yarnTest -x yarnLint --parallel mv 
./datahub-frontend/build/distributions/datahub-frontend-*.zip datahub-frontend.zip env: - NODE_OPTIONS: "--max-old-space-size=3072" + NODE_OPTIONS: "--max-old-space-size=4096" - name: Build and push uses: ./.github/actions/docker-custom-build-and-push with: diff --git a/datahub-web-react/build.gradle b/datahub-web-react/build.gradle index b9fffce173c5c4..bf1aa401e3f560 100644 --- a/datahub-web-react/build.gradle +++ b/datahub-web-react/build.gradle @@ -16,7 +16,7 @@ node { } // Version of node to use. - version = '21.2.0' + version = '22.12.0' // Version of Yarn to use. yarnVersion = '1.22.22' @@ -93,7 +93,6 @@ task yarnLintFix(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) { } task yarnBuild(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) { - environment = [NODE_OPTIONS: "--max-old-space-size=3072 --openssl-legacy-provider"] args = ['run', 'build'] outputs.cacheIf { true } diff --git a/datahub-web-react/package.json b/datahub-web-react/package.json index 31c10804482f0c..2d1d667a89f14a 100644 --- a/datahub-web-react/package.json +++ b/datahub-web-react/package.json @@ -90,7 +90,7 @@ "analyze": "source-map-explorer 'dist/assets/*.js'", "start": "yarn run generate && vite", "ec2-dev": "yarn run generate && CI=true;export CI;vite", - "build": "yarn run generate && NODE_OPTIONS='--max-old-space-size=3072 --openssl-legacy-provider' CI=false vite build", + "build": "yarn run generate && NODE_OPTIONS='--max-old-space-size=4096 --openssl-legacy-provider' CI=false vite build", "test": "vitest", "generate": "graphql-codegen --config codegen.yml", "lint": "eslint . --ext .ts,.tsx --quiet && yarn format-check && yarn type-check", diff --git a/docs-website/build.gradle b/docs-website/build.gradle index 797863d2019fbd..1be790695e87e6 100644 --- a/docs-website/build.gradle +++ b/docs-website/build.gradle @@ -14,7 +14,7 @@ node { } // Version of node to use. - version = '21.2.0' + version = '22.12.0' // Version of Yarn to use. yarnVersion = '1.22.22' diff --git a/smoke-test/build.gradle b/smoke-test/build.gradle index def3e814b2ba0a..73ecdcb08ea149 100644 --- a/smoke-test/build.gradle +++ b/smoke-test/build.gradle @@ -16,7 +16,7 @@ node { } // Version of node to use. - version = '21.2.0' + version = '22.12.0' // Version of Yarn to use. 
yarnVersion = '1.22.22' From 0562c7a190c4548e29c7845fa44e9adf0248e4de Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Mon, 23 Dec 2024 16:56:54 +0530 Subject: [PATCH 34/35] fix(ingest): exclude aspect from migration (#12206) --- .../src/datahub/ingestion/source/datahub/config.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py index a3304334cb1ebc..cd3c2146e6d848 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py @@ -14,6 +14,17 @@ DEFAULT_DATABASE_TABLE_NAME = "metadata_aspect_v2" DEFAULT_KAFKA_TOPIC_NAME = "MetadataChangeLog_Timeseries_v1" DEFAULT_DATABASE_BATCH_SIZE = 10_000 +DEFAULT_EXCLUDE_ASPECTS = { + "dataHubIngestionSourceKey", + "dataHubIngestionSourceInfo", + "datahubIngestionRunSummary", + "datahubIngestionCheckpoint", + "dataHubSecretKey", + "dataHubSecretValue", + "globalSettingsKey", + "globalSettingsInfo", + "testResults", +} class DataHubSourceConfig(StatefulIngestionConfigBase): @@ -44,7 +55,7 @@ class DataHubSourceConfig(StatefulIngestionConfigBase): ) exclude_aspects: Set[str] = Field( - default_factory=set, + default=DEFAULT_EXCLUDE_ASPECTS, description="Set of aspect names to exclude from ingestion", ) From d06980f6f3421ac5d3a3fc21d5c15f3e3057338f Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Mon, 23 Dec 2024 19:11:40 +0530 Subject: [PATCH 35/35] fix(ingest/snowflake): handle empty snowflake column upstreams (#12207) --- .../source/snowflake/snowflake_lineage_v2.py | 6 ++--- .../unit/snowflake/test_snowflake_source.py | 24 +++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 69f28a0e6e595a..b815a6584379ac 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -4,7 +4,7 @@ from datetime import datetime from typing import Any, Collection, Iterable, List, Optional, Set, Tuple, Type -from pydantic import BaseModel, validator +from pydantic import BaseModel, Field, validator from datahub.configuration.datetimes import parse_absolute_time from datahub.ingestion.api.closeable import Closeable @@ -72,8 +72,8 @@ class ColumnUpstreamJob(BaseModel): class ColumnUpstreamLineage(BaseModel): - column_name: str - upstreams: List[ColumnUpstreamJob] + column_name: Optional[str] + upstreams: List[ColumnUpstreamJob] = Field(default_factory=list) class UpstreamTableNode(BaseModel): diff --git a/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py index c735feb5396086..2ff85a08f052f9 100644 --- a/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py @@ -18,6 +18,7 @@ DEFAULT_TEMP_TABLES_PATTERNS, SnowflakeV2Config, ) +from datahub.ingestion.source.snowflake.snowflake_lineage_v2 import UpstreamLineageEdge from datahub.ingestion.source.snowflake.snowflake_query import ( SnowflakeQuery, create_deny_regex_sql_filter, @@ -664,3 +665,26 @@ def test_create_snowsight_base_url_ap_northeast_1(): def 
test_snowflake_utils() -> None: assert_doctest(datahub.ingestion.source.snowflake.snowflake_utils) + + +def test_snowflake_query_result_parsing(): + db_row = { + "DOWNSTREAM_TABLE_NAME": "db.schema.downstream_table", + "DOWNSTREAM_TABLE_DOMAIN": "Table", + "UPSTREAM_TABLES": [ + { + "query_id": "01b92f61-0611-c826-000d-0103cf9b5db7", + "upstream_object_domain": "Table", + "upstream_object_name": "db.schema.upstream_table", + } + ], + "UPSTREAM_COLUMNS": [{}], + "QUERIES": [ + { + "query_id": "01b92f61-0611-c826-000d-0103cf9b5db7", + "query_text": "Query test", + "start_time": "2022-12-01 19:56:34", + } + ], + } + assert UpstreamLineageEdge.parse_obj(db_row)
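
For context on the final hunk: making column_name optional and giving upstreams a default_factory is what allows rows whose UPSTREAM_COLUMNS contain empty objects (the [{}] case exercised in the new test) to pass pydantic validation instead of raising. The sketch below is illustrative only and is not part of the patch; it substitutes Any for the nested ColumnUpstreamJob type to stay self-contained, and it assumes pydantic v1 semantics, where an Optional field with no explicit default is treated as defaulting to None.

# Minimal sketch (not part of the patch): why Optional + default_factory lets
# an empty UPSTREAM_COLUMNS entry ({}) parse cleanly.
from typing import Any, List, Optional

from pydantic import BaseModel, Field


class ColumnUpstreamLineage(BaseModel):
    # Mirrors the relaxed model in the patch; the nested ColumnUpstreamJob type
    # is replaced by Any here purely to keep the sketch self-contained.
    column_name: Optional[str]
    upstreams: List[Any] = Field(default_factory=list)


# An empty per-column entry no longer raises a ValidationError for missing fields.
entry = ColumnUpstreamLineage.parse_obj({})
assert entry.column_name is None
assert entry.upstreams == []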