This is an automated email from the ASF dual-hosted git repository. sarath pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/atlas.git
The following commit(s) were added to refs/heads/branch-2.0 by this push: new 3c60783 ATLAS-4324: FS entity created for load data inpath is created as shell entity 3c60783 is described below commit 3c607836173472dfa05e60ff102c50133e02b709 Author: Radhika Kundam <rkun...@cloudera.com> AuthorDate: Thu Jun 3 00:10:12 2021 -0700 ATLAS-4324: FS entity created for load data inpath is created as shell entity Signed-off-by: Sarath Subramanian <sar...@apache.org> (cherry picked from commit a9e528bca9277707a1847d110c33288188a8fff5) --- .../java/org/apache/atlas/hive/hook/HiveHook.java | 1 + .../atlas/hive/hook/utils/HiveDDLEntityFilter.java | 47 +++++++++++--- .../hive/hook/utils/ActiveEntityFilterTest.java | 34 +++++++++- .../src/test/resources/json/hs2-create-db-v2.json | 24 +++++++ ...eate-db-with-no-pathentities-to-retain-v2.json} | 0 ...2-create-db-with-no-pathentities-to-retain.json | 73 ++++++++++++++++++++++ .../test/resources/json/hs2-create-table-v2.json | 17 ++++- .../test/resources/json/hs2-load-inpath-v2.json | 18 ++++++ ...d-inpath-with-no-pathentities-to-retain-v2.json | 4 ++ ...load-inpath-with-no-pathentities-to-retain.json | 18 ++++++ .../src/test/resources/json/hs2-load-inpath.json | 18 ++++++ 11 files changed, 240 insertions(+), 14 deletions(-) diff --git a/addons/hive-bridge/src/main/java/org/apache/atlas/hive/hook/HiveHook.java b/addons/hive-bridge/src/main/java/org/apache/atlas/hive/hook/HiveHook.java index 5e6785a..9548eda 100644 --- a/addons/hive-bridge/src/main/java/org/apache/atlas/hive/hook/HiveHook.java +++ b/addons/hive-bridge/src/main/java/org/apache/atlas/hive/hook/HiveHook.java @@ -70,6 +70,7 @@ public class HiveHook extends AtlasHook implements ExecuteWithHookContext { public static final String HOOK_HIVE_TABLE_PRUNE_PATTERN = CONF_PREFIX + "hive_table.prune.pattern"; public static final String HOOK_HIVE_TABLE_CACHE_SIZE = CONF_PREFIX + "hive_table.cache.size"; public static final String HOOK_HIVE_IGNORE_DDL_OPERATIONS = CONF_PREFIX + "hs2.ignore.ddl.operations"; + public static final String HOOK_HIVE_FILTER_ENTITY_TYPES_TO_RETAIN = CONF_PREFIX + "hs2.filter.entity.types.to.retain"; public static final String DEFAULT_HOST_NAME = "localhost"; private static final Map<String, HiveOperation> OPERATION_MAP = new HashMap<>(); diff --git a/addons/hive-bridge/src/main/java/org/apache/atlas/hive/hook/utils/HiveDDLEntityFilter.java b/addons/hive-bridge/src/main/java/org/apache/atlas/hive/hook/utils/HiveDDLEntityFilter.java index 2b39e81..9163c47 100644 --- a/addons/hive-bridge/src/main/java/org/apache/atlas/hive/hook/utils/HiveDDLEntityFilter.java +++ b/addons/hive-bridge/src/main/java/org/apache/atlas/hive/hook/utils/HiveDDLEntityFilter.java @@ -18,15 +18,20 @@ package org.apache.atlas.hive.hook.utils; import com.google.common.annotations.VisibleForTesting; +import org.apache.atlas.ApplicationProperties; +import org.apache.atlas.hive.hook.HiveHook; import org.apache.atlas.hive.hook.events.BaseHiveEvent; import org.apache.atlas.model.instance.AtlasEntity; import org.apache.atlas.model.instance.AtlasObjectId; -import org.apache.atlas.model.instance.AtlasRelatedObjectId; import org.apache.atlas.model.notification.HookNotification; +import org.apache.atlas.utils.AtlasPathExtractorUtil; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.collections.MapUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; @@ -36,14 +41,38 @@ import java.util.function.Function; import java.util.stream.Collectors; public class HiveDDLEntityFilter implements EntityFilter { + private static final Logger LOG = LoggerFactory.getLogger(HiveDDLEntityFilter.class); + + private static final Set<String> defaultPathTypes = new HashSet<String>() {{ + add(AtlasPathExtractorUtil.HDFS_TYPE_PATH); + add(AtlasPathExtractorUtil.ADLS_GEN2_DIRECTORY); + add(AtlasPathExtractorUtil.GCS_VIRTUAL_DIR); + add(AtlasPathExtractorUtil.AWS_S3_V2_PSEUDO_DIR); + add(AtlasPathExtractorUtil.AWS_S3_PSEUDO_DIR); + }}; + private static final Set<String> typesToRetain = new HashSet<String>() {{ add(BaseHiveEvent.HIVE_TYPE_PROCESS); add(BaseHiveEvent.HIVE_TYPE_PROCESS_EXECUTION); add(BaseHiveEvent.HIVE_TYPE_COLUMN_LINEAGE); add(BaseHiveEvent.HIVE_DB_DDL); add(BaseHiveEvent.HIVE_TABLE_DDL); + addAll(defaultPathTypes); + addAll(getConfiguredTypesToRetain()); }}; + private static List<String> getConfiguredTypesToRetain() { + String[] configuredTypesToRetain = null; + + try { + configuredTypesToRetain = ApplicationProperties.get().getStringArray(HiveHook.HOOK_HIVE_FILTER_ENTITY_TYPES_TO_RETAIN); + } catch (Exception e) { + LOG.error("Failed to load application properties", e); + } + + return configuredTypesToRetain != null ? Arrays.asList(configuredTypesToRetain) : new ArrayList<>(); + } + public List<HookNotification> apply(List<HookNotification> incoming) { if (CollectionUtils.isEmpty(incoming)) { return incoming; @@ -164,15 +193,11 @@ public class HiveDDLEntityFilter implements EntityFilter { } private static void inferObjectTypeResetGuid(Object o) { - if (o instanceof AtlasRelatedObjectId) { - AtlasRelatedObjectId oid = (AtlasRelatedObjectId) o; - if (oid.getUniqueAttributes() != null) { - oid.setGuid(null); - } - } if (o instanceof AtlasObjectId) { - AtlasObjectId oid = (AtlasObjectId) o; - if (oid.getUniqueAttributes() != null) { + AtlasObjectId oid = (AtlasObjectId) o; + String typeName = oid.getTypeName(); + + if (oid.getUniqueAttributes() != null && !defaultPathTypes.contains(typeName)) { oid.setGuid(null); } } else { @@ -181,7 +206,9 @@ public class HiveDDLEntityFilter implements EntityFilter { return; } - if (hm.containsKey(BaseHiveEvent.ATTRIBUTE_UNIQUE_ATTRIBUTES)) { + String typeName = hm.containsKey(AtlasObjectId.KEY_TYPENAME) ? (String) hm.get(AtlasObjectId.KEY_TYPENAME) : null; + + if (hm.containsKey(BaseHiveEvent.ATTRIBUTE_UNIQUE_ATTRIBUTES) && !defaultPathTypes.contains(typeName)) { hm.put(BaseHiveEvent.ATTRIBUTE_GUID, null); } } diff --git a/addons/hive-bridge/src/test/java/org/apache/atlas/hive/hook/utils/ActiveEntityFilterTest.java b/addons/hive-bridge/src/test/java/org/apache/atlas/hive/hook/utils/ActiveEntityFilterTest.java index 6a7af5e..4dde1dc 100644 --- a/addons/hive-bridge/src/test/java/org/apache/atlas/hive/hook/utils/ActiveEntityFilterTest.java +++ b/addons/hive-bridge/src/test/java/org/apache/atlas/hive/hook/utils/ActiveEntityFilterTest.java @@ -35,6 +35,7 @@ import java.util.Map; import java.util.Set; import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotEquals; import static org.testng.Assert.assertNotNull; import static org.testng.Assert.assertTrue; @@ -55,6 +56,9 @@ public class ActiveEntityFilterTest { assertMessageFromFile("hs2-alter-view"); assertMessageFromFile("hs2-drop-table"); assertAtlasEntitiesWithExtInfoFromFile("hs2-create-process"); + assertAtlasEntitiesWithExtInfoFromFile("hs2-load-inpath"); + assertAtlasEntitiesWithExtInfoFromFile("hs2-create-db-with-no-pathentities-to-retain", false); + assertAtlasEntitiesWithExtInfoFromFile("hs2-load-inpath-with-no-pathentities-to-retain", false); } private void assertMessageFromFile(String msgFile) throws IOException { @@ -135,20 +139,31 @@ public class ActiveEntityFilterTest { } private void assertAtlasEntitiesWithExtInfoFromFile(String entityFile) throws IOException { + assertAtlasEntitiesWithExtInfoFromFile(entityFile, true); + } + + private void assertAtlasEntitiesWithExtInfoFromFile(String entityFile, boolean retainPathEntities) throws IOException { AtlasEntity.AtlasEntitiesWithExtInfo incoming = TestResourceFileUtils.readObjectFromJson("", entityFile, AtlasEntity.AtlasEntitiesWithExtInfo.class); AtlasEntity.AtlasEntitiesWithExtInfo expected = TestResourceFileUtils.readObjectFromJson("", entityFile + FILE_SUFFIX, AtlasEntity.AtlasEntitiesWithExtInfo.class); HiveDDLEntityFilter hiveLineageEntityFilter = new HiveDDLEntityFilter(); AtlasEntity.AtlasEntitiesWithExtInfo actual = hiveLineageEntityFilter.apply(incoming); - assertAtlasEntitiesWithExtInfo(actual, expected); + + if (retainPathEntities) { + assertAtlasEntitiesWithExtInfo(actual, expected); + } else { + assertAtlasEntitiesWithNoPathEntitiesToRetain(actual, expected); + } } private void assertAtlasEntitiesWithExtInfo(AtlasEntity.AtlasEntitiesWithExtInfo actual, AtlasEntity.AtlasEntitiesWithExtInfo expected) { assertNotNull(actual); assertNotNull(expected); - assertEquals(actual.getEntities().size(), expected.getEntities().size()); - assertEntity(actual.getEntities(), expected.getEntities()); + if (expected.getEntities() != null && actual.getEntities() != null) { + assertEquals(actual.getEntities().size(), expected.getEntities().size()); + assertEntity(actual.getEntities(), expected.getEntities()); + } assertEquals(MapUtils.isEmpty(actual.getReferredEntities()), MapUtils.isEmpty(expected.getReferredEntities())); if (expected.getReferredEntities() != null && actual.getReferredEntities() != null) { @@ -156,6 +171,19 @@ public class ActiveEntityFilterTest { } } + private void assertAtlasEntitiesWithNoPathEntitiesToRetain(AtlasEntity.AtlasEntitiesWithExtInfo actual, AtlasEntity.AtlasEntitiesWithExtInfo expected) { + assertNotNull(actual); + assertNotNull(expected); + + if (expected.getEntities() != null && actual.getEntities() != null) { + assertNotEquals(actual.getEntities().size(), expected.getEntities().size()); + } + + if (expected.getReferredEntities() != null && actual.getReferredEntities() != null) { + assertNotEquals(actual.getReferredEntities().size(), expected.getReferredEntities().size()); + } + } + private void assertEntity(Map<String, AtlasEntity> actual, Map<String, AtlasEntity> expected) { assertEquals(actual.size(), expected.size()); } diff --git a/addons/hive-bridge/src/test/resources/json/hs2-create-db-v2.json b/addons/hive-bridge/src/test/resources/json/hs2-create-db-v2.json index 42553b5..881ee10 100644 --- a/addons/hive-bridge/src/test/resources/json/hs2-create-db-v2.json +++ b/addons/hive-bridge/src/test/resources/json/hs2-create-db-v2.json @@ -25,6 +25,30 @@ } }, "proxy": false + }, + { + "typeName": "hdfs_path", + "attributes": { + "path": "hdfs://ve0126.halxg.cloudera.com:8020/warehouse/tablespace/external/hive/cadb02.db", + "qualifiedName": "hdfs://ve0126.halxg.cloudera.com:8020/warehouse/tablespace/external/hive/cadb02.db@cm", + "clusterName": "cm", + "name": "/warehouse/tablespace/external/hive/cadb02.db" + }, + "guid": "-14529329955589450", + "isIncomplete": false, + "provenanceType": 0, + "version": 0, + "relationshipAttributes": { + "hiveDb": { + "guid": "-14529329955589448", + "typeName": "hive_db", + "uniqueAttributes": { + "qualifiedName": "cadb02@cm" + }, + "relationshipType": "hive_db_location" + } + }, + "proxy": false } ] } diff --git a/addons/hive-bridge/src/test/resources/json/hs2-create-db-v2.json b/addons/hive-bridge/src/test/resources/json/hs2-create-db-with-no-pathentities-to-retain-v2.json similarity index 100% copy from addons/hive-bridge/src/test/resources/json/hs2-create-db-v2.json copy to addons/hive-bridge/src/test/resources/json/hs2-create-db-with-no-pathentities-to-retain-v2.json diff --git a/addons/hive-bridge/src/test/resources/json/hs2-create-db-with-no-pathentities-to-retain.json b/addons/hive-bridge/src/test/resources/json/hs2-create-db-with-no-pathentities-to-retain.json new file mode 100644 index 0000000..a5b810f --- /dev/null +++ b/addons/hive-bridge/src/test/resources/json/hs2-create-db-with-no-pathentities-to-retain.json @@ -0,0 +1,73 @@ +{ + "referredEntities": {}, + "entities": [ + { + "typeName": "hive_db", + "attributes": { + "owner": "hive", + "ownerType": "USER", + "managedLocation": null, + "qualifiedName": "cadb02@cm", + "clusterName": "cm", + "name": "cadb02", + "location": "hdfs://ve0126.halxg.cloudera.com:8020/warehouse/tablespace/external/hive/cadb02.db", + "parameters": {} + }, + "guid": "-14529329955589448", + "isIncomplete": false, + "provenanceType": 0, + "version": 0, + "proxy": false + }, + { + "typeName": "hive_db_ddl", + "attributes": { + "serviceType": "hive", + "qualifiedName": "cadb02@cm:1616450673617", + "execTime": 1616450673617, + "queryText": "create database cadb02", + "name": "create database cadb02", + "userName": "hive" + }, + "guid": "-14529329955589449", + "isIncomplete": false, + "provenanceType": 0, + "version": 0, + "relationshipAttributes": { + "db": { + "guid": "-14529329955589448", + "typeName": "hive_db", + "uniqueAttributes": { + "qualifiedName": "cadb02@cm" + }, + "relationshipType": "hive_db_ddl_queries" + } + }, + "proxy": false + }, + { + "typeName": "hdfs_path", + "attributes": { + "path": "hdfs://ve0126.halxg.cloudera.com:8020/warehouse/tablespace/external/hive/cadb02.db", + "qualifiedName": "hdfs://ve0126.halxg.cloudera.com:8020/warehouse/tablespace/external/hive/cadb02.db@cm", + "clusterName": "cm", + "name": "/warehouse/tablespace/external/hive/cadb02.db" + }, + "guid": "-14529329955589450", + "isIncomplete": false, + "provenanceType": 0, + "version": 0, + "relationshipAttributes": { + "hiveDb": { + "guid": "-14529329955589448", + "typeName": "hive_db", + "uniqueAttributes": { + "qualifiedName": "cadb02@cm" + }, + "relationshipType": "hive_db_location" + } + }, + "proxy": false + } + ] +} diff --git a/addons/hive-bridge/src/test/resources/json/hs2-create-table-v2.json b/addons/hive-bridge/src/test/resources/json/hs2-create-table-v2.json index b67f2ff..801918e 100644 --- a/addons/hive-bridge/src/test/resources/json/hs2-create-table-v2.json +++ b/addons/hive-bridge/src/test/resources/json/hs2-create-table-v2.json @@ -1,5 +1,20 @@ { - "referredEntities": {}, + "referredEntities": { + "-14529329955589455": { + "typeName": "hdfs_path", + "attributes": { + "path": "hdfs://ve0126.halxg.cloudera.com:8020/tmp/external/hh6.csv", + "qualifiedName": "hdfs://ve0126.halxg.cloudera.com:8020/tmp/external/hh6.csv@cm", + "clusterName": "cm", + "name": "/tmp/external/hh6.csv" + }, + "guid": "-14529329955589455", + "isIncomplete": false, + "provenanceType": 0, + "version": 0, + "proxy": false + } + }, "entities": [ { "typeName": "hive_process", diff --git a/addons/hive-bridge/src/test/resources/json/hs2-load-inpath-v2.json b/addons/hive-bridge/src/test/resources/json/hs2-load-inpath-v2.json new file mode 100644 index 0000000..dd31aa0 --- /dev/null +++ b/addons/hive-bridge/src/test/resources/json/hs2-load-inpath-v2.json @@ -0,0 +1,18 @@ +{ + "referredEntities": { + "-14529329955589455": { + "typeName": "hdfs_path", + "attributes": { + "path": "hdfs://ve0126.halxg.cloudera.com:8020/tmp/external/hh6.csv", + "qualifiedName": "hdfs://ve0126.halxg.cloudera.com:8020/tmp/external/hh6.csv@cm", + "clusterName": "cm", + "name": "/tmp/external/hh6.csv" + }, + "guid": "-14529329955589455", + "isIncomplete": false, + "provenanceType": 0, + "version": 0, + "proxy": false + } + } +} \ No newline at end of file diff --git a/addons/hive-bridge/src/test/resources/json/hs2-load-inpath-with-no-pathentities-to-retain-v2.json b/addons/hive-bridge/src/test/resources/json/hs2-load-inpath-with-no-pathentities-to-retain-v2.json new file mode 100644 index 0000000..7f90d19 --- /dev/null +++ b/addons/hive-bridge/src/test/resources/json/hs2-load-inpath-with-no-pathentities-to-retain-v2.json @@ -0,0 +1,4 @@ +{ + "referredEntities": { + } +} \ No newline at end of file diff --git a/addons/hive-bridge/src/test/resources/json/hs2-load-inpath-with-no-pathentities-to-retain.json b/addons/hive-bridge/src/test/resources/json/hs2-load-inpath-with-no-pathentities-to-retain.json new file mode 100644 index 0000000..dd31aa0 --- /dev/null +++ b/addons/hive-bridge/src/test/resources/json/hs2-load-inpath-with-no-pathentities-to-retain.json @@ -0,0 +1,18 @@ +{ + "referredEntities": { + "-14529329955589455": { + "typeName": "hdfs_path", + "attributes": { + "path": "hdfs://ve0126.halxg.cloudera.com:8020/tmp/external/hh6.csv", + "qualifiedName": "hdfs://ve0126.halxg.cloudera.com:8020/tmp/external/hh6.csv@cm", + "clusterName": "cm", + "name": "/tmp/external/hh6.csv" + }, + "guid": "-14529329955589455", + "isIncomplete": false, + "provenanceType": 0, + "version": 0, + "proxy": false + } + } +} \ No newline at end of file diff --git a/addons/hive-bridge/src/test/resources/json/hs2-load-inpath.json b/addons/hive-bridge/src/test/resources/json/hs2-load-inpath.json new file mode 100644 index 0000000..dd31aa0 --- /dev/null +++ b/addons/hive-bridge/src/test/resources/json/hs2-load-inpath.json @@ -0,0 +1,18 @@ +{ + "referredEntities": { + "-14529329955589455": { + "typeName": "hdfs_path", + "attributes": { + "path": "hdfs://ve0126.halxg.cloudera.com:8020/tmp/external/hh6.csv", + "qualifiedName": "hdfs://ve0126.halxg.cloudera.com:8020/tmp/external/hh6.csv@cm", + "clusterName": "cm", + "name": "/tmp/external/hh6.csv" + }, + "guid": "-14529329955589455", + "isIncomplete": false, + "provenanceType": 0, + "version": 0, + "proxy": false + } + } +} \ No newline at end of file