Re: [PR] Spark Action to Analyze table [iceberg]

via GitHub Tue, 18 Jun 2024 04:15:13 -0700


findepi commented on code in PR #10288:
URL: https://github.com/apache/iceberg/pull/10288#discussion_r1644268788



##########
spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/AnalyzeTableSparkAction.java:
##########
@@ -47,13 +55,20 @@ public class AnalyzeTableSparkAction extends 
BaseSparkAction<AnalyzeTableSparkAc
   private static final Logger LOG = 
LoggerFactory.getLogger(AnalyzeTableSparkAction.class);
 
   private final Table table;
-  private Set<String> columns = ImmutableSet.of();
-  private Set<String> types = StandardBlobTypes.blobTypes();
+  private final Set<String> supportedBlobTypes =
+      ImmutableSet.of(StandardBlobTypes.APACHE_DATASKETCHES_THETA_V1);

Review Comment:
   Can this be static?



##########
core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java:
##########
@@ -26,4 +29,8 @@ private StandardBlobTypes() {}
    * href="https://datasketches.apache.org/">Apache DataSketches</a> library
    */
   public static final String APACHE_DATASKETCHES_THETA_V1 = 
"apache-datasketches-theta-v1";
+
+  public static Set<String> allStandardBlobTypes() {

Review Comment:
   Do we still need this method?



##########
spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/AnalyzeTableSparkAction.java:
##########
@@ -47,13 +55,20 @@ public class AnalyzeTableSparkAction extends 
BaseSparkAction<AnalyzeTableSparkAc
   private static final Logger LOG = 
LoggerFactory.getLogger(AnalyzeTableSparkAction.class);
 
   private final Table table;
-  private Set<String> columns = ImmutableSet.of();
-  private Set<String> types = StandardBlobTypes.blobTypes();
+  private final Set<String> supportedBlobTypes =
+      ImmutableSet.of(StandardBlobTypes.APACHE_DATASKETCHES_THETA_V1);
+  private Set<String> columns;
+  private Set<String> blobTypesToAnalyze = supportedBlobTypes;
   private Long snapshotId;
 
   AnalyzeTableSparkAction(SparkSession spark, Table table) {
     super(spark);
     this.table = table;
+    Snapshot snapshot = table.currentSnapshot();
+    ValidationException.check(snapshot != null, "Cannot analyze a table that 
has no snapshots");

Review Comment:
   It would be nice to handle this case gracefully.
   A table without snapshots is an empty table (no data).
   Also, stats are assigned to snapshots, and since there is no snapshot, 
no stats file can be created. 
   Thus there is only one way to handle this gracefully -- just no-op.
   I believe this would be better from the user's perspective than just throwing.



##########
spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/AnalyzeTableSparkAction.java:
##########
@@ -63,97 +78,98 @@ protected AnalyzeTableSparkAction self() {
 
   @Override
   public Result execute() {
-    if (snapshotId == null) {
-      snapshotId = table.currentSnapshot().snapshotId();
-    }
     String desc = String.format("Analyzing table %s for snapshot id %s", 
table.name(), snapshotId);
     JobGroupInfo info = newJobGroupInfo("ANALYZE-TABLE", desc);
     return withJobGroupInfo(info, this::doExecute);
   }
 
   private Result doExecute() {
-    LOG.info("Starting the analysis of {} for snapshot {}", table.name(), 
snapshotId);
-    List<AnalysisResult> analysisResults =
-        types.stream()
-            .map(
-                statsName -> {
-                  switch (statsName) {
+    LOG.info("Starting analysis of {} for snapshot {}", table.name(), 
snapshotId);
+    List<AnalysisResult> results = Lists.newArrayList();
+    List<Blob> blobs =
+        blobTypesToAnalyze.stream()
+            .flatMap(
+                type -> {
+                  switch (type) {
                     case StandardBlobTypes.APACHE_DATASKETCHES_THETA_V1:
-                      return generateNDVAndCommit();
+                      try {
+                        return generateNDVBlobs().stream();
+                      } catch (Exception e) {
+                        LOG.error(
+                            "Error occurred when collecting statistics for 
blob type {}", type, e);
+                        ImmutableAnalyzeTable.AnalysisResult result =
+                            ImmutableAnalyzeTable.AnalysisResult.builder()
+                                .type(type)
+                                .addErrors(e.getMessage())
+                                .build();
+                        results.add(result);
+                      }
+                      break;
                     default:
-                      return ImmutableAnalyzeTable.AnalysisResult.builder()
-                          .type(statsName)
-                          .addAllErrors(Lists.newArrayList("Stats type not 
supported"))
-                          .build();
+                      throw new UnsupportedOperationException();
                   }
+                  return Stream.empty();
                 })
             .collect(Collectors.toList());
-    return 
ImmutableAnalyzeTable.Result.builder().analysisResults(analysisResults).build();
+    try {
+      writeAndCommitPuffin(blobs);
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+    return 
ImmutableAnalyzeTable.Result.builder().analysisResults(results).build();
   }
 
-  private boolean analyzableTypes(Set<String> columnNames) {
-    return columnNames.stream()
-        .anyMatch(
-            columnName -> {
-              Types.NestedField field = table.schema().findField(columnName);
-              if (field == null) {
-                throw new ValidationException("No column with %s name in the 
table", columnName);
-              }
-              Type.TypeID type = field.type().typeId();
-              return type == Type.TypeID.INTEGER
-                  || type == Type.TypeID.LONG
-                  || type == Type.TypeID.STRING
-                  || type == Type.TypeID.DOUBLE;
-            });
+  private void writeAndCommitPuffin(List<Blob> blobs) throws Exception {
+    TableOperations operations = ((HasTableOperations) table).operations();
+    FileIO fileIO = operations.io();
+    String path = operations.metadataFileLocation(String.format("%s.stats", 
UUID.randomUUID()));
+    OutputFile outputFile = fileIO.newOutputFile(path);
+    GenericStatisticsFile statisticsFile;
+    try (PuffinWriter writer =
+        Puffin.write(outputFile).createdBy("Iceberg Analyze action").build()) {
+      blobs.forEach(writer::add);
+      writer.finish();
+      statisticsFile =
+          new GenericStatisticsFile(
+              snapshotId,
+              path,
+              writer.fileSize(),
+              writer.footerSize(),
+              writer.writtenBlobsMetadata().stream()
+                  .map(GenericBlobMetadata::from)
+                  .collect(ImmutableList.toImmutableList()));
+    }
+    table.updateStatistics().setStatistics(snapshotId, 
statisticsFile).commit();
   }
 
-  private AnalysisResult generateNDVAndCommit() {
-    try {
-      if (snapshotId == null) {
-        snapshotId = table.currentSnapshot().snapshotId();
-      }
-
-      StatisticsFile statisticsFile =
-          NDVSketchGenerator.generateNDV(
-              spark(), table, snapshotId, columns.toArray(new String[0]));
-      table.updateStatistics().setStatistics(snapshotId, 
statisticsFile).commit();
-      return ImmutableAnalyzeTable.AnalysisResult.builder()
-          .type(StandardBlobTypes.APACHE_DATASKETCHES_THETA_V1)
-          .build();
-    } catch (IOException ioe) {
-      List<String> errors = Lists.newArrayList();
-      errors.add(ioe.getMessage());
-      return ImmutableAnalyzeTable.AnalysisResult.builder()
-          .type(StandardBlobTypes.APACHE_DATASKETCHES_THETA_V1)
-          .addAllErrors(errors)
-          .build();
-    }
+  private List<Blob> generateNDVBlobs() {
+    return NDVSketchGenerator.generateNDVSketchesAndBlobs(spark(), table, 
snapshotId, columns);
   }
 
   @Override
   public AnalyzeTable columns(String... columnNames) {
     Preconditions.checkArgument(
         columnNames != null && columnNames.length > 0, "Columns cannot be 
null/empty");
-    Set<String> columnsSet = Sets.newHashSet(Arrays.asList(columnNames));
-    Preconditions.checkArgument(
-        analyzableTypes(columnsSet),
-        "Cannot be applied to the given columns, since the column's type is 
not supported");
-    this.columns = columnsSet;
+    for (String columnName : columnNames) {
+      Types.NestedField field = table.schema().findField(columnName);
+      if (field == null) {
+        throw new ValidationException("No column with %s name in the table", 
columnName);
+      }
+    }
+    this.columns = ImmutableSet.copyOf(columnNames);
     return this;
   }
 
   @Override
-  public AnalyzeTable types(Set<String> statisticTypes) {
-    Preconditions.checkArgument(
-        
Sets.newHashSet(StandardBlobTypes.blobTypes()).containsAll(statisticTypes),
-        "type not supported");
-    this.types = statisticTypes;
+  public AnalyzeTable blobTypes(Set<String> types) {
+    Preconditions.checkArgument(supportedBlobTypes.containsAll(types), "type 
not supported");
+    this.blobTypesToAnalyze = types;
     return this;
   }
 
   @Override
-  public AnalyzeTable snapshot(String snapshotIdStr) {
-    this.snapshotId = Long.parseLong(snapshotIdStr);
+  public AnalyzeTable snapshot(long snapId) {
+    this.snapshotId = snapId;

Review Comment:
   nit: snapId -> snapshotId



##########
spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/AnalyzeTableSparkAction.java:
##########
@@ -63,97 +78,98 @@ protected AnalyzeTableSparkAction self() {
 
   @Override
   public Result execute() {
-    if (snapshotId == null) {
-      snapshotId = table.currentSnapshot().snapshotId();
-    }
     String desc = String.format("Analyzing table %s for snapshot id %s", 
table.name(), snapshotId);
     JobGroupInfo info = newJobGroupInfo("ANALYZE-TABLE", desc);
     return withJobGroupInfo(info, this::doExecute);
   }
 
   private Result doExecute() {
-    LOG.info("Starting the analysis of {} for snapshot {}", table.name(), 
snapshotId);
-    List<AnalysisResult> analysisResults =
-        types.stream()
-            .map(
-                statsName -> {
-                  switch (statsName) {
+    LOG.info("Starting analysis of {} for snapshot {}", table.name(), 
snapshotId);
+    List<AnalysisResult> results = Lists.newArrayList();
+    List<Blob> blobs =
+        blobTypesToAnalyze.stream()
+            .flatMap(
+                type -> {
+                  switch (type) {
                     case StandardBlobTypes.APACHE_DATASKETCHES_THETA_V1:
-                      return generateNDVAndCommit();
+                      try {
+                        return generateNDVBlobs().stream();
+                      } catch (Exception e) {
+                        LOG.error(
+                            "Error occurred when collecting statistics for 
blob type {}", type, e);
+                        ImmutableAnalyzeTable.AnalysisResult result =
+                            ImmutableAnalyzeTable.AnalysisResult.builder()
+                                .type(type)
+                                .addErrors(e.getMessage())
+                                .build();
+                        results.add(result);
+                      }
+                      break;
                     default:
-                      return ImmutableAnalyzeTable.AnalysisResult.builder()
-                          .type(statsName)
-                          .addAllErrors(Lists.newArrayList("Stats type not 
supported"))
-                          .build();
+                      throw new UnsupportedOperationException();
                   }
+                  return Stream.empty();
                 })
             .collect(Collectors.toList());
-    return 
ImmutableAnalyzeTable.Result.builder().analysisResults(analysisResults).build();
+    try {
+      writeAndCommitPuffin(blobs);
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+    return 
ImmutableAnalyzeTable.Result.builder().analysisResults(results).build();
   }
 
-  private boolean analyzableTypes(Set<String> columnNames) {
-    return columnNames.stream()
-        .anyMatch(
-            columnName -> {
-              Types.NestedField field = table.schema().findField(columnName);
-              if (field == null) {
-                throw new ValidationException("No column with %s name in the 
table", columnName);
-              }
-              Type.TypeID type = field.type().typeId();
-              return type == Type.TypeID.INTEGER
-                  || type == Type.TypeID.LONG
-                  || type == Type.TypeID.STRING
-                  || type == Type.TypeID.DOUBLE;
-            });
+  private void writeAndCommitPuffin(List<Blob> blobs) throws Exception {
+    TableOperations operations = ((HasTableOperations) table).operations();
+    FileIO fileIO = operations.io();
+    String path = operations.metadataFileLocation(String.format("%s.stats", 
UUID.randomUUID()));
+    OutputFile outputFile = fileIO.newOutputFile(path);
+    GenericStatisticsFile statisticsFile;
+    try (PuffinWriter writer =
+        Puffin.write(outputFile).createdBy("Iceberg Analyze action").build()) {
+      blobs.forEach(writer::add);
+      writer.finish();
+      statisticsFile =
+          new GenericStatisticsFile(
+              snapshotId,
+              path,
+              writer.fileSize(),
+              writer.footerSize(),
+              writer.writtenBlobsMetadata().stream()
+                  .map(GenericBlobMetadata::from)
+                  .collect(ImmutableList.toImmutableList()));
+    }
+    table.updateStatistics().setStatistics(snapshotId, 
statisticsFile).commit();
   }
 
-  private AnalysisResult generateNDVAndCommit() {
-    try {
-      if (snapshotId == null) {
-        snapshotId = table.currentSnapshot().snapshotId();
-      }
-
-      StatisticsFile statisticsFile =
-          NDVSketchGenerator.generateNDV(
-              spark(), table, snapshotId, columns.toArray(new String[0]));
-      table.updateStatistics().setStatistics(snapshotId, 
statisticsFile).commit();
-      return ImmutableAnalyzeTable.AnalysisResult.builder()
-          .type(StandardBlobTypes.APACHE_DATASKETCHES_THETA_V1)
-          .build();
-    } catch (IOException ioe) {
-      List<String> errors = Lists.newArrayList();
-      errors.add(ioe.getMessage());
-      return ImmutableAnalyzeTable.AnalysisResult.builder()
-          .type(StandardBlobTypes.APACHE_DATASKETCHES_THETA_V1)
-          .addAllErrors(errors)
-          .build();
-    }
+  private List<Blob> generateNDVBlobs() {
+    return NDVSketchGenerator.generateNDVSketchesAndBlobs(spark(), table, 
snapshotId, columns);
   }
 
   @Override
   public AnalyzeTable columns(String... columnNames) {
     Preconditions.checkArgument(
         columnNames != null && columnNames.length > 0, "Columns cannot be 
null/empty");
-    Set<String> columnsSet = Sets.newHashSet(Arrays.asList(columnNames));
-    Preconditions.checkArgument(
-        analyzableTypes(columnsSet),
-        "Cannot be applied to the given columns, since the column's type is 
not supported");
-    this.columns = columnsSet;
+    for (String columnName : columnNames) {
+      Types.NestedField field = table.schema().findField(columnName);
+      if (field == null) {
+        throw new ValidationException("No column with %s name in the table", 
columnName);
+      }
+    }
+    this.columns = ImmutableSet.copyOf(columnNames);
     return this;
   }
 
   @Override
-  public AnalyzeTable types(Set<String> statisticTypes) {
-    Preconditions.checkArgument(
-        
Sets.newHashSet(StandardBlobTypes.blobTypes()).containsAll(statisticTypes),
-        "type not supported");
-    this.types = statisticTypes;
+  public AnalyzeTable blobTypes(Set<String> types) {

Review Comment:
   nit: types -> blobTypes (just like in the interface method declaration)



##########
spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/AnalyzeTableSparkAction.java:
##########
@@ -63,97 +78,98 @@ protected AnalyzeTableSparkAction self() {
 
   @Override
   public Result execute() {
-    if (snapshotId == null) {
-      snapshotId = table.currentSnapshot().snapshotId();
-    }
     String desc = String.format("Analyzing table %s for snapshot id %s", 
table.name(), snapshotId);
     JobGroupInfo info = newJobGroupInfo("ANALYZE-TABLE", desc);
     return withJobGroupInfo(info, this::doExecute);
   }
 
   private Result doExecute() {
-    LOG.info("Starting the analysis of {} for snapshot {}", table.name(), 
snapshotId);
-    List<AnalysisResult> analysisResults =
-        types.stream()
-            .map(
-                statsName -> {
-                  switch (statsName) {
+    LOG.info("Starting analysis of {} for snapshot {}", table.name(), 
snapshotId);
+    List<AnalysisResult> results = Lists.newArrayList();
+    List<Blob> blobs =
+        blobTypesToAnalyze.stream()
+            .flatMap(
+                type -> {
+                  switch (type) {
                     case StandardBlobTypes.APACHE_DATASKETCHES_THETA_V1:
-                      return generateNDVAndCommit();
+                      try {
+                        return generateNDVBlobs().stream();
+                      } catch (Exception e) {
+                        LOG.error(
+                            "Error occurred when collecting statistics for 
blob type {}", type, e);
+                        ImmutableAnalyzeTable.AnalysisResult result =
+                            ImmutableAnalyzeTable.AnalysisResult.builder()
+                                .type(type)
+                                .addErrors(e.getMessage())
+                                .build();
+                        results.add(result);
+                      }
+                      break;
                     default:
-                      return ImmutableAnalyzeTable.AnalysisResult.builder()
-                          .type(statsName)
-                          .addAllErrors(Lists.newArrayList("Stats type not 
supported"))
-                          .build();
+                      throw new UnsupportedOperationException();

Review Comment:
   If this exception is ever thrown (e.g. due to future code modifications), 
it would be helpful to include `type` in the exception message.



##########
spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/AnalyzeTableSparkAction.java:
##########
@@ -47,13 +55,20 @@ public class AnalyzeTableSparkAction extends 
BaseSparkAction<AnalyzeTableSparkAc
   private static final Logger LOG = 
LoggerFactory.getLogger(AnalyzeTableSparkAction.class);
 
   private final Table table;
-  private Set<String> columns = ImmutableSet.of();
-  private Set<String> types = StandardBlobTypes.blobTypes();
+  private final Set<String> supportedBlobTypes =
+      ImmutableSet.of(StandardBlobTypes.APACHE_DATASKETCHES_THETA_V1);
+  private Set<String> columns;
+  private Set<String> blobTypesToAnalyze = supportedBlobTypes;

Review Comment:
   What about combining these into a list of (blob type, columns) pairs?
   This might be necessary when we add support for new blob types.
   see https://github.com/apache/iceberg/pull/10288/files#r1639547902



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Re: [PR] Spark Action to Analyze table [iceberg]

Reply via email to