[ 
https://issues.apache.org/jira/browse/HADOOP-18596?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17683091#comment-17683091
 ] 

ASF GitHub Bot commented on HADOOP-18596:
-----------------------------------------

steveloughran commented on code in PR #5308:
URL: https://github.com/apache/hadoop/pull/5308#discussion_r1093469149


##########
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/contract/AbstractContractDistCpTest.java:
##########
@@ -868,35 +892,41 @@ public void testDistCpUpdateCheckFileSkip() throws 
Exception {
 
     Path source = new Path(remoteDir, "file");
     Path dest = new Path(localDir, "file");
+
+    Path source_0byte = new Path(remoteDir, "file_0byte");
+    Path dest_0byte = new Path(localDir, "file_0byte");
     dest = localFS.makeQualified(dest);
+    dest_0byte = localFS.makeQualified(dest_0byte);
 
     // Creating a source file with certain dataset.
     byte[] sourceBlock = dataset(10, 'a', 'z');
 
     // Write the dataset and as well create the target path.
-    try (FSDataOutputStream out = remoteFS.create(source)) {
-      out.write(sourceBlock);
-      localFS.create(dest);
-    }
+    ContractTestUtils.createFile(localFS, dest, true, sourceBlock);
+    ContractTestUtils
+        .writeDataset(remoteFS, source, sourceBlock, sourceBlock.length,
+            1024, true);
 
-    verifyPathExists(remoteFS, "", source);
-    verifyPathExists(localFS, "", dest);
-    DistCpTestUtils
-        .assertRunDistCp(DistCpConstants.SUCCESS, remoteDir.toString(),
-            localDir.toString(), "-delete -update" + getDefaultCLIOptions(),
-            conf);
+    // Create 0 byte source and target files.
+    ContractTestUtils.createFile(remoteFS, source_0byte, true, new byte[0]);
+    ContractTestUtils.createFile(localFS, dest_0byte, true, new byte[0]);
+
+    // Execute the distcp -update job.
+    Job job = distCpUpdateWithFs(remoteDir, localDir, remoteFS, localFS);
 
     // First distcp -update would normally copy the source to dest.
     verifyFileContents(localFS, dest, sourceBlock);
+    // Verify 1 file was skipped in the distcp -update(They 0 byte files).

Review Comment:
   nit: add space after update and replace They with `the`



##########
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/contract/AbstractContractDistCpTest.java:
##########
@@ -50,6 +50,7 @@
 import org.apache.hadoop.tools.mapred.CopyMapper;
 import org.apache.hadoop.tools.util.DistCpTestUtils;
 import org.apache.hadoop.util.functional.RemoteIterators;
+import org.apache.http.annotation.Contract;

Review Comment:
   where does this come from/get used?



##########
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/contract/AbstractContractDistCpTest.java:
##########
@@ -913,32 +943,65 @@ public void testDistCpUpdateCheckFileSkip() throws 
Exception {
     long newTargetModTimeNew = modTimeSourceUpd + MODIFICATION_TIME_OFFSET;
     localFS.setTimes(dest, newTargetModTimeNew, -1);
 
-    DistCpTestUtils
-        .assertRunDistCp(DistCpConstants.SUCCESS, remoteDir.toString(),
-            localDir.toString(), "-delete -update" + getDefaultCLIOptions(),
-            conf);
+    // Execute the distcp -update job.
+    Job updatedSourceJobOldSrc =
+        distCpUpdateWithFs(remoteDir, localDir, remoteFS,
+            localFS);
 
     // File contents should remain same since the mod time for target is
     // newer than the updatedSource which indicates that the sync happened
     // more recently and there is no update.
     verifyFileContents(localFS, dest, sourceBlock);
+    // Skipped both 0 byte file and sourceFile(since mod time of target is
+    // older than the source it is perceived that source is of older version
+    // and we can skip it's copy).
+    verifySkipAndCopyCounter(updatedSourceJobOldSrc, 2, 0);
 
     // Subtract by an offset which would ensure enough gap for the test to
     // not fail due to race conditions.
     long newTargetModTimeOld =
         Math.min(modTimeSourceUpd - MODIFICATION_TIME_OFFSET, 0);
     localFS.setTimes(dest, newTargetModTimeOld, -1);
 
-    DistCpTestUtils
-        .assertRunDistCp(DistCpConstants.SUCCESS, remoteDir.toString(),
-            localDir.toString(), "-delete -update" + getDefaultCLIOptions(),
-            conf);
+    // Execute the distcp -update job.
+    Job updatedSourceJobNewSrc = distCpUpdateWithFs(remoteDir, localDir,
+        remoteFS,
+        localFS);
 
-    Assertions.assertThat(RemoteIterators.toList(localFS.listFiles(dest, 
true)))
-        .hasSize(1);
+    // Verifying the target directory have both 0 byte file and the content
+    // file.
+    Assertions
+        .assertThat(RemoteIterators.toList(localFS.listFiles(localDir, true)))
+        .hasSize(2);
     // Now the copy should take place and the file contents should change
     // since the mod time for target is older than the source file indicating
     // that there was an update to the source after the last sync took place.
     verifyFileContents(localFS, dest, updatedSourceBlock);
+    // Verifying we skipped the 0 byte file and copied the updated source
+    // file (since the modification time of the new source is older than the
+    // target now).
+    verifySkipAndCopyCounter(updatedSourceJobNewSrc, 1, 1);

Review Comment:
   this is good; verifies the 0 byte support





> Distcp -update between different cloud stores to use modification time while 
> checking for file skip.
> ----------------------------------------------------------------------------------------------------
>
>                 Key: HADOOP-18596
>                 URL: https://issues.apache.org/jira/browse/HADOOP-18596
>             Project: Hadoop Common
>          Issue Type: Improvement
>          Components: tools/distcp
>            Reporter: Mehakmeet Singh
>            Assignee: Mehakmeet Singh
>            Priority: Major
>              Labels: pull-request-available
>
> Distcp -update currently relies on File size, block size, and Checksum 
> comparisons to figure out which files should be skipped or copied. 
> Since different cloud stores have different checksum algorithms, we should 
> add modification time to the checks as well.
> This would ensure that, while performing -update, files perceived to be 
> out of sync are copied. The machines between which the file 
> transfers occur should have synchronized clocks to avoid any extra copies.
> Improving testing and documentation for modification time checks between 
> different object stores to ensure no incorrect skipping of files.



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to