From 608228b03425fe0605e0a0dab650f771c923110d Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Tue, 7 Apr 2020 17:55:51 -0400 Subject: [PATCH] Another documentation entry, the "dataset management" section of the user guide. (#6558) --- .../source/user/dataset-management.rst | 2 + .../harvard/iq/dataverse/util/FileUtil.java | 58 +++++++++++++++---- 2 files changed, 49 insertions(+), 11 deletions(-) diff --git a/doc/sphinx-guides/source/user/dataset-management.rst b/doc/sphinx-guides/source/user/dataset-management.rst index 0c8da751620..f9ce457f5c0 100755 --- a/doc/sphinx-guides/source/user/dataset-management.rst +++ b/doc/sphinx-guides/source/user/dataset-management.rst @@ -465,6 +465,8 @@ Publish Dataset When you publish a dataset (available to an Admin, Curator, or any custom role which has this level of permission assigned), you make it available to the public so that other users can browse or search for it. Once your dataset is ready to go public, go to your dataset page and click on the "Publish" button on the right hand side of the page. A pop-up will appear to confirm that you are ready to actually Publish since once a dataset is made public it can no longer be unpublished. +Before Dataverse finalizes the publication of the dataset, it will attempt to validate all the physical files in it, to make sure they are present and intact. In the unlikely event that any files fail the validation, you will see an error message informing you that the problem must be fixed by the local Dataverse Admin before the dataset can be published. + Whenever you edit your dataset, you are able to publish a new version of the dataset. The publish dataset button will reappear whenever you edit the metadata of the dataset or add a file. 
Note: Prior to publishing your dataset the Data Citation will indicate that this is a draft but the "DRAFT VERSION" text diff --git a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java index 6f7c56c344c..fabd1dc19f2 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @@ -1690,13 +1690,17 @@ public static void validateDataFileChecksum(DataFile dataFile) throws IOExceptio StorageIO storage = dataFile.getStorageIO(); storage.open(DataAccessOption.READ_ACCESS); InputStream in = null; - - if (!dataFile.isTabularData()) { - in = storage.getInputStream(); - } else { - // if this is a tabular file, read the preserved original "auxiliary file" - // instead: - in = storage.getAuxFileAsInputStream(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION); + + try { + if (!dataFile.isTabularData()) { + in = storage.getInputStream(); + } else { + // if this is a tabular file, read the preserved original "auxiliary file" + // instead: + in = storage.getAuxFileAsInputStream(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION); + } + } catch (IOException ioex) { + in = null; } if (in == null) { @@ -1720,13 +1724,45 @@ public static void validateDataFileChecksum(DataFile dataFile) throws IOExceptio throw new IOException(info); } - // TODO: What should we do if the datafile does not have a non-null checksum? + // TODO? What should we do if the datafile does not have a non-null checksum? // Should we fail, or should we assume that the recalculated checksum // is correct, and populate the checksumValue field with it? 
if (!recalculatedChecksum.equals(dataFile.getChecksumValue())) { - String info = BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.wrongChecksumValue", Arrays.asList(dataFile.getId().toString())); - logger.log(Level.INFO, info); - throw new IOException(info); + // There's one possible condition that is 100% recoverable and can + // be automatically fixed (issue #6660): + boolean fixed = false; + if (!dataFile.isTabularData() && dataFile.getIngestReport() != null) { + // try again, see if the .orig file happens to be there: + try { + in = storage.getAuxFileAsInputStream(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION); + } catch (IOException ioex) { + in = null; + } + if (in != null) { + try { + recalculatedChecksum = FileUtil.calculateChecksum(in, checksumType); + } catch (RuntimeException rte) { + recalculatedChecksum = null; + } finally { + IOUtils.closeQuietly(in); + } + // try again: + if (recalculatedChecksum.equals(dataFile.getChecksumValue())) { + fixed = true; + try { + storage.revertBackupAsAux(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION); + } catch (IOException ioex) { + fixed = false; + } + } + } + } + + if (!fixed) { + String info = BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.wrongChecksumValue", Arrays.asList(dataFile.getId().toString())); + logger.log(Level.INFO, info); + throw new IOException(info); + } } logger.log(Level.INFO, "successfully validated DataFile {0}; checksum {1}", new Object[]{dataFile.getId(), recalculatedChecksum});