From bec394519826529c02adedfdd601f04b45f859c2 Mon Sep 17 00:00:00 2001
From: landreev
Date: Wed, 7 Feb 2024 11:50:52 -0500
Subject: [PATCH] 8524 adding mechanism for storing tab. files with variable
 headers (#10282)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* "stored with header" flag #8524
* more changes for the streaming and redirect code. #8524
* disabling dynamically-generated varheader in the remaining storage drivers. #8524
* Ingest plugins (work in progress) #8524
* R ingest plugin (#8524)
* still some unaddressed @todo:s, but the branch should build and the unit tests should be passing. #8524
* work-in-progress, on the subsetting code in the download instance writer. #8524
* more work-in-progress changes. removing all the unused code from TabularSubsetGenerator, for clarity etc. #8524
* more bits and pieces #8524
* 2 more ingest plugins. #8524
* Integration tests. #8524
* typo #8524
* documenting the new setting. #8524
* a release note for the PR; also added the "storage quotas enabled" setting to the list of settings documented in the config guide while I was at it. #8524
* removed all the unused code from this class (lots of it) for clarity, etc. git history can be consulted if anyone is curious about what we used to do here. #8524
* removing @todo: that's no longer relevant #8524
* (cosmetic) defined the control constants used in the integration test. #8524
---
 ...4-storing-tabular-files-with-varheaders.md |    6 +
 .../source/installation/config.rst            |   22 +
 .../edu/harvard/iq/dataverse/DataTable.java   |   18 +
 .../dataverse/api/DownloadInstanceWriter.java |   78 +-
 .../harvard/iq/dataverse/api/TestIngest.java  |    2 +-
 .../iq/dataverse/dataaccess/FileAccessIO.java |    3 +-
 .../dataaccess/GlobusOverlayAccessIO.java     |    8 +-
 .../dataaccess/RemoteOverlayAccessIO.java     |    8 +-
 .../iq/dataverse/dataaccess/S3AccessIO.java   |    3 +-
 .../dataverse/dataaccess/SwiftAccessIO.java   |    3 +-
 .../dataaccess/TabularSubsetGenerator.java    | 1150 +---------------
 .../dataaccess/TabularSubsetInputStream.java  |  114 --
 .../export/DDIExportServiceBean.java          |   11 +
 .../dataverse/ingest/IngestServiceBean.java   |   64 +-
 .../tabulardata/TabularDataFileReader.java    |   26 +-
 .../impl/plugins/csv/CSVFileReader.java       |   24 +-
 .../impl/plugins/dta/DTAFileReader.java       |   11 +-
 .../impl/plugins/dta/NewDTAFileReader.java    |   19 +-
 .../impl/plugins/por/PORFileReader.java       |   13 +-
 .../impl/plugins/rdata/RDATAFileReader.java   |    4 +-
 .../impl/plugins/rdata/RTabFileParser.java    |   28 +-
 .../impl/plugins/sav/SAVFileReader.java       |   24 +-
 .../impl/plugins/xlsx/XLSXFileReader.java     |   11 +-
 .../settings/SettingsServiceBean.java         |    7 +-
 .../iq/dataverse/util/SystemConfig.java       |    8 +
 ...24-store-tabular-files-with-varheaders.sql |    1 +
 .../edu/harvard/iq/dataverse/api/FilesIT.java |  128 ++
 .../dataverse/ingest/IngestFrequencyTest.java |    2 +-
 .../impl/plugins/csv/CSVFileReaderTest.java   |   24 +-
 .../impl/plugins/dta/DTAFileReaderTest.java   |    2 +-
 .../plugins/dta/NewDTAFileReaderTest.java     |   14 +-
 31 files changed, 501 insertions(+), 1335 deletions(-)
 create mode 100644 doc/release-notes/8524-storing-tabular-files-with-varheaders.md
 delete mode 100644 src/main/java/edu/harvard/iq/dataverse/dataaccess/TabularSubsetInputStream.java
 create mode 100644 src/main/resources/db/migration/V6.1.0.2__8524-store-tabular-files-with-varheaders.sql

diff --git a/doc/release-notes/8524-storing-tabular-files-with-varheaders.md b/doc/release-notes/8524-storing-tabular-files-with-varheaders.md
new file mode 100644
index 00000000000..f7034c846f6
--- /dev/null
+++ b/doc/release-notes/8524-storing-tabular-files-with-varheaders.md
@@ -0,0 +1,6 @@
+Tabular Data Ingest can now save the generated archival files with the list of variable names added as the first tab-delimited line. As the most significant effect of this feature,
+the Access API will be able to take advantage of Direct Download for tabular files saved with these headers on S3, since the header line no longer has to be generated and added to the streamed content on the fly.
+
+This behavior is controlled by the new setting `:StoreIngestedTabularFilesWithVarHeaders`. It is false by default, preserving the legacy behavior. When enabled, Dataverse will be able to handle both newly ingested files and any already-existing legacy files stored without these headers, transparently to the user. E.g., the Access API will continue delivering tab-delimited files **with** this header line, whether it needs to add the header dynamically for legacy files or can read complete files directly from storage for the ones stored with it.
+
+An API for converting existing legacy tabular files will be added separately. [this line will need to be changed if we have time to add said API before 6.2 is released].
\ No newline at end of file
diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst
index a7d7905ca4a..c233e594fa7 100644
--- a/doc/sphinx-guides/source/installation/config.rst
+++ b/doc/sphinx-guides/source/installation/config.rst
@@ -4151,3 +4151,25 @@ A true/false (default) option determining whether the dataset datafile table dis
 .. _supported MicroProfile Config API source: https://docs.payara.fish/community/docs/Technical%20Documentation/MicroProfile/Config/Overview.html
+
+.. _:UseStorageQuotas:
+
+:UseStorageQuotas
++++++++++++++++++
+
+Enables storage use quotas in collections. See the :doc:`/api/native-api` for details.
+
+
+.. _:StoreIngestedTabularFilesWithVarHeaders:
+
+:StoreIngestedTabularFilesWithVarHeaders
+++++++++++++++++++++++++++++++++++++++++
+
+With this setting enabled, tabular files produced during Ingest will
+be stored with the list of variable names added as the first
+tab-delimited line. As the most significant effect of this feature,
+the Access API will be able to take advantage of Direct Download for
+tabular files saved with these headers on S3, since the header line no
+longer has to be generated and added to the streamed file on the fly.
+
+The setting is ``false`` by default, preserving the legacy behavior.
diff --git a/src/main/java/edu/harvard/iq/dataverse/DataTable.java b/src/main/java/edu/harvard/iq/dataverse/DataTable.java
index a17d8c65138..95f3aed0f40 100644
--- a/src/main/java/edu/harvard/iq/dataverse/DataTable.java
+++ b/src/main/java/edu/harvard/iq/dataverse/DataTable.java
@@ -112,6 +112,16 @@ public DataTable() {
     @Column( nullable = true )
     private String originalFileName;
+
+    /**
+     * The physical tab-delimited file is stored with the list of variable
+     * names saved as the 1st line. This means that we do not need to generate
+     * this line on the fly. (Also means that the direct download mechanism
+     * can be used for this file!)
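+     *
+     * For illustration only (hypothetical variable names and values), a file
+     * stored with this flag set would begin like:
+     * <pre>
+     * id	name	income
+     * 1	"Alice"	42000.0
+     * 2	"Bob"	39500.0
+     * </pre>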
+ */ + @Column(nullable = false) + private boolean storedWithVariableHeader = false; + /* * Getter and Setter methods: */ @@ -206,6 +216,14 @@ public void setOriginalFileName(String originalFileName) { this.originalFileName = originalFileName; } + public boolean isStoredWithVariableHeader() { + return storedWithVariableHeader; + } + + public void setStoredWithVariableHeader(boolean storedWithVariableHeader) { + this.storedWithVariableHeader = storedWithVariableHeader; + } + /* * Custom overrides for hashCode(), equals() and toString() methods: */ diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java index bcb8799ec9e..89b22b76a7d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java @@ -22,7 +22,6 @@ import jakarta.ws.rs.ext.Provider; import edu.harvard.iq.dataverse.DataFile; -import edu.harvard.iq.dataverse.FileMetadata; import edu.harvard.iq.dataverse.dataaccess.*; import edu.harvard.iq.dataverse.datavariable.DataVariable; import edu.harvard.iq.dataverse.engine.command.Command; @@ -104,8 +103,10 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] String auxiliaryTag = null; String auxiliaryType = null; String auxiliaryFileName = null; + // Before we do anything else, check if this download can be handled // by a redirect to remote storage (only supported on S3, as of 5.4): + if (storageIO.downloadRedirectEnabled()) { // Even if the above is true, there are a few cases where a @@ -159,7 +160,7 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] } } else if (dataFile.isTabularData()) { - // Many separate special cases here. + // Many separate special cases here. if (di.getConversionParam() != null) { if (di.getConversionParam().equals("format")) { @@ -180,12 +181,26 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] redirectSupported = false; } } - } else if (!di.getConversionParam().equals("noVarHeader")) { - // This is a subset request - can't do. + } else if (di.getConversionParam().equals("noVarHeader")) { + // This will work just fine, if the tab. file is + // stored without the var. header. Throw "unavailable" + // exception otherwise. + // @todo: should we actually drop support for this "noVarHeader" flag? + if (dataFile.getDataTable().isStoredWithVariableHeader()) { + throw new ServiceUnavailableException(); + } + // ... defaults to redirectSupported = true + } else { + // This must be a subset request then - can't do. + redirectSupported = false; + } + } else { + // "straight" download of the full tab-delimited file. + // can redirect, but only if stored with the variable + // header already added: + if (!dataFile.getDataTable().isStoredWithVariableHeader()) { redirectSupported = false; } - } else { - redirectSupported = false; } } } @@ -247,11 +262,16 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] // finally, issue the redirect: Response response = Response.seeOther(redirect_uri).build(); logger.fine("Issuing redirect to the file location."); + // Yes, this throws an exception. It's not an exception + // as in, "bummer, something went wrong". This is how a + // redirect is produced here! 
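+                    // (RedirectionException is the jakarta.ws.rs exception type
+                    // for 3xx responses; the JAX-RS runtime turns it into the
+                    // 303 "See Other" response built above, sending the client
+                    // directly to the storage location.)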
throw new RedirectionException(response); } throw new ServiceUnavailableException(); } + // Past this point, this is a locally served/streamed download + if (di.getConversionParam() != null) { // Image Thumbnail and Tabular data conversion: // NOTE: only supported on local files, as of 4.0.2! @@ -285,9 +305,14 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] // request any tabular-specific services. if (di.getConversionParam().equals("noVarHeader")) { - logger.fine("tabular data with no var header requested"); - storageIO.setNoVarHeader(Boolean.TRUE); - storageIO.setVarHeader(null); + if (!dataFile.getDataTable().isStoredWithVariableHeader()) { + logger.fine("tabular data with no var header requested"); + storageIO.setNoVarHeader(Boolean.TRUE); + storageIO.setVarHeader(null); + } else { + logger.fine("can't serve request for tabular data without varheader, since stored with it"); + throw new ServiceUnavailableException(); + } } else if (di.getConversionParam().equals("format")) { // Conversions, and downloads of "stored originals" are // now supported on all DataFiles for which StorageIO @@ -329,11 +354,10 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] if (variable.getDataTable().getDataFile().getId().equals(dataFile.getId())) { logger.fine("adding variable id " + variable.getId() + " to the list."); variablePositionIndex.add(variable.getFileOrder()); - if (subsetVariableHeader == null) { - subsetVariableHeader = variable.getName(); - } else { - subsetVariableHeader = subsetVariableHeader.concat("\t"); - subsetVariableHeader = subsetVariableHeader.concat(variable.getName()); + if (!dataFile.getDataTable().isStoredWithVariableHeader()) { + subsetVariableHeader = subsetVariableHeader == null + ? variable.getName() + : subsetVariableHeader.concat("\t" + variable.getName()); } } else { logger.warning("variable does not belong to this data file."); @@ -346,7 +370,17 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] try { File tempSubsetFile = File.createTempFile("tempSubsetFile", ".tmp"); TabularSubsetGenerator tabularSubsetGenerator = new TabularSubsetGenerator(); - tabularSubsetGenerator.subsetFile(storageIO.getInputStream(), tempSubsetFile.getAbsolutePath(), variablePositionIndex, dataFile.getDataTable().getCaseQuantity(), "\t"); + + long numberOfLines = dataFile.getDataTable().getCaseQuantity(); + if (dataFile.getDataTable().isStoredWithVariableHeader()) { + numberOfLines++; + } + + tabularSubsetGenerator.subsetFile(storageIO.getInputStream(), + tempSubsetFile.getAbsolutePath(), + variablePositionIndex, + numberOfLines, + "\t"); if (tempSubsetFile.exists()) { FileInputStream subsetStream = new FileInputStream(tempSubsetFile); @@ -354,8 +388,11 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] InputStreamIO subsetStreamIO = new InputStreamIO(subsetStream, subsetSize); logger.fine("successfully created subset output stream."); - subsetVariableHeader = subsetVariableHeader.concat("\n"); - subsetStreamIO.setVarHeader(subsetVariableHeader); + + if (subsetVariableHeader != null) { + subsetVariableHeader = subsetVariableHeader.concat("\n"); + subsetStreamIO.setVarHeader(subsetVariableHeader); + } String tabularFileName = storageIO.getFileName(); @@ -380,8 +417,13 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] } else { logger.fine("empty list of extra arguments."); } + // end of tab. 
data subset case + } else if (dataFile.getDataTable().isStoredWithVariableHeader()) { + logger.fine("tabular file stored with the var header included, no need to generate it on the fly"); + storageIO.setNoVarHeader(Boolean.TRUE); + storageIO.setVarHeader(null); } - } + } // end of tab. data file case if (storageIO == null) { //throw new WebApplicationException(Response.Status.SERVICE_UNAVAILABLE); diff --git a/src/main/java/edu/harvard/iq/dataverse/api/TestIngest.java b/src/main/java/edu/harvard/iq/dataverse/api/TestIngest.java index 05ba150df8e..add43ea2091 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/TestIngest.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/TestIngest.java @@ -100,7 +100,7 @@ public String datafile(@QueryParam("fileName") String fileName, @QueryParam("fil TabularDataIngest tabDataIngest = null; try { - tabDataIngest = ingestPlugin.read(fileInputStream, null); + tabDataIngest = ingestPlugin.read(fileInputStream, false, null); } catch (IOException ingestEx) { output = output.concat("Caught an exception trying to ingest file " + fileName + ": " + ingestEx.getLocalizedMessage()); return output; diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIO.java index f2a1312a150..26637ec5742 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIO.java @@ -120,7 +120,8 @@ public void open (DataAccessOption... options) throws IOException { && dataFile.getContentType().equals("text/tab-separated-values") && dataFile.isTabularData() && dataFile.getDataTable() != null - && (!this.noVarHeader())) { + && (!this.noVarHeader()) + && (!dataFile.getDataTable().isStoredWithVariableHeader())) { List datavariables = dataFile.getDataTable().getDataVariables(); String varHeaderLine = generateVariableHeader(datavariables); diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/GlobusOverlayAccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/GlobusOverlayAccessIO.java index 7a6809cb2ff..733daaf1328 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/GlobusOverlayAccessIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/GlobusOverlayAccessIO.java @@ -450,8 +450,12 @@ public void open(DataAccessOption... options) throws IOException { this.setSize(retrieveSizeFromMedia()); } // Only applies for the S3 Connector case (where we could have run an ingest) - if (dataFile.getContentType() != null && dataFile.getContentType().equals("text/tab-separated-values") - && dataFile.isTabularData() && dataFile.getDataTable() != null && (!this.noVarHeader())) { + if (dataFile.getContentType() != null + && dataFile.getContentType().equals("text/tab-separated-values") + && dataFile.isTabularData() + && dataFile.getDataTable() != null + && (!this.noVarHeader()) + && (!dataFile.getDataTable().isStoredWithVariableHeader())) { List datavariables = dataFile.getDataTable().getDataVariables(); String varHeaderLine = generateVariableHeader(datavariables); diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/RemoteOverlayAccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/RemoteOverlayAccessIO.java index 1616bfabf96..bca70259cb7 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/RemoteOverlayAccessIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/RemoteOverlayAccessIO.java @@ -124,8 +124,12 @@ public void open(DataAccessOption... 
options) throws IOException { logger.fine("Setting size"); this.setSize(retrieveSizeFromMedia()); } - if (dataFile.getContentType() != null && dataFile.getContentType().equals("text/tab-separated-values") - && dataFile.isTabularData() && dataFile.getDataTable() != null && (!this.noVarHeader())) { + if (dataFile.getContentType() != null + && dataFile.getContentType().equals("text/tab-separated-values") + && dataFile.isTabularData() + && dataFile.getDataTable() != null + && (!this.noVarHeader()) + && (!dataFile.getDataTable().isStoredWithVariableHeader())) { List datavariables = dataFile.getDataTable().getDataVariables(); String varHeaderLine = generateVariableHeader(datavariables); diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java index 8afc365417e..c2143bd4789 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java @@ -225,7 +225,8 @@ public void open(DataAccessOption... options) throws IOException { && dataFile.getContentType().equals("text/tab-separated-values") && dataFile.isTabularData() && dataFile.getDataTable() != null - && (!this.noVarHeader())) { + && (!this.noVarHeader()) + && (!dataFile.getDataTable().isStoredWithVariableHeader())) { List datavariables = dataFile.getDataTable().getDataVariables(); String varHeaderLine = generateVariableHeader(datavariables); diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/SwiftAccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/SwiftAccessIO.java index 105a60ab418..717f46ffd60 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/SwiftAccessIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/SwiftAccessIO.java @@ -142,7 +142,8 @@ public void open(DataAccessOption... 
options) throws IOException { && dataFile.getContentType().equals("text/tab-separated-values") && dataFile.isTabularData() && dataFile.getDataTable() != null - && (!this.noVarHeader())) { + && (!this.noVarHeader()) + && (!dataFile.getDataTable().isStoredWithVariableHeader())) { List datavariables = dataFile.getDataTable().getDataVariables(); String varHeaderLine = generateVariableHeader(datavariables); diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/TabularSubsetGenerator.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/TabularSubsetGenerator.java index 782f7f3a52d..c369010c8cd 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/TabularSubsetGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/TabularSubsetGenerator.java @@ -60,305 +60,26 @@ public class TabularSubsetGenerator implements SubsetGenerator { - private static Logger dbgLog = Logger.getLogger(TabularSubsetGenerator.class.getPackage().getName()); + private static Logger logger = Logger.getLogger(TabularSubsetGenerator.class.getPackage().getName()); - private static int COLUMN_TYPE_STRING = 1; - private static int COLUMN_TYPE_LONG = 2; - private static int COLUMN_TYPE_DOUBLE = 3; - private static int COLUMN_TYPE_FLOAT = 4; - - private static int MAX_COLUMN_BUFFER = 8192; - - private FileChannel fileChannel = null; - - private int varcount; - private int casecount; - private int subsetcount; - - private byte[][] columnEntries = null; - - - private ByteBuffer[] columnByteBuffers; - private int[] columnBufferSizes; - private int[] columnBufferOffsets; - - private long[] columnStartOffsets; - private long[] columnTotalOffsets; - private long[] columnTotalLengths; - - public TabularSubsetGenerator() { - - } - - public TabularSubsetGenerator (DataFile datafile, List variables) throws IOException { - if (!datafile.isTabularData()) { - throw new IOException("DataFile is not tabular data."); - } - - setVarCount(datafile.getDataTable().getVarQuantity().intValue()); - setCaseCount(datafile.getDataTable().getCaseQuantity().intValue()); - - - - StorageIO dataAccess = datafile.getStorageIO(); - if (!dataAccess.isLocalFile()) { - throw new IOException("Subsetting is supported on local files only!"); - } - - //File tabfile = datafile.getFileSystemLocation().toFile(); - File tabfile = dataAccess.getFileSystemPath().toFile(); + //private static int MAX_COLUMN_BUFFER = 8192; - File rotatedImageFile = getRotatedImage(tabfile, getVarCount(), getCaseCount()); - long[] columnEndOffsets = extractColumnOffsets(rotatedImageFile, getVarCount(), getCaseCount()); - - fileChannel = (FileChannel.open(Paths.get(rotatedImageFile.getAbsolutePath()), StandardOpenOption.READ)); - - if (variables == null || variables.size() < 1 || variables.size() > getVarCount()) { - throw new IOException("Illegal number of variables in the subset request"); - } - - subsetcount = variables.size(); - columnTotalOffsets = new long[subsetcount]; - columnTotalLengths = new long[subsetcount]; - columnByteBuffers = new ByteBuffer[subsetcount]; - - + public TabularSubsetGenerator() { - if (subsetcount == 1) { - if (!datafile.getDataTable().getId().equals(variables.get(0).getDataTable().getId())) { - throw new IOException("Variable in the subset request does not belong to the datafile."); - } - dbgLog.fine("single variable subset; setting fileChannel position to "+extractColumnOffset(columnEndOffsets, variables.get(0).getFileOrder())); - fileChannel.position(extractColumnOffset(columnEndOffsets, variables.get(0).getFileOrder())); - 
columnTotalLengths[0] = extractColumnLength(columnEndOffsets, variables.get(0).getFileOrder());
-            columnTotalOffsets[0] = 0;
-        } else {
-            columnEntries = new byte[subsetcount][];
-
-            columnBufferSizes = new int[subsetcount];
-            columnBufferOffsets = new int[subsetcount];
-            columnStartOffsets = new long[subsetcount];
-
-            int i = 0;
-            for (DataVariable var : variables) {
-                if (!datafile.getDataTable().getId().equals(var.getDataTable().getId())) {
-                    throw new IOException("Variable in the subset request does not belong to the datafile.");
-                }
-                columnByteBuffers[i] = ByteBuffer.allocate(MAX_COLUMN_BUFFER);
-                columnTotalLengths[i] = extractColumnLength(columnEndOffsets, var.getFileOrder());
-                columnStartOffsets[i] = extractColumnOffset(columnEndOffsets, var.getFileOrder());
-                if (columnTotalLengths[i] < MAX_COLUMN_BUFFER) {
-                    columnByteBuffers[i].limit((int)columnTotalLengths[i]);
-                }
-                fileChannel.position(columnStartOffsets[i]);
-                columnBufferSizes[i] = fileChannel.read(columnByteBuffers[i]);
-                columnBufferOffsets[i] = 0;
-                columnTotalOffsets[i] = columnBufferSizes[i];
-                i++;
-            }
-        }
-    }
-
-    private int getVarCount() {
-        return varcount;
-    }
-
-    private void setVarCount(int varcount) {
-        this.varcount = varcount;
-    }
-
-    private int getCaseCount() {
-        return casecount;
-    }
-
-    private void setCaseCount(int casecount) {
-        this.casecount = casecount;
-    }
-
-
-    /*
-     * Note that this method operates on the *absolute* column number, i.e.
-     * the number of the physical column in the tabular file. This is stored
-     * in DataVariable.FileOrder.
-     * This "column number" should not be confused with the number of column
-     * in the subset request; a user can request any number of variable
-     * columns, in an order that doesn't have to follow the physical order
-     * of the columns in the file.
-     */
-    private long extractColumnOffset(long[] columnEndOffsets, int column) throws IOException {
-        if (columnEndOffsets == null || columnEndOffsets.length <= column) {
-            throw new IOException("Offsets table not initialized; or column out of bounds.");
-        }
-        long columnOffset;
-
-        if (column > 0) {
-            columnOffset = columnEndOffsets[column - 1];
-        } else {
-            columnOffset = getVarCount() * 8;
-        }
-        return columnOffset;
-    }
-
-    /*
-     * See the comment for the method above.
+    /**
+     * This class used to be much more complex. There were methods for subsetting
+     * from fixed-width field files, including methods that used optimized, "90 deg.
+     * rotated" versions of such files (i.e., you create a *column-wise* copy of your
+     * data file in which the columns are stored sequentially, plus a table of byte
+     * offsets of each column; individual variable columns can then be read cheaply,
+     * at the expense of doubling the storage size of your tabular data files).
+     * These methods were not used, so they were deleted (in Jan. 2024, prior to 6.2).
+     * Please consult git history if you are interested in looking at that code.
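+     * What remains are the simple stream-based methods: subsetFile(), plus the
+     * static subset*Vector helpers, which scan the tab-delimited file line by
+     * line and now accept a skipHeader flag telling them to skip the variable
+     * name header line when the file is stored with one.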
*/ - private long extractColumnLength(long[] columnEndOffsets, int column) throws IOException { - if (columnEndOffsets == null || columnEndOffsets.length <= column) { - throw new IOException("Offsets table not initialized; or column out of bounds."); - } - long columnLength; - - if (column > 0) { - columnLength = columnEndOffsets[column] - columnEndOffsets[column - 1]; - } else { - columnLength = columnEndOffsets[0] - varcount * 8; - } - - return columnLength; - } - - - private void bufferMoreColumnBytes(int column) throws IOException { - if (columnTotalOffsets[column] >= columnTotalLengths[column]) { - throw new IOException("attempt to buffer bytes past the column boundary"); - } - fileChannel.position(columnStartOffsets[column] + columnTotalOffsets[column]); - - columnByteBuffers[column].clear(); - if (columnTotalLengths[column] < columnTotalOffsets[column] + MAX_COLUMN_BUFFER) { - dbgLog.fine("Limiting the buffer to "+(columnTotalLengths[column] - columnTotalOffsets[column])+" bytes"); - columnByteBuffers[column].limit((int) (columnTotalLengths[column] - columnTotalOffsets[column])); - } - columnBufferSizes[column] = fileChannel.read(columnByteBuffers[column]); - dbgLog.fine("Read "+columnBufferSizes[column]+" bytes for subset column "+column); - columnBufferOffsets[column] = 0; - columnTotalOffsets[column] += columnBufferSizes[column]; - } - - public byte[] readColumnEntryBytes(int column) { - return readColumnEntryBytes(column, true); - } - - - public byte[] readColumnEntryBytes(int column, boolean addTabs) { - byte[] leftover = null; - byte[] ret = null; - - if (columnBufferOffsets[column] >= columnBufferSizes[column]) { - try { - bufferMoreColumnBytes(column); - if (columnBufferSizes[column] < 1) { - return null; - } - } catch (IOException ioe) { - return null; - } - } - - int byteindex = columnBufferOffsets[column]; - try { - while (columnByteBuffers[column].array()[byteindex] != '\n') { - byteindex++; - if (byteindex == columnBufferSizes[column]) { - // save the leftover: - if (leftover == null) { - leftover = new byte[columnBufferSizes[column] - columnBufferOffsets[column]]; - System.arraycopy(columnByteBuffers[column].array(), columnBufferOffsets[column], leftover, 0, columnBufferSizes[column] - columnBufferOffsets[column]); - } else { - byte[] merged = new byte[leftover.length + columnBufferSizes[column]]; - - System.arraycopy(leftover, 0, merged, 0, leftover.length); - System.arraycopy(columnByteBuffers[column].array(), 0, merged, leftover.length, columnBufferSizes[column]); - leftover = merged; - merged = null; - } - // read more bytes: - bufferMoreColumnBytes(column); - if (columnBufferSizes[column] < 1) { - return null; - } - byteindex = 0; - } - } - - // presumably, we have found our '\n': - if (leftover == null) { - ret = new byte[byteindex - columnBufferOffsets[column] + 1]; - System.arraycopy(columnByteBuffers[column].array(), columnBufferOffsets[column], ret, 0, byteindex - columnBufferOffsets[column] + 1); - } else { - ret = new byte[leftover.length + byteindex + 1]; - System.arraycopy(leftover, 0, ret, 0, leftover.length); - System.arraycopy(columnByteBuffers[column].array(), 0, ret, leftover.length, byteindex + 1); - } - - } catch (IOException ioe) { - return null; - } - - columnBufferOffsets[column] = (byteindex + 1); - - if (column < columnBufferOffsets.length - 1) { - ret[ret.length - 1] = '\t'; - } - return ret; - } - - public int readSingleColumnSubset(byte[] buffer) throws IOException { - if (columnTotalOffsets[0] == columnTotalLengths[0]) { - return -1; - } - 
- if (columnByteBuffers[0] == null) { - dbgLog.fine("allocating single column subset buffer."); - columnByteBuffers[0] = ByteBuffer.allocate(buffer.length); - } - - int bytesread = fileChannel.read(columnByteBuffers[0]); - dbgLog.fine("single column subset: read "+bytesread+" bytes."); - if (columnTotalOffsets[0] + bytesread > columnTotalLengths[0]) { - bytesread = (int)(columnTotalLengths[0] - columnTotalOffsets[0]); - } - System.arraycopy(columnByteBuffers[0].array(), 0, buffer, 0, bytesread); - - columnTotalOffsets[0] += bytesread; - columnByteBuffers[0].clear(); - return bytesread > 0 ? bytesread : -1; - } - - - public byte[] readSubsetLineBytes() throws IOException { - byte[] ret = null; - int total = 0; - for (int i = 0; i < subsetcount; i++) { - columnEntries[i] = readColumnEntryBytes(i); - if (columnEntries[i] == null) { - throw new IOException("Failed to read subset line entry"); - } - total += columnEntries[i].length; - } - - ret = new byte[total]; - int offset = 0; - for (int i = 0; i < subsetcount; i++) { - System.arraycopy(columnEntries[i], 0, ret, offset, columnEntries[i].length); - offset += columnEntries[i].length; - } - dbgLog.fine("line: "+new String(ret)); - return ret; - } - - - public void close() { - if (fileChannel != null) { - try { - fileChannel.close(); - } catch (IOException ioe) { - // don't care. - } - } - } - public void subsetFile(String infile, String outfile, List columns, Long numCases) { subsetFile(infile, outfile, columns, numCases, "\t"); } @@ -411,11 +132,15 @@ public void subsetFile(InputStream in, String outfile, List columns, Lo * files, OK to use on small files: */ - public static Double[] subsetDoubleVector(InputStream in, int column, int numCases) { + public static Double[] subsetDoubleVector(InputStream in, int column, int numCases, boolean skipHeader) { Double[] retVector = new Double[numCases]; try (Scanner scanner = new Scanner(in)) { scanner.useDelimiter("\\n"); + if (skipHeader) { + skipFirstLine(scanner); + } + for (int caseIndex = 0; caseIndex < numCases; caseIndex++) { if (scanner.hasNext()) { String[] line = (scanner.next()).split("\t", -1); @@ -463,11 +188,15 @@ public static Double[] subsetDoubleVector(InputStream in, int column, int numCas * Same deal as with the method above - straightforward, but (potentially) slow. * Not a resource hog though - will only try to store one vector in memory. */ - public static Float[] subsetFloatVector(InputStream in, int column, int numCases) { + public static Float[] subsetFloatVector(InputStream in, int column, int numCases, boolean skipHeader) { Float[] retVector = new Float[numCases]; try (Scanner scanner = new Scanner(in)) { scanner.useDelimiter("\\n"); + if (skipHeader) { + skipFirstLine(scanner); + } + for (int caseIndex = 0; caseIndex < numCases; caseIndex++) { if (scanner.hasNext()) { String[] line = (scanner.next()).split("\t", -1); @@ -513,11 +242,15 @@ public static Float[] subsetFloatVector(InputStream in, int column, int numCases * Same deal as with the method above - straightforward, but (potentially) slow. * Not a resource hog though - will only try to store one vector in memory. 
*/ - public static Long[] subsetLongVector(InputStream in, int column, int numCases) { + public static Long[] subsetLongVector(InputStream in, int column, int numCases, boolean skipHeader) { Long[] retVector = new Long[numCases]; try (Scanner scanner = new Scanner(in)) { scanner.useDelimiter("\\n"); + if (skipHeader) { + skipFirstLine(scanner); + } + for (int caseIndex = 0; caseIndex < numCases; caseIndex++) { if (scanner.hasNext()) { String[] line = (scanner.next()).split("\t", -1); @@ -549,11 +282,15 @@ public static Long[] subsetLongVector(InputStream in, int column, int numCases) * Same deal as with the method above - straightforward, but (potentially) slow. * Not a resource hog though - will only try to store one vector in memory. */ - public static String[] subsetStringVector(InputStream in, int column, int numCases) { + public static String[] subsetStringVector(InputStream in, int column, int numCases, boolean skipHeader) { String[] retVector = new String[numCases]; try (Scanner scanner = new Scanner(in)) { scanner.useDelimiter("\\n"); + if (skipHeader) { + skipFirstLine(scanner); + } + for (int caseIndex = 0; caseIndex < numCases; caseIndex++) { if (scanner.hasNext()) { String[] line = (scanner.next()).split("\t", -1); @@ -621,819 +358,10 @@ public static String[] subsetStringVector(InputStream in, int column, int numCas } - /* - * Straightforward method for subsetting a tab-delimited data file, extracting - * all the columns representing continuous variables and returning them as - * a 2-dimensional array of Doubles; - * Inefficient on large files, OK to use on small ones. - */ - public static Double[][] subsetDoubleVectors(InputStream in, Set columns, int numCases) throws IOException { - Double[][] retVector = new Double[columns.size()][numCases]; - try (Scanner scanner = new Scanner(in)) { - scanner.useDelimiter("\\n"); - - for (int caseIndex = 0; caseIndex < numCases; caseIndex++) { - if (scanner.hasNext()) { - String[] line = (scanner.next()).split("\t", -1); - int j = 0; - for (Integer i : columns) { - try { - // TODO: verify that NaN and +-Inf are going to be - // handled correctly here! -- L.A. - // NO, "+-Inf" is not handled correctly; see the - // comment further down below. - retVector[j][caseIndex] = new Double(line[i]); - } catch (NumberFormatException ex) { - retVector[j][caseIndex] = null; // missing value - } - j++; - } - } else { - throw new IOException("Tab file has fewer rows than the stored number of cases!"); - } - } - - int tailIndex = numCases; - while (scanner.hasNext()) { - String nextLine = scanner.next(); - if (!"".equals(nextLine)) { - throw new IOException("Tab file has more nonempty rows than the stored number of cases ("+numCases+")! current index: "+tailIndex+", line: "+nextLine); - } - tailIndex++; - } - - } - return retVector; - - } - - public String[] subsetStringVector(DataFile datafile, int column) throws IOException { - return (String[])subsetObjectVector(datafile, column, COLUMN_TYPE_STRING); - } - - public Double[] subsetDoubleVector(DataFile datafile, int column) throws IOException { - return (Double[])subsetObjectVector(datafile, column, COLUMN_TYPE_DOUBLE); - } - - public Long[] subsetLongVector(DataFile datafile, int column) throws IOException { - return (Long[])subsetObjectVector(datafile, column, COLUMN_TYPE_LONG); - } - - // Float methods are temporary; - // In normal operations we'll be treating all the floating point types as - // doubles. I need to be able to handle floats for some 4.0 vs 3.* ingest - // tests. -- L.A. 
- - public Float[] subsetFloatVector(DataFile datafile, int column) throws IOException { - return (Float[])subsetObjectVector(datafile, column, COLUMN_TYPE_FLOAT); - } - - public String[] subsetStringVector(File tabfile, int column, int varcount, int casecount) throws IOException { - return (String[])subsetObjectVector(tabfile, column, varcount, casecount, COLUMN_TYPE_STRING); - } - - public Double[] subsetDoubleVector(File tabfile, int column, int varcount, int casecount) throws IOException { - return (Double[])subsetObjectVector(tabfile, column, varcount, casecount, COLUMN_TYPE_DOUBLE); - } - - public Long[] subsetLongVector(File tabfile, int column, int varcount, int casecount) throws IOException { - return (Long[])subsetObjectVector(tabfile, column, varcount, casecount, COLUMN_TYPE_LONG); - } - - public Float[] subsetFloatVector(File tabfile, int column, int varcount, int casecount) throws IOException { - return (Float[])subsetObjectVector(tabfile, column, varcount, casecount, COLUMN_TYPE_FLOAT); - } - - public Object[] subsetObjectVector(DataFile dataFile, int column, int columntype) throws IOException { - if (!dataFile.isTabularData()) { - throw new IOException("DataFile is not tabular data."); - } - - int varcount = dataFile.getDataTable().getVarQuantity().intValue(); - int casecount = dataFile.getDataTable().getCaseQuantity().intValue(); - - if (column >= varcount) { - throw new IOException("Column "+column+" is out of bounds."); - } - - StorageIO dataAccess = dataFile.getStorageIO(); - if (!dataAccess.isLocalFile()) { - throw new IOException("Subsetting is supported on local files only!"); - } - - //File tabfile = datafile.getFileSystemLocation().toFile(); - File tabfile = dataAccess.getFileSystemPath().toFile(); - - if (columntype == COLUMN_TYPE_STRING) { - String filename = dataFile.getFileMetadata().getLabel(); - if (filename != null) { - filename = filename.replaceFirst("^_", ""); - Integer fnumvalue = null; - try { - fnumvalue = new Integer(filename); - } catch (Exception ex){ - fnumvalue = null; - } - if (fnumvalue != null) { - //if ((fnumvalue.intValue() < 112497)) { // && (fnumvalue.intValue() > 60015)) { - if ((fnumvalue.intValue() < 111931)) { // && (fnumvalue.intValue() > 60015)) { - if (!(fnumvalue.intValue() == 60007 - || fnumvalue.intValue() == 59997 - || fnumvalue.intValue() == 60015 - || fnumvalue.intValue() == 59948 - || fnumvalue.intValue() == 60012 - || fnumvalue.intValue() == 52585 - || fnumvalue.intValue() == 60005 - || fnumvalue.intValue() == 60002 - || fnumvalue.intValue() == 59954 - || fnumvalue.intValue() == 60008 - || fnumvalue.intValue() == 54972 - || fnumvalue.intValue() == 55010 - || fnumvalue.intValue() == 54996 - || fnumvalue.intValue() == 53527 - || fnumvalue.intValue() == 53546 - || fnumvalue.intValue() == 55002 - || fnumvalue.intValue() == 55006 - || fnumvalue.intValue() == 54998 - || fnumvalue.intValue() == 52552 - // SPSS/SAV cases with similar issue - compat mode must be disabled - //|| fnumvalue.intValue() == 101826 // temporary - tricky file with accents and v. 16... - || fnumvalue.intValue() == 54618 // another SAV file, with long strings... - || fnumvalue.intValue() == 54619 // [same] - || fnumvalue.intValue() == 57983 - || fnumvalue.intValue() == 58262 - || fnumvalue.intValue() == 58288 - || fnumvalue.intValue() == 58656 - || fnumvalue.intValue() == 59144 - // || fnumvalue.intValue() == 69626 [nope!] 
- )) { - dbgLog.info("\"Old\" file name detected; using \"compatibility mode\" for a character vector subset;"); - return subsetObjectVector(tabfile, column, varcount, casecount, columntype, true); - } - } - } - } + private static void skipFirstLine(Scanner scanner) { + if (!scanner.hasNext()) { + throw new RuntimeException("Failed to read the variable name header line from the tab-delimited file!"); } - - return subsetObjectVector(tabfile, column, varcount, casecount, columntype); - } - - public Object[] subsetObjectVector(File tabfile, int column, int varcount, int casecount, int columntype) throws IOException { - return subsetObjectVector(tabfile, column, varcount, casecount, columntype, false); - } - - - - public Object[] subsetObjectVector(File tabfile, int column, int varcount, int casecount, int columntype, boolean compatmode) throws IOException { - - Object[] retVector = null; - - boolean isString = false; - boolean isDouble = false; - boolean isLong = false; - boolean isFloat = false; - - //Locale loc = new Locale("en", "US"); - - if (columntype == COLUMN_TYPE_STRING) { - isString = true; - retVector = new String[casecount]; - } else if (columntype == COLUMN_TYPE_DOUBLE) { - isDouble = true; - retVector = new Double[casecount]; - } else if (columntype == COLUMN_TYPE_LONG) { - isLong = true; - retVector = new Long[casecount]; - } else if (columntype == COLUMN_TYPE_FLOAT){ - isFloat = true; - retVector = new Float[casecount]; - } else { - throw new IOException("Unsupported column type: "+columntype); - } - - File rotatedImageFile = getRotatedImage(tabfile, varcount, casecount); - long[] columnEndOffsets = extractColumnOffsets(rotatedImageFile, varcount, casecount); - long columnOffset = 0; - long columnLength = 0; - - if (column > 0) { - columnOffset = columnEndOffsets[column - 1]; - columnLength = columnEndOffsets[column] - columnEndOffsets[column - 1]; - } else { - columnOffset = varcount * 8; - columnLength = columnEndOffsets[0] - varcount * 8; - } - int caseindex = 0; - - try (FileChannel fc = (FileChannel.open(Paths.get(rotatedImageFile.getAbsolutePath()), - StandardOpenOption.READ))) { - fc.position(columnOffset); - int MAX_COLUMN_BUFFER = 8192; - - ByteBuffer in = ByteBuffer.allocate(MAX_COLUMN_BUFFER); - - if (columnLength < MAX_COLUMN_BUFFER) { - in.limit((int) (columnLength)); - } - - long bytesRead = 0; - long bytesReadTotal = 0; - - int byteoffset = 0; - byte[] leftover = null; - - while (bytesReadTotal < columnLength) { - bytesRead = fc.read(in); - byte[] columnBytes = in.array(); - int bytecount = 0; - - while (bytecount < bytesRead) { - if (columnBytes[bytecount] == '\n') { - /* - String token = new String(columnBytes, byteoffset, bytecount-byteoffset, "UTF8"); - - if (leftover != null) { - String leftoverString = new String (leftover, "UTF8"); - token = leftoverString + token; - leftover = null; - } - */ - /* - * Note that the way I was doing it at first - above - - * was not quite the correct way - because I was creating UTF8 - * strings from the leftover bytes, and the bytes in the - * current buffer *separately*; which means, if a multi-byte - * UTF8 character got split in the middle between one buffer - * and the next, both chunks of it would become junk - * characters, on each side! - * The correct way of doing it, of course, is to create a - * merged byte buffer, and then turn it into a UTF8 string. - * -- L.A. 
4.0 - */ - String token = null; - - if (leftover == null) { - token = new String(columnBytes, byteoffset, bytecount - byteoffset, "UTF8"); - } else { - byte[] merged = new byte[leftover.length + bytecount - byteoffset]; - - System.arraycopy(leftover, 0, merged, 0, leftover.length); - System.arraycopy(columnBytes, byteoffset, merged, leftover.length, bytecount - byteoffset); - token = new String(merged, "UTF8"); - leftover = null; - merged = null; - } - - if (isString) { - if ("".equals(token)) { - // An empty string is a string missing value! - // An empty string in quotes is an empty string! - retVector[caseindex] = null; - } else { - // Strip the outer quotes: - token = token.replaceFirst("^\\\"", ""); - token = token.replaceFirst("\\\"$", ""); - - // We need to restore the special characters that - // are stored in tab files escaped - quotes, new lines - // and tabs. Before we do that however, we need to - // take care of any escaped backslashes stored in - // the tab file. I.e., "foo\t" should be transformed - // to "foo"; but "foo\\t" should be transformed - // to "foo\t". This way new lines and tabs that were - // already escaped in the original data are not - // going to be transformed to unescaped tab and - // new line characters! - - String[] splitTokens = token.split(Matcher.quoteReplacement("\\\\"), -2); - - // (note that it's important to use the 2-argument version - // of String.split(), and set the limit argument to a - // negative value; otherwise any trailing backslashes - // are lost.) - - for (int i = 0; i < splitTokens.length; i++) { - splitTokens[i] = splitTokens[i].replaceAll(Matcher.quoteReplacement("\\\""), "\""); - splitTokens[i] = splitTokens[i].replaceAll(Matcher.quoteReplacement("\\t"), "\t"); - splitTokens[i] = splitTokens[i].replaceAll(Matcher.quoteReplacement("\\n"), "\n"); - splitTokens[i] = splitTokens[i].replaceAll(Matcher.quoteReplacement("\\r"), "\r"); - } - // TODO: - // Make (some of?) the above optional; for ex., we - // do need to restore the newlines when calculating UNFs; - // But if we are subsetting these vectors in order to - // create a new tab-delimited file, they will - // actually break things! -- L.A. Jul. 28 2014 - - token = StringUtils.join(splitTokens, '\\'); - - // "compatibility mode" - a hack, to be able to produce - // unfs identical to those produced by the "early" - // unf5 jar; will be removed in production 4.0. - // -- L.A. (TODO: ...) - if (compatmode && !"".equals(token)) { - if (token.length() > 128) { - if ("".equals(token.trim())) { - // don't ask... - token = token.substring(0, 129); - } else { - token = token.substring(0, 128); - // token = String.format(loc, "%.128s", token); - token = token.trim(); - // dbgLog.info("formatted and trimmed: "+token); - } - } else { - if ("".equals(token.trim())) { - // again, don't ask; - // - this replicates some bugginness - // that happens inside unf5; - token = "null"; - } else { - token = token.trim(); - } - } - } - - retVector[caseindex] = token; - } - } else if (isDouble) { - try { - // TODO: verify that NaN and +-Inf are - // handled correctly here! -- L.A. - // Verified: new Double("nan") works correctly, - // resulting in Double.NaN; - // Double("[+-]Inf") doesn't work however; - // (the constructor appears to be expecting it - // to be spelled as "Infinity", "-Infinity", etc. 
- if ("inf".equalsIgnoreCase(token) || "+inf".equalsIgnoreCase(token)) { - retVector[caseindex] = java.lang.Double.POSITIVE_INFINITY; - } else if ("-inf".equalsIgnoreCase(token)) { - retVector[caseindex] = java.lang.Double.NEGATIVE_INFINITY; - } else if (token == null || token.equals("")) { - // missing value: - retVector[caseindex] = null; - } else { - retVector[caseindex] = new Double(token); - } - } catch (NumberFormatException ex) { - dbgLog.warning("NumberFormatException thrown for " + token + " as Double"); - - retVector[caseindex] = null; // missing value - // TODO: ? - } - } else if (isLong) { - try { - retVector[caseindex] = new Long(token); - } catch (NumberFormatException ex) { - retVector[caseindex] = null; // assume missing value - } - } else if (isFloat) { - try { - if ("inf".equalsIgnoreCase(token) || "+inf".equalsIgnoreCase(token)) { - retVector[caseindex] = java.lang.Float.POSITIVE_INFINITY; - } else if ("-inf".equalsIgnoreCase(token)) { - retVector[caseindex] = java.lang.Float.NEGATIVE_INFINITY; - } else if (token == null || token.equals("")) { - // missing value: - retVector[caseindex] = null; - } else { - retVector[caseindex] = new Float(token); - } - } catch (NumberFormatException ex) { - dbgLog.warning("NumberFormatException thrown for " + token + " as Float"); - retVector[caseindex] = null; // assume missing value (TODO: ?) - } - } - caseindex++; - - if (bytecount == bytesRead - 1) { - byteoffset = 0; - } else { - byteoffset = bytecount + 1; - } - } else { - if (bytecount == bytesRead - 1) { - // We've reached the end of the buffer; - // This means we'll save whatever unused bytes left in - // it - i.e., the bytes between the last new line - // encountered and the end - in the leftover buffer. - - // *EXCEPT*, there may be a case of a very long String - // that is actually longer than MAX_COLUMN_BUFFER, in - // which case it is possible that we've read through - // an entire buffer of bytes without finding any - // new lines... in this case we may need to add this - // entire byte buffer to an already existing leftover - // buffer! 
- if (leftover == null) { - leftover = new byte[(int) bytesRead - byteoffset]; - System.arraycopy(columnBytes, byteoffset, leftover, 0, (int) bytesRead - byteoffset); - } else { - if (byteoffset != 0) { - throw new IOException("Reached the end of the byte buffer, with some leftover left from the last read; yet the offset is not zero!"); - } - byte[] merged = new byte[leftover.length + (int) bytesRead]; - - System.arraycopy(leftover, 0, merged, 0, leftover.length); - System.arraycopy(columnBytes, byteoffset, merged, leftover.length, (int) bytesRead); - // leftover = null; - leftover = merged; - merged = null; - } - byteoffset = 0; - - } - } - bytecount++; - } - - bytesReadTotal += bytesRead; - in.clear(); - if (columnLength - bytesReadTotal < MAX_COLUMN_BUFFER) { - in.limit((int) (columnLength - bytesReadTotal)); - } - } - - } - - if (caseindex != casecount) { - throw new IOException("Faile to read "+casecount+" tokens for column "+column); - //System.out.println("read "+caseindex+" tokens instead of expected "+casecount+"."); - } - - return retVector; - } - - private long[] extractColumnOffsets (File rotatedImageFile, int varcount, int casecount) throws IOException { - long[] byteOffsets = new long[varcount]; - - try (BufferedInputStream rotfileStream = new BufferedInputStream(new FileInputStream(rotatedImageFile))) { - - byte[] offsetHeader = new byte[varcount * 8]; - - int readlen = rotfileStream.read(offsetHeader); - - if (readlen != varcount * 8) { - throw new IOException("Could not read " + varcount * 8 + " header bytes from the rotated file."); - } - - for (int varindex = 0; varindex < varcount; varindex++) { - byte[] offsetBytes = new byte[8]; - System.arraycopy(offsetHeader, varindex * 8, offsetBytes, 0, 8); - - ByteBuffer offsetByteBuffer = ByteBuffer.wrap(offsetBytes); - byteOffsets[varindex] = offsetByteBuffer.getLong(); - - // System.out.println(byteOffsets[varindex]); - } - - } - - return byteOffsets; - } - - private File getRotatedImage(File tabfile, int varcount, int casecount) throws IOException { - String fileName = tabfile.getAbsolutePath(); - String rotatedImageFileName = fileName + ".90d"; - File rotatedImageFile = new File(rotatedImageFileName); - if (rotatedImageFile.exists()) { - //System.out.println("Image already exists!"); - return rotatedImageFile; - } - - return generateRotatedImage(tabfile, varcount, casecount); - - } - - private File generateRotatedImage (File tabfile, int varcount, int casecount) throws IOException { - // TODO: throw exceptions if bad file, zero varcount, etc. ... - - String fileName = tabfile.getAbsolutePath(); - String rotatedImageFileName = fileName + ".90d"; - - int MAX_OUTPUT_STREAMS = 32; - int MAX_BUFFERED_BYTES = 10 * 1024 * 1024; // 10 MB - for now? - int MAX_COLUMN_BUFFER = 8 * 1024; - - // offsetHeader will contain the byte offsets of the individual column - // vectors in the final rotated image file - byte[] offsetHeader = new byte[varcount * 8]; - int[] bufferedSizes = new int[varcount]; - long[] cachedfileSizes = new long[varcount]; - File[] columnTempFiles = new File[varcount]; - - for (int i = 0; i < varcount; i++) { - bufferedSizes[i] = 0; - cachedfileSizes[i] = 0; - } - - // TODO: adjust MAX_COLUMN_BUFFER here, so that the total size is - // no more than MAX_BUFFERED_BYTES (but no less than 1024 maybe?) 
- - byte[][] bufferedColumns = new byte [varcount][MAX_COLUMN_BUFFER]; - - // read the tab-delimited file: - - try (FileInputStream tabfileStream = new FileInputStream(tabfile); - Scanner scanner = new Scanner(tabfileStream)) { - scanner.useDelimiter("\\n"); - - for (int caseindex = 0; caseindex < casecount; caseindex++) { - if (scanner.hasNext()) { - String[] line = (scanner.next()).split("\t", -1); - // TODO: throw an exception if there are fewer tab-delimited - // tokens than the number of variables specified. - String token = ""; - int tokensize = 0; - for (int varindex = 0; varindex < varcount; varindex++) { - // TODO: figure out the safest way to convert strings to - // bytes here. Is it going to be safer to use getBytes("UTF8")? - // we are already making the assumption that the values - // in the tab file are in UTF8. -- L.A. - token = line[varindex] + "\n"; - tokensize = token.getBytes().length; - if (bufferedSizes[varindex] + tokensize > MAX_COLUMN_BUFFER) { - // fill the buffer and dump its contents into the temp file: - // (do note that there may be *several* MAX_COLUMN_BUFFERs - // worth of bytes in the token!) - - int tokenoffset = 0; - - if (bufferedSizes[varindex] != MAX_COLUMN_BUFFER) { - tokenoffset = MAX_COLUMN_BUFFER - bufferedSizes[varindex]; - System.arraycopy(token.getBytes(), 0, bufferedColumns[varindex], bufferedSizes[varindex], tokenoffset); - } // (otherwise the buffer is already full, and we should - // simply dump it into the temp file, without adding any - // extra bytes to it) - - File bufferTempFile = columnTempFiles[varindex]; - if (bufferTempFile == null) { - bufferTempFile = File.createTempFile("columnBufferFile", "bytes"); - columnTempFiles[varindex] = bufferTempFile; - } - - // *append* the contents of the buffer to the end of the - // temp file, if already exists: - try (BufferedOutputStream outputStream = new BufferedOutputStream( - new FileOutputStream(bufferTempFile, true))) { - outputStream.write(bufferedColumns[varindex], 0, MAX_COLUMN_BUFFER); - cachedfileSizes[varindex] += MAX_COLUMN_BUFFER; - - // keep writing MAX_COLUMN_BUFFER-size chunks of bytes into - // the temp file, for as long as there's more than MAX_COLUMN_BUFFER - // bytes left in the token: - - while (tokensize - tokenoffset > MAX_COLUMN_BUFFER) { - outputStream.write(token.getBytes(), tokenoffset, MAX_COLUMN_BUFFER); - cachedfileSizes[varindex] += MAX_COLUMN_BUFFER; - tokenoffset += MAX_COLUMN_BUFFER; - } - - } - - // buffer the remaining bytes and reset the buffered - // byte counter: - - System.arraycopy(token.getBytes(), - tokenoffset, - bufferedColumns[varindex], - 0, - tokensize - tokenoffset); - - bufferedSizes[varindex] = tokensize - tokenoffset; - - } else { - // continue buffering - System.arraycopy(token.getBytes(), 0, bufferedColumns[varindex], bufferedSizes[varindex], tokensize); - bufferedSizes[varindex] += tokensize; - } - } - } else { - throw new IOException("Tab file has fewer rows than the stored number of cases!"); - } - } - } - - // OK, we've created the individual byte vectors of the tab file columns; - // they may be partially saved in temp files and/or in memory. - // We now need to go through all these buffers and create the final - // rotated image file. - - try (BufferedOutputStream finalOut = new BufferedOutputStream( - new FileOutputStream(new File(rotatedImageFileName)))) { - - // but first we should create the offset header and write it out into - // the final file; because it should be at the head, doh! 
- - long columnOffset = varcount * 8; - // (this is the offset of the first column vector; it is equal to the - // size of the offset header, i.e. varcount * 8 bytes) - - for (int varindex = 0; varindex < varcount; varindex++) { - long totalColumnBytes = cachedfileSizes[varindex] + bufferedSizes[varindex]; - columnOffset += totalColumnBytes; - // totalColumnBytes; - byte[] columnOffsetByteArray = ByteBuffer.allocate(8).putLong(columnOffset).array(); - System.arraycopy(columnOffsetByteArray, 0, offsetHeader, varindex * 8, 8); - } - - finalOut.write(offsetHeader, 0, varcount * 8); - - for (int varindex = 0; varindex < varcount; varindex++) { - long cachedBytesRead = 0; - - // check if there is a cached temp file: - - File cachedTempFile = columnTempFiles[varindex]; - if (cachedTempFile != null) { - byte[] cachedBytes = new byte[MAX_COLUMN_BUFFER]; - try (BufferedInputStream cachedIn = new BufferedInputStream(new FileInputStream(cachedTempFile))) { - int readlen = 0; - while ((readlen = cachedIn.read(cachedBytes)) > -1) { - finalOut.write(cachedBytes, 0, readlen); - cachedBytesRead += readlen; - } - } - - // delete the temp file: - cachedTempFile.delete(); - - } - - if (cachedBytesRead != cachedfileSizes[varindex]) { - throw new IOException("Could not read the correct number of bytes cached for column "+varindex+"; "+ - cachedfileSizes[varindex] + " bytes expected, "+cachedBytesRead+" read."); - } - - // then check if there are any bytes buffered for this column: - - if (bufferedSizes[varindex] > 0) { - finalOut.write(bufferedColumns[varindex], 0, bufferedSizes[varindex]); - } - - } - } - - return new File(rotatedImageFileName); - - } - - /* - * Test method for taking a "rotated" image, and reversing it, reassembling - * all the columns in the original order. Which should result in a file - * byte-for-byte identical file to the original tab-delimited version. - * - * (do note that this method is not efficiently implemented; it's only - * being used for experiments so far, to confirm the accuracy of the - * accuracy of generateRotatedImage(). It should not be used for any - * practical means in the application!) 
- */ - private void reverseRotatedImage (File rotfile, int varcount, int casecount) throws IOException { - // open the file, read in the offset header: - try (BufferedInputStream rotfileStream = new BufferedInputStream(new FileInputStream(rotfile))) { - byte[] offsetHeader = new byte[varcount * 8]; - long[] byteOffsets = new long[varcount]; - - int readlen = rotfileStream.read(offsetHeader); - - if (readlen != varcount * 8) { - throw new IOException ("Could not read "+varcount*8+" header bytes from the rotated file."); - } - - for (int varindex = 0; varindex < varcount; varindex++) { - byte[] offsetBytes = new byte[8]; - System.arraycopy(offsetHeader, varindex*8, offsetBytes, 0, 8); - - ByteBuffer offsetByteBuffer = ByteBuffer.wrap(offsetBytes); - byteOffsets[varindex] = offsetByteBuffer.getLong(); - - //System.out.println(byteOffsets[varindex]); - } - - String [][] reversedMatrix = new String[casecount][varcount]; - - long offset = varcount * 8; - byte[] columnBytes; - - for (int varindex = 0; varindex < varcount; varindex++) { - long columnLength = byteOffsets[varindex] - offset; - - - - columnBytes = new byte[(int)columnLength]; - readlen = rotfileStream.read(columnBytes); - - if (readlen != columnLength) { - throw new IOException ("Could not read "+columnBytes+" bytes for column "+varindex); - } - /* - String columnString = new String(columnBytes); - //System.out.print(columnString); - String[] values = columnString.split("\n", -1); - - if (values.length < casecount) { - throw new IOException("count mismatch: "+values.length+" tokens found for column "+varindex); - } - - for (int caseindex = 0; caseindex < casecount; caseindex++) { - reversedMatrix[caseindex][varindex] = values[caseindex]; - }*/ - - int bytecount = 0; - int byteoffset = 0; - int caseindex = 0; - //System.out.println("generating value vector for column "+varindex); - while (bytecount < columnLength) { - if (columnBytes[bytecount] == '\n') { - String token = new String(columnBytes, byteoffset, bytecount-byteoffset); - reversedMatrix[caseindex++][varindex] = token; - byteoffset = bytecount + 1; - } - bytecount++; - } - - if (caseindex != casecount) { - throw new IOException("count mismatch: "+caseindex+" tokens found for column "+varindex); - } - offset = byteOffsets[varindex]; - } - - for (int caseindex = 0; caseindex < casecount; caseindex++) { - for (int varindex = 0; varindex < varcount; varindex++) { - System.out.print(reversedMatrix[caseindex][varindex]); - if (varindex < varcount-1) { - System.out.print("\t"); - } else { - System.out.print("\n"); - } - } - } - - } - - - } - - /** - * main() method, for testing - * usage: java edu.harvard.iq.dataverse.dataaccess.TabularSubsetGenerator testfile.tab varcount casecount column type - * make sure the CLASSPATH contains ... 
- * - */ - - public static void main(String[] args) { - - String tabFileName = args[0]; - int varcount = new Integer(args[1]).intValue(); - int casecount = new Integer(args[2]).intValue(); - int column = new Integer(args[3]).intValue(); - String type = args[4]; - - File tabFile = new File(tabFileName); - File rotatedImageFile = null; - - TabularSubsetGenerator subsetGenerator = new TabularSubsetGenerator(); - - /* - try { - rotatedImageFile = subsetGenerator.getRotatedImage(tabFile, varcount, casecount); - } catch (IOException ex) { - System.out.println(ex.getMessage()); - } - */ - - //System.out.println("\nFinished generating \"rotated\" column image file."); - - //System.out.println("\nOffsets:"); - - MathContext doubleMathContext = new MathContext(15, RoundingMode.HALF_EVEN); - String FORMAT_IEEE754 = "%+#.15e"; - - try { - //subsetGenerator.reverseRotatedImage(rotatedImageFile, varcount, casecount); - //String[] columns = subsetGenerator.subsetStringVector(tabFile, column, varcount, casecount); - if ("string".equals(type)) { - String[] columns = subsetGenerator.subsetStringVector(tabFile, column, varcount, casecount); - for (int i = 0; i < casecount; i++) { - System.out.println(columns[i]); - } - } else { - - Double[] columns = subsetGenerator.subsetDoubleVector(tabFile, column, varcount, casecount); - for (int i = 0; i < casecount; i++) { - if (columns[i] != null) { - BigDecimal outBigDecimal = new BigDecimal(columns[i], doubleMathContext); - System.out.println(String.format(FORMAT_IEEE754, outBigDecimal)); - } else { - System.out.println("NA"); - } - //System.out.println(columns[i]); - } - } - } catch (IOException ex) { - System.out.println(ex.getMessage()); - } - } -} - - + scanner.next(); + } +} \ No newline at end of file diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/TabularSubsetInputStream.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/TabularSubsetInputStream.java deleted file mode 100644 index 89e033353c1..00000000000 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/TabularSubsetInputStream.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. 
- */ - -package edu.harvard.iq.dataverse.dataaccess; - -import edu.harvard.iq.dataverse.DataFile; -import edu.harvard.iq.dataverse.datavariable.DataVariable; -import java.io.IOException; -import java.io.InputStream; -import java.util.List; -import java.util.logging.Logger; - -/** - * - * @author Leonid Andreev - */ -public class TabularSubsetInputStream extends InputStream { - private static final Logger logger = Logger.getLogger(TabularSubsetInputStream.class.getCanonicalName()); - - private TabularSubsetGenerator subsetGenerator = null; - private int numberOfSubsetVariables; - private int numberOfObservations; - private int numberOfObservationsRead = 0; - private byte[] leftoverBytes = null; - - public TabularSubsetInputStream(DataFile datafile, List variables) throws IOException { - if (datafile == null) { - throw new IOException("Null datafile in subset request"); - } - if (!datafile.isTabularData()) { - throw new IOException("Subset requested on a non-tabular data file"); - } - numberOfObservations = datafile.getDataTable().getCaseQuantity().intValue(); - - if (variables == null || variables.size() < 1) { - throw new IOException("Null or empty list of variables in subset request."); - } - numberOfSubsetVariables = variables.size(); - subsetGenerator = new TabularSubsetGenerator(datafile, variables); - - } - - //@Override - public int read() throws IOException { - throw new IOException("read() method not implemented; do not use."); - } - - //@Override - public int read(byte[] b) throws IOException { - // TODO: - // Move this code into TabularSubsetGenerator - logger.fine("subset input stream: read request, on a "+b.length+" byte buffer;"); - - if (numberOfSubsetVariables == 1) { - logger.fine("calling the single variable subset read method"); - return subsetGenerator.readSingleColumnSubset(b); - } - - int bytesread = 0; - byte [] linebuffer; - - // do we have a leftover? - if (leftoverBytes != null) { - if (leftoverBytes.length < b.length) { - System.arraycopy(leftoverBytes, 0, b, 0, leftoverBytes.length); - bytesread = leftoverBytes.length; - leftoverBytes = null; - - } else { - // shouldn't really happen... unless it's a very large subset, - // or a very long string, etc. - System.arraycopy(leftoverBytes, 0, b, 0, b.length); - byte[] tmp = new byte[leftoverBytes.length - b.length]; - System.arraycopy(leftoverBytes, b.length, tmp, 0, leftoverBytes.length - b.length); - leftoverBytes = tmp; - tmp = null; - return b.length; - } - } - - while (bytesread < b.length && numberOfObservationsRead < numberOfObservations) { - linebuffer = subsetGenerator.readSubsetLineBytes(); - numberOfObservationsRead++; - - if (bytesread + linebuffer.length < b.length) { - // copy linebuffer into the return buffer: - System.arraycopy(linebuffer, 0, b, bytesread, linebuffer.length); - bytesread += linebuffer.length; - } else { - System.arraycopy(linebuffer, 0, b, bytesread, b.length - bytesread); - // save the leftover; - if (bytesread + linebuffer.length > b.length) { - leftoverBytes = new byte[bytesread + linebuffer.length - b.length]; - System.arraycopy(linebuffer, b.length - bytesread, leftoverBytes, 0, bytesread + linebuffer.length - b.length); - } - return b.length; - } - } - - // and this means we've reached the end of the tab file! - - return bytesread > 0 ? 
bytesread : -1; - } - - //@Override - public void close() { - if (subsetGenerator != null) { - subsetGenerator.close(); - } - } -} diff --git a/src/main/java/edu/harvard/iq/dataverse/export/DDIExportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/export/DDIExportServiceBean.java index 5119b4b96c7..edd01ae98a3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/export/DDIExportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/export/DDIExportServiceBean.java @@ -545,6 +545,16 @@ private void createDataFileDDI(XMLStreamWriter xmlw, Set excludedFieldSe List<DataVariable> vars = variableService.findByDataTableId(dt.getId()); if (checkField("catgry", excludedFieldSet, includedFieldSet)) { if (checkIsWithoutFrequencies(vars)) { + // @todo: the method called here to calculate frequencies + // when they are missing from the database (for whatever + // reasons) subsets the physical tab-delimited file and + // calculates them in real time. This is potentially a very + // expensive operation. Let's make sure that, when we do this, we + // save the resulting frequencies in the database, so that + // we don't have to do this again. Also, let's double-check + // whether the "checkIsWithoutFrequencies()" method is doing + // the right thing - as it appears to return true when there + // are no categorical variables in the DataTable (?) calculateFrequencies(df, vars); } } @@ -580,6 +590,7 @@ private boolean checkIsWithoutFrequencies(List<DataVariable> vars) { private void calculateFrequencies(DataFile df, List<DataVariable> vars) { + // @todo: see the comment in the part of the code that calls this method try { DataConverter dc = new DataConverter(); File tabFile = dc.downloadFromStorageIO(df.getStorageIO()); diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java index 233f746fb17..9bacafd173f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java @@ -726,27 +726,17 @@ public void produceSummaryStatistics(DataFile dataFile, File generatedTabularFil } public void produceContinuousSummaryStatistics(DataFile dataFile, File generatedTabularFile) throws IOException { - - /* - // quick, but memory-inefficient way: - // - this method just loads the entire file-worth of continuous vectors - // into a Double[][] matrix.
- //Double[][] variableVectors = subsetContinuousVectors(dataFile); - //calculateContinuousSummaryStatistics(dataFile, variableVectors); - - // A more sophisticated way: this subsets one column at a time, using - // the new optimized subsetting that does not have to read any extra - // bytes from the file to extract the column: - - TabularSubsetGenerator subsetGenerator = new TabularSubsetGenerator(); - */ for (int i = 0; i < dataFile.getDataTable().getVarQuantity(); i++) { if (dataFile.getDataTable().getDataVariables().get(i).isIntervalContinuous()) { logger.fine("subsetting continuous vector"); if ("float".equals(dataFile.getDataTable().getDataVariables().get(i).getFormat())) { - Float[] variableVector = TabularSubsetGenerator.subsetFloatVector(new FileInputStream(generatedTabularFile), i, dataFile.getDataTable().getCaseQuantity().intValue()); + Float[] variableVector = TabularSubsetGenerator.subsetFloatVector( + new FileInputStream(generatedTabularFile), + i, + dataFile.getDataTable().getCaseQuantity().intValue(), + dataFile.getDataTable().isStoredWithVariableHeader()); logger.fine("Calculating summary statistics on a Float vector;"); calculateContinuousSummaryStatistics(dataFile, i, variableVector); // calculate the UNF while we are at it: @@ -754,7 +744,11 @@ public void produceContinuousSummaryStatistics(DataFile dataFile, File generated calculateUNF(dataFile, i, variableVector); variableVector = null; } else { - Double[] variableVector = TabularSubsetGenerator.subsetDoubleVector(new FileInputStream(generatedTabularFile), i, dataFile.getDataTable().getCaseQuantity().intValue()); + Double[] variableVector = TabularSubsetGenerator.subsetDoubleVector( + new FileInputStream(generatedTabularFile), + i, + dataFile.getDataTable().getCaseQuantity().intValue(), + dataFile.getDataTable().isStoredWithVariableHeader()); logger.fine("Calculating summary statistics on a Double vector;"); calculateContinuousSummaryStatistics(dataFile, i, variableVector); // calculate the UNF while we are at it: @@ -776,7 +770,11 @@ public void produceDiscreteNumericSummaryStatistics(DataFile dataFile, File gene && dataFile.getDataTable().getDataVariables().get(i).isTypeNumeric()) { logger.fine("subsetting discrete-numeric vector"); - Long[] variableVector = TabularSubsetGenerator.subsetLongVector(new FileInputStream(generatedTabularFile), i, dataFile.getDataTable().getCaseQuantity().intValue()); + Long[] variableVector = TabularSubsetGenerator.subsetLongVector( + new FileInputStream(generatedTabularFile), + i, + dataFile.getDataTable().getCaseQuantity().intValue(), + dataFile.getDataTable().isStoredWithVariableHeader()); // We are discussing calculating the same summary stats for // all numerics (the same kind of sumstats that we've been calculating // for numeric continuous type) -- L.A. Jul. 
2014 @@ -810,7 +808,11 @@ public void produceCharacterSummaryStatistics(DataFile dataFile, File generatedT if (dataFile.getDataTable().getDataVariables().get(i).isTypeCharacter()) { logger.fine("subsetting character vector"); - String[] variableVector = TabularSubsetGenerator.subsetStringVector(new FileInputStream(generatedTabularFile), i, dataFile.getDataTable().getCaseQuantity().intValue()); + String[] variableVector = TabularSubsetGenerator.subsetStringVector( + new FileInputStream(generatedTabularFile), + i, + dataFile.getDataTable().getCaseQuantity().intValue(), + dataFile.getDataTable().isStoredWithVariableHeader()); //calculateCharacterSummaryStatistics(dataFile, i, variableVector); // calculate the UNF while we are at it: logger.fine("Calculating UNF on a String vector"); @@ -828,20 +830,29 @@ public static void produceFrequencyStatistics(DataFile dataFile, File generatedT produceFrequencies(generatedTabularFile, vars); } - public static void produceFrequencies( File generatedTabularFile, List<DataVariable> vars) throws IOException { + public static void produceFrequencies(File generatedTabularFile, List<DataVariable> vars) throws IOException { for (int i = 0; i < vars.size(); i++) { Collection<VariableCategory> cats = vars.get(i).getCategories(); int caseQuantity = vars.get(i).getDataTable().getCaseQuantity().intValue(); boolean isNumeric = vars.get(i).isTypeNumeric(); + boolean skipVariableHeaderLine = vars.get(i).getDataTable().isStoredWithVariableHeader(); Object[] variableVector = null; if (cats.size() > 0) { if (isNumeric) { - variableVector = TabularSubsetGenerator.subsetFloatVector(new FileInputStream(generatedTabularFile), i, caseQuantity); + variableVector = TabularSubsetGenerator.subsetFloatVector( + new FileInputStream(generatedTabularFile), + i, + caseQuantity, + skipVariableHeaderLine); } else { - variableVector = TabularSubsetGenerator.subsetStringVector(new FileInputStream(generatedTabularFile), i, caseQuantity); + variableVector = TabularSubsetGenerator.subsetStringVector( + new FileInputStream(generatedTabularFile), + i, + caseQuantity, + skipVariableHeaderLine); } if (variableVector != null) { Hashtable freq = calculateFrequency(variableVector); @@ -923,6 +934,7 @@ public boolean ingestAsTabular(Long datafile_id) { DataFile dataFile = fileService.find(datafile_id); boolean ingestSuccessful = false; boolean forceTypeCheck = false; + boolean storingWithVariableHeader = systemConfig.isStoringIngestedFilesWithHeaders(); // Never attempt to ingest a file that's already ingested!
if (dataFile.isTabularData()) { @@ -1024,11 +1036,7 @@ public boolean ingestAsTabular(Long datafile_id) { TabularDataIngest tabDataIngest = null; try { - if (additionalData != null) { - tabDataIngest = ingestPlugin.read(inputStream, additionalData); - } else { - tabDataIngest = ingestPlugin.read(inputStream, null); - } + tabDataIngest = ingestPlugin.read(inputStream, storingWithVariableHeader, additionalData); } catch (IOException ingestEx) { dataFile.SetIngestProblem(); FileUtil.createIngestFailureReport(dataFile, ingestEx.getMessage()); @@ -1081,6 +1089,7 @@ public boolean ingestAsTabular(Long datafile_id) { dataFile.setDataTable(tabDataIngest.getDataTable()); tabDataIngest.getDataTable().setDataFile(dataFile); tabDataIngest.getDataTable().setOriginalFileName(originalFileName); + dataFile.getDataTable().setStoredWithVariableHeader(storingWithVariableHeader); try { produceSummaryStatistics(dataFile, tabFile); @@ -1172,6 +1181,7 @@ public boolean ingestAsTabular(Long datafile_id) { // Replace contents of the file with the tab-delimited data produced: dataAccess.savePath(Paths.get(tabFile.getAbsolutePath())); + // Reset the file size: dataFile.setFilesize(dataAccess.getSize()); @@ -2297,7 +2307,7 @@ public static void main(String[] args) { TabularDataIngest tabDataIngest = null; try { - tabDataIngest = ingestPlugin.read(fileInputStream, null); + tabDataIngest = ingestPlugin.read(fileInputStream, false, null); } catch (IOException ingestEx) { System.err.println("Caught an exception trying to ingest file "+file+"."); System.exit(1); diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/TabularDataFileReader.java b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/TabularDataFileReader.java index 223b171dfb5..0f23a3d9781 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/TabularDataFileReader.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/TabularDataFileReader.java @@ -20,10 +20,13 @@ package edu.harvard.iq.dataverse.ingest.tabulardata; +import edu.harvard.iq.dataverse.datavariable.DataVariable; import edu.harvard.iq.dataverse.ingest.tabulardata.spi.*; //import edu.harvard.iq.dataverse.ingest.plugin.metadata.*; import java.io.*; import static java.lang.System.*; +import java.util.Iterator; +import java.util.List; import java.util.regex.Matcher; /** @@ -98,7 +101,7 @@ public void setDataLanguageEncoding(String dataLanguageEncoding) { * * @throws java.io.IOException if a reading error occurs. 
*/ - public abstract TabularDataIngest read(BufferedInputStream stream, File dataFile) + public abstract TabularDataIngest read(BufferedInputStream stream, boolean storeWithVariableHeader, File dataFile) throws IOException; @@ -176,5 +179,26 @@ protected String escapeCharacterString(String rawString) { return escapedString; } + + protected String generateVariableHeader(List<DataVariable> dvs) { + String varHeader = null; + + if (dvs != null) { + Iterator<DataVariable> iter = dvs.iterator(); + DataVariable dv; + + if (iter.hasNext()) { + dv = iter.next(); + varHeader = dv.getName(); + } + + while (iter.hasNext()) { + dv = iter.next(); + varHeader = varHeader + "\t" + dv.getName(); + } + } + + return varHeader; + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/CSVFileReader.java b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/CSVFileReader.java index 57f76df3802..f8816ababb4 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/CSVFileReader.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/CSVFileReader.java @@ -110,7 +110,7 @@ private void init() throws IOException { * @throws java.io.IOException if a reading error occurs. */ @Override - public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws IOException { + public TabularDataIngest read(BufferedInputStream stream, boolean saveWithVariableHeader, File dataFile) throws IOException { init(); if (stream == null) { @@ -124,7 +124,7 @@ public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws File tabFileDestination = File.createTempFile("data-", ".tab"); PrintWriter tabFileWriter = new PrintWriter(tabFileDestination.getAbsolutePath()); - int lineCount = readFile(localBufferedReader, dataTable, tabFileWriter); + int lineCount = readFile(localBufferedReader, dataTable, saveWithVariableHeader, tabFileWriter); logger.fine("Tab file produced: " + tabFileDestination.getAbsolutePath()); @@ -136,14 +136,17 @@ public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws } - public int readFile(BufferedReader csvReader, DataTable dataTable, PrintWriter finalOut) throws IOException { + public int readFile(BufferedReader csvReader, DataTable dataTable, boolean saveWithVariableHeader, PrintWriter finalOut) throws IOException { List<DataVariable> variableList = new ArrayList<>(); CSVParser parser = new CSVParser(csvReader, inFormat.withHeader()); Map<String, Integer> headers = parser.getHeaderMap(); int i = 0; + String variableNameHeader = null; + for (String varName : headers.keySet()) { + // @todo: is .keySet() guaranteed to return the names in the right order? if (varName == null || varName.isEmpty()) { // TODO: // Add a sensible variable name validation algorithm. @@ -158,6 +161,13 @@ public int readFile(BufferedReader csvReader, DataTable dataTable, PrintWriter f dv.setTypeCharacter(); dv.setIntervalDiscrete(); + + if (saveWithVariableHeader) { + variableNameHeader = variableNameHeader == null + ?
varName + : variableNameHeader.concat("\t" + varName); + } + i++; } @@ -342,6 +352,14 @@ public int readFile(BufferedReader csvReader, DataTable dataTable, PrintWriter f try (BufferedReader secondPassReader = new BufferedReader(new FileReader(firstPassTempFile))) { parser = new CSVParser(secondPassReader, inFormat.withHeader()); String[] caseRow = new String[headers.size()]; + + // Save the variable name header, if requested + if (saveWithVariableHeader) { + if (variableNameHeader == null) { + throw new IOException("failed to generate the Variable Names header"); + } + finalOut.println(variableNameHeader); + } for (CSVRecord record : parser) { if (!record.isConsistent()) { diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTAFileReader.java b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTAFileReader.java index 2dec701592e..73818f8fb62 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTAFileReader.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTAFileReader.java @@ -505,7 +505,7 @@ private void init() throws IOException { } @Override - public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws IOException { + public TabularDataIngest read(BufferedInputStream stream, boolean storeWithVariableHeader, File dataFile) throws IOException { dbgLog.info("***** DTAFileReader: read() start *****"); if (dataFile != null) { @@ -519,7 +519,7 @@ public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws if (releaseNumber!=104) { decodeExpansionFields(stream); } - decodeData(stream); + decodeData(stream, storeWithVariableHeader); decodeValueLabels(stream); ingesteddata.setDataTable(dataTable); @@ -1665,7 +1665,7 @@ private void parseValueLabelsReleasel108(BufferedInputStream stream) throws IOEx dbgLog.fine("parseValueLabelsRelease108(): end"); } - private void decodeData(BufferedInputStream stream) throws IOException { + private void decodeData(BufferedInputStream stream, boolean saveWithVariableHeader) throws IOException { dbgLog.fine("\n***** decodeData(): start *****"); @@ -1719,6 +1719,11 @@ private void decodeData(BufferedInputStream stream) throws IOException { BUT, this needs to be reviewed/confirmed etc! */ //String[][] dateFormat = new String[nvar][nobs]; + + // add the variable header here, if needed + if (saveWithVariableHeader) { + pwout.println(generateVariableHeader(dataTable.getDataVariables())); + } for (int i = 0; i < nobs; i++) { byte[] dataRowBytes = new byte[bytes_per_row]; diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/NewDTAFileReader.java b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/NewDTAFileReader.java index 22581834676..53607d541de 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/NewDTAFileReader.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/NewDTAFileReader.java @@ -339,7 +339,7 @@ private void init() throws IOException { } @Override - public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws IOException { + public TabularDataIngest read(BufferedInputStream stream, boolean storeWithVariableHeader, File dataFile) throws IOException { logger.fine("NewDTAFileReader: read() start"); // shit ton of diagnostics (still) needed here!! -- L.A. 
@@ -363,7 +363,13 @@ public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws // "characteristics" - STATA-proprietary information // (we are skipping it) readCharacteristics(dataReader); - readData(dataReader); + + String variableHeaderLine = null; + + if (storeWithVariableHeader) { + variableHeaderLine = generateVariableHeader(dataTable.getDataVariables()); + } + readData(dataReader, variableHeaderLine); // (potentially) large, (potentially) non-ASCII character strings // saved outside the section, and referenced @@ -707,7 +713,7 @@ private void readCharacteristics(DataReader reader) throws IOException { } - private void readData(DataReader reader) throws IOException { + private void readData(DataReader reader, String variableHeaderLine) throws IOException { logger.fine("Data section; at offset " + reader.getByteOffset() + "; dta map offset: " + dtaMap.getOffset_data()); logger.fine("readData(): start"); reader.readOpeningTag(TAG_DATA); @@ -731,6 +737,11 @@ private void readData(DataReader reader) throws IOException { FileOutputStream fileOutTab = new FileOutputStream(tabDelimitedDataFile); PrintWriter pwout = new PrintWriter(new OutputStreamWriter(fileOutTab, "utf8"), true); + // add the variable header here, if needed + if (variableHeaderLine != null) { + pwout.println(variableHeaderLine); + } + logger.fine("Beginning to read data stream."); for (int i = 0; i < nobs; i++) { @@ -999,6 +1010,8 @@ private void readSTRLs(DataReader reader) throws IOException { int nobs = dataTable.getCaseQuantity().intValue(); String[] line; + + //@todo: adjust for the case of storing the file with the variable header for (int obsindex = 0; obsindex < nobs; obsindex++) { if (scanner.hasNext()) { diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/por/PORFileReader.java b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/por/PORFileReader.java index c90b0ea6950..2ee966c3e31 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/por/PORFileReader.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/por/PORFileReader.java @@ -180,7 +180,7 @@ private void init() throws IOException { } @Override - public TabularDataIngest read(BufferedInputStream stream, File additionalData) throws IOException{ + public TabularDataIngest read(BufferedInputStream stream, boolean storeWithVariableHeader, File additionalData) throws IOException{ dbgLog.fine("PORFileReader: read() start"); if (additionalData != null) { @@ -226,7 +226,7 @@ public TabularDataIngest read(BufferedInputStream stream, File additionalData) t headerId = "8S"; } - decode(headerId, bfReader); + decode(headerId, bfReader, storeWithVariableHeader); // for last iteration @@ -382,7 +382,7 @@ public TabularDataIngest read(BufferedInputStream stream, File additionalData) t return ingesteddata; } - private void decode(String headerId, BufferedReader reader) throws IOException{ + private void decode(String headerId, BufferedReader reader, boolean storeWithVariableHeader) throws IOException{ if (headerId.equals("1")) decodeProductName(reader); else if (headerId.equals("2")) decodeLicensee(reader); else if (headerId.equals("3")) decodeFileLabel(reader); @@ -398,7 +398,7 @@ private void decode(String headerId, BufferedReader reader) throws IOException{ else if (headerId.equals("C")) decodeVariableLabel(reader); else if (headerId.equals("D")) decodeValueLabel(reader); else if (headerId.equals("E")) decodeDocument(reader); - else if 
(headerId.equals("F")) decodeData(reader); + else if (headerId.equals("F")) decodeData(reader, storeWithVariableHeader); } @@ -1099,7 +1099,7 @@ private void decodeDocument(BufferedReader reader) throws IOException { } - private void decodeData(BufferedReader reader) throws IOException { + private void decodeData(BufferedReader reader, boolean storeWithVariableHeader) throws IOException { dbgLog.fine("decodeData(): start"); // TODO: get rid of this "variableTypeFinal"; -- L.A. 4.0 beta int[] variableTypeFinal= new int[varQnty]; @@ -1126,6 +1126,9 @@ private void decodeData(BufferedReader reader) throws IOException { // contents (variable) checker concering decimals Arrays.fill(variableTypeFinal, 0); + if (storeWithVariableHeader) { + pwout.println(StringUtils.join(variableNameList, "\t")); + } // raw-case counter int j = 0; // case diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/rdata/RDATAFileReader.java b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/rdata/RDATAFileReader.java index eb1353fd792..50f2f89e354 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/rdata/RDATAFileReader.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/rdata/RDATAFileReader.java @@ -473,7 +473,7 @@ private void init() throws IOException { * @throws java.io.IOException if a reading error occurs. */ @Override - public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws IOException { + public TabularDataIngest read(BufferedInputStream stream, boolean saveWithVariableHeader, File dataFile) throws IOException { init(); @@ -509,7 +509,7 @@ public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws File tabFileDestination = File.createTempFile("data-", ".tab"); PrintWriter tabFileWriter = new PrintWriter(tabFileDestination.getAbsolutePath(), "UTF-8"); - int lineCount = csvFileReader.read(localBufferedReader, dataTable, tabFileWriter); + int lineCount = csvFileReader.read(localBufferedReader, dataTable, saveWithVariableHeader, tabFileWriter); LOG.fine("RDATAFileReader: successfully read "+lineCount+" lines of tab-delimited data."); diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/rdata/RTabFileParser.java b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/rdata/RTabFileParser.java index f60b7733463..fbe7e401b57 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/rdata/RTabFileParser.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/rdata/RTabFileParser.java @@ -61,8 +61,8 @@ public RTabFileParser (char delimiterChar) { // should be used. - public int read(BufferedReader csvReader, DataTable dataTable, PrintWriter pwout) throws IOException { - dbgLog.warning("RTabFileParser: Inside R Tab file parser"); + public int read(BufferedReader csvReader, DataTable dataTable, boolean saveWithVariableHeader, PrintWriter pwout) throws IOException { + dbgLog.fine("RTabFileParser: Inside R Tab file parser"); int varQnty = 0; @@ -94,14 +94,17 @@ public int read(BufferedReader csvReader, DataTable dataTable, PrintWriter pwout boolean[] isTimeVariable = new boolean[varQnty]; boolean[] isBooleanVariable = new boolean[varQnty]; + String variableNameHeader = null; + if (dataTable.getDataVariables() != null) { for (int i = 0; i < varQnty; i++) { DataVariable var = dataTable.getDataVariables().get(i); if (var == null) { - // throw exception! 
+ throw new IOException ("null dataVariable passed to the parser"); + } if (var.getType() == null) { - // throw exception! + throw new IOException ("null dataVariable type passed to the parser"); } if (var.isTypeCharacter()) { isCharacterVariable[i] = true; @@ -128,13 +131,24 @@ public int read(BufferedReader csvReader, DataTable dataTable, PrintWriter pwout } } } else { - // throw excepion "unknown variable format type" - ? + throw new IOException ("unknown dataVariable format passed to the parser"); } - + if (saveWithVariableHeader) { + variableNameHeader = variableNameHeader == null + ? var.getName() + : variableNameHeader.concat("\t" + var.getName()); + } } } else { - // throw exception! + throw new IOException ("null dataVariables list passed to the parser"); + } + + if (saveWithVariableHeader) { + if (variableNameHeader == null) { + throw new IOException ("failed to generate the Variable Names header"); + } + pwout.println(variableNameHeader); } while ((line = csvReader.readLine()) != null) { diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/sav/SAVFileReader.java b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/sav/SAVFileReader.java index 682b8f1166c..5eecbdfb666 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/sav/SAVFileReader.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/sav/SAVFileReader.java @@ -338,7 +338,7 @@ private void init() throws IOException { } } - public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws IOException{ + public TabularDataIngest read(BufferedInputStream stream, boolean storeWithVariableHeader, File dataFile) throws IOException{ dbgLog.info("SAVFileReader: read() start"); if (dataFile != null) { @@ -422,7 +422,7 @@ public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws methodCurrentlyExecuted = "decodeRecordTypeData"; dbgLog.fine("***** SAVFileReader: executing method decodeRecordTypeData"); - decodeRecordTypeData(stream); + decodeRecordTypeData(stream, storeWithVariableHeader); } catch (IllegalArgumentException e) { @@ -2308,7 +2308,7 @@ void decodeRecordType999(BufferedInputStream stream) throws IOException { - void decodeRecordTypeData(BufferedInputStream stream) throws IOException { + void decodeRecordTypeData(BufferedInputStream stream, boolean storeWithVariableHeader) throws IOException { dbgLog.fine("decodeRecordTypeData(): start"); ///String fileUnfValue = null; @@ -2320,9 +2320,9 @@ void decodeRecordTypeData(BufferedInputStream stream) throws IOException { throw new IllegalArgumentException("stream == null!"); } if (isDataSectionCompressed){ - decodeRecordTypeDataCompressed(stream); + decodeRecordTypeDataCompressed(stream, storeWithVariableHeader); } else { - decodeRecordTypeDataUnCompressed(stream); + decodeRecordTypeDataUnCompressed(stream, storeWithVariableHeader); } /* UNF calculation was here... 
*/ @@ -2362,7 +2362,7 @@ PrintWriter createOutputWriter (BufferedInputStream stream) throws IOException { } - void decodeRecordTypeDataCompressed(BufferedInputStream stream) throws IOException { + void decodeRecordTypeDataCompressed(BufferedInputStream stream, boolean storeWithVariableHeader) throws IOException { dbgLog.fine("***** decodeRecordTypeDataCompressed(): start *****"); @@ -2395,7 +2395,10 @@ void decodeRecordTypeDataCompressed(BufferedInputStream stream) throws IOExcepti dbgLog.fine("printFormatTable:\n" + printFormatTable); variableFormatTypeList = new String[varQnty]; - + // write the variable header out, if instructed to do so + if (storeWithVariableHeader) { + pwout.println(generateVariableHeader(dataTable.getDataVariables())); + } for (int i = 0; i < varQnty; i++) { variableFormatTypeList[i] = SPSSConstants.FORMAT_CATEGORY_TABLE.get( @@ -2947,7 +2950,7 @@ void decodeRecordTypeDataCompressed(BufferedInputStream stream) throws IOExcepti } - void decodeRecordTypeDataUnCompressed(BufferedInputStream stream) throws IOException { + void decodeRecordTypeDataUnCompressed(BufferedInputStream stream, boolean storeWithVariableHeader) throws IOException { dbgLog.fine("***** decodeRecordTypeDataUnCompressed(): start *****"); if (stream ==null){ @@ -3013,6 +3016,11 @@ void decodeRecordTypeDataUnCompressed(BufferedInputStream stream) throws IOExcep ///dataTable2 = new Object[varQnty][caseQnty]; // storage of date formats to pass to UNF ///dateFormats = new String[varQnty][caseQnty]; + + // write the variable header out, if instructed to do so + if (storeWithVariableHeader) { + pwout.println(generateVariableHeader(dataTable.getDataVariables())); + } try { for (int i = 0; ; i++){ // case-wise loop diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/xlsx/XLSXFileReader.java b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/xlsx/XLSXFileReader.java index ea3f3868f24..ef91793690e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/xlsx/XLSXFileReader.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/xlsx/XLSXFileReader.java @@ -36,7 +36,6 @@ import org.apache.commons.lang3.StringUtils; import org.apache.poi.xssf.eventusermodel.XSSFReader; -import org.apache.poi.xssf.usermodel.XSSFRichTextString; import org.apache.poi.xssf.model.SharedStrings; import org.apache.poi.openxml4j.opc.OPCPackage; import org.xml.sax.Attributes; @@ -81,7 +80,9 @@ private void init() throws IOException { * @throws java.io.IOException if a reading error occurs. 
*/ @Override - public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws IOException { + public TabularDataIngest read(BufferedInputStream stream, boolean storeWithVariableHeader, File dataFile) throws IOException { + // @todo: implement handling of "storeWithVariableHeader" option + init(); TabularDataIngest ingesteddata = new TabularDataIngest(); @@ -118,6 +119,10 @@ public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws String[] caseRow = new String[varQnty]; String[] valueTokens; + // add the variable header here, if needed + if (storeWithVariableHeader) { + finalWriter.println(generateVariableHeader(dataTable.getDataVariables())); + } while ((line = secondPassReader.readLine()) != null) { // chop the line: @@ -549,7 +554,7 @@ public static void main(String[] args) throws Exception { BufferedInputStream xlsxInputStream = new BufferedInputStream(new FileInputStream(new File(args[0]))); - TabularDataIngest dataIngest = testReader.read(xlsxInputStream, null); + TabularDataIngest dataIngest = testReader.read(xlsxInputStream, false, null); dataTable = dataIngest.getDataTable(); diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java index 627cef08d8b..3b7632f3d9e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java @@ -598,7 +598,12 @@ Whether Harvesting (OAI) service is enabled * Allows an instance admin to disable Solr search facets on the collection * and dataset pages instantly */ - DisableSolrFacets + DisableSolrFacets, + /** + * When ingesting tabular data files, store the generated tab-delimited + * files *with* the variable name header line at the top. + */ + StoreIngestedTabularFilesWithVarHeaders ; @Override diff --git a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java index 3c6992f8ec3..ded394833f1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java @@ -1173,4 +1173,12 @@ public boolean isStorageQuotasEnforced() { public Long getTestStorageQuotaLimit() { return settingsService.getValueForKeyAsLong(SettingsServiceBean.Key.StorageQuotaSizeInBytes); } + /** + * Should we store tab-delimited files produced during ingest *with* the + * variable name header line included? + * @return boolean - defaults to false.
+ */ + public boolean isStoringIngestedFilesWithHeaders() { + return settingsService.isTrueForKey(SettingsServiceBean.Key.StoreIngestedTabularFilesWithVarHeaders, false); + } } diff --git a/src/main/resources/db/migration/V6.1.0.2__8524-store-tabular-files-with-varheaders.sql b/src/main/resources/db/migration/V6.1.0.2__8524-store-tabular-files-with-varheaders.sql new file mode 100644 index 00000000000..7c52a00107a --- /dev/null +++ b/src/main/resources/db/migration/V6.1.0.2__8524-store-tabular-files-with-varheaders.sql @@ -0,0 +1 @@ +ALTER TABLE datatable ADD COLUMN IF NOT EXISTS storedWithVariableHeader BOOLEAN DEFAULT FALSE; diff --git a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java index 915f82a6de2..cfc6f9335b3 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java @@ -16,6 +16,7 @@ import io.restassured.path.xml.XmlPath; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; import edu.harvard.iq.dataverse.util.BundleUtil; +import edu.harvard.iq.dataverse.util.FileUtil; import edu.harvard.iq.dataverse.util.SystemConfig; import java.io.File; import java.io.IOException; @@ -33,6 +34,8 @@ import jakarta.json.JsonObjectBuilder; import static jakarta.ws.rs.core.Response.Status.*; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; import org.hamcrest.CoreMatchers; import org.hamcrest.Matchers; import org.junit.jupiter.api.AfterAll; @@ -2483,4 +2486,129 @@ public void testCollectionStorageQuotas() { UtilIT.deleteSetting(SettingsServiceBean.Key.UseStorageQuotas); } + + @Test + public void testIngestWithAndWithoutVariableHeader() throws NoSuchAlgorithmException { + msgt("testIngestWithAndWithoutVariableHeader"); + + // The compact Stata file we'll be using for this test: + // (this file is provided by Stata inc. 
- it's of genuine quality) + String pathToFile = "scripts/search/data/tabular/stata13-auto.dta"; + // The pre-calculated MD5 signature of the *complete* tab-delimited + // file as seen by the final Access API user (i.e., with the variable + // header line in it): + String tabularFileMD5 = "f298c2567cc8eb544e36ad83edf6f595"; + // Expected byte sizes of the generated tab-delimited file as stored, + // with and without the header (the 87-byte difference is the length + // of the variable name header line): + int tabularFileSizeWoutHeader = 4026; + int tabularFileSizeWithHeader = 4113; + + String apiToken = createUserGetToken(); + String dataverseAlias = createDataverseGetAlias(apiToken); + Integer datasetIdA = createDatasetGetId(dataverseAlias, apiToken); + + // Before we do anything else, make sure that the instance is configured + // the "old" way, i.e., to store ingested files without the headers: + UtilIT.deleteSetting(SettingsServiceBean.Key.StoreIngestedTabularFilesWithVarHeaders); + + Response addResponse = UtilIT.uploadFileViaNative(datasetIdA.toString(), pathToFile, apiToken); + addResponse.prettyPrint(); + + addResponse.then().assertThat() + .body("data.files[0].dataFile.contentType", equalTo("application/x-stata-13")) + .body("data.files[0].label", equalTo("stata13-auto.dta")) + .statusCode(OK.getStatusCode()); + + Long fileIdA = JsonPath.from(addResponse.body().asString()).getLong("data.files[0].dataFile.id"); + assertNotNull(fileIdA); + + // Give file time to ingest + assertTrue(UtilIT.sleepForLock(datasetIdA.longValue(), "Ingest", apiToken, UtilIT.MAXIMUM_INGEST_LOCK_DURATION), "Failed test if Ingest Lock exceeds max duration " + pathToFile + "(A)"); + + // Check the metadata to confirm that the file has ingested: + + Response fileDataResponse = UtilIT.getFileData(fileIdA.toString(), apiToken); + fileDataResponse.prettyPrint(); + fileDataResponse.then().assertThat() + .body("data.dataFile.filename", equalTo("stata13-auto.tab")) + .body("data.dataFile.contentType", equalTo("text/tab-separated-values")) + .body("data.dataFile.filesize", equalTo(tabularFileSizeWoutHeader)) + .statusCode(OK.getStatusCode()); + + + // Download the file, verify the checksum: + + Response fileDownloadResponse = UtilIT.downloadFile(fileIdA.intValue(), apiToken); + fileDownloadResponse.then().assertThat() + .statusCode(OK.getStatusCode()); + + byte[] fileDownloadBytes = fileDownloadResponse.body().asByteArray(); + MessageDigest messageDigest = MessageDigest.getInstance("MD5"); + messageDigest.update(fileDownloadBytes); + byte[] rawDigestBytes = messageDigest.digest(); + String tabularFileMD5calculated = FileUtil.checksumDigestToString(rawDigestBytes); + + msgt("md5 of the downloaded file (saved without the variable name header): "+tabularFileMD5calculated); + + assertEquals(tabularFileMD5, tabularFileMD5calculated); + + // Repeat the whole thing, in another dataset (because we will be uploading + // an identical file), but with the "store with the header" setting enabled: + + UtilIT.enableSetting(SettingsServiceBean.Key.StoreIngestedTabularFilesWithVarHeaders); + + Integer datasetIdB = createDatasetGetId(dataverseAlias, apiToken); + + addResponse = UtilIT.uploadFileViaNative(datasetIdB.toString(), pathToFile, apiToken); + addResponse.prettyPrint(); + + addResponse.then().assertThat() + .body("data.files[0].dataFile.contentType", equalTo("application/x-stata-13")) + .body("data.files[0].label", equalTo("stata13-auto.dta")) + .statusCode(OK.getStatusCode()); + + Long fileIdB = JsonPath.from(addResponse.body().asString()).getLong("data.files[0].dataFile.id"); + assertNotNull(fileIdB);
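The remainder of the test below is the mirror image of the first half: the same file, now stored with the header line, must download byte-identical to the legacy case. The DownloadInstanceWriter changes that guarantee this are not shown in this part of the diff; the following is only a schematic, self-contained sketch of the invariant being asserted, with all names hypothetical:

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.SequenceInputStream;
import java.nio.charset.StandardCharsets;

public class HeaderDispatchSketch {
    // Hypothetical stand-in for the access-side dispatch (not code from this
    // patch): files stored with the header stream as-is, so direct download
    // is possible; legacy files get the header line prepended on the fly.
    static InputStream deliver(byte[] storedBytes, boolean storedWithHeader, String headerLine) {
        if (storedWithHeader) {
            return new ByteArrayInputStream(storedBytes);
        }
        byte[] header = (headerLine + "\n").getBytes(StandardCharsets.UTF_8);
        return new SequenceInputStream(
                new ByteArrayInputStream(header), new ByteArrayInputStream(storedBytes));
    }

    public static void main(String[] args) throws IOException {
        String header = "make\tmpg";
        String rows = "AMC Concord\t22\n";
        byte[] legacy = rows.getBytes(StandardCharsets.UTF_8);
        byte[] withHeader = (header + "\n" + rows).getBytes(StandardCharsets.UTF_8);
        // Both storage layouts must deliver identical bytes to the user:
        String a = new String(deliver(legacy, false, header).readAllBytes(), StandardCharsets.UTF_8);
        String b = new String(deliver(withHeader, true, header).readAllBytes(), StandardCharsets.UTF_8);
        System.out.println(a.equals(b)); // true
    }
}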
+ + // Give file time to ingest + assertTrue(UtilIT.sleepForLock(datasetIdB.longValue(), "Ingest", apiToken, UtilIT.MAXIMUM_INGEST_LOCK_DURATION), "Failed test if Ingest Lock exceeds max duration " + pathToFile + "(B)"); + + // Check the metadata to confirm that the file has ingested: + + fileDataResponse = UtilIT.getFileData(fileIdB.toString(), apiToken); + fileDataResponse.prettyPrint(); + fileDataResponse.then().assertThat() + .body("data.dataFile.filename", equalTo("stata13-auto.tab")) + .body("data.dataFile.contentType", equalTo("text/tab-separated-values")) + .body("data.dataFile.filesize", equalTo(tabularFileSizeWithHeader)) + .statusCode(OK.getStatusCode()); + + + // Download the file, verify the checksum again: + + fileDownloadResponse = UtilIT.downloadFile(fileIdB.intValue(), apiToken); + fileDownloadResponse.then().assertThat() + .statusCode(OK.getStatusCode()); + + fileDownloadBytes = fileDownloadResponse.body().asByteArray(); + messageDigest.reset(); + messageDigest.update(fileDownloadBytes); + rawDigestBytes = messageDigest.digest(); + tabularFileMD5calculated = FileUtil.checksumDigestToString(rawDigestBytes); + + msgt("md5 of the downloaded file (saved with the variable name header): "+tabularFileMD5calculated); + + assertEquals(tabularFileMD5, tabularFileMD5calculated); + + // In other words, whether the file was saved with or without the header, + // the end result, as downloaded by the user, must be the same in both cases! + // That is, whether that first line with the variable names is already + // in the physical file, or added by Dataverse on the fly, the downloaded + // content must be identical. + + UtilIT.deleteSetting(SettingsServiceBean.Key.StoreIngestedTabularFilesWithVarHeaders); + + // @todo: cleanup? + } + } diff --git a/src/test/java/edu/harvard/iq/dataverse/ingest/IngestFrequencyTest.java b/src/test/java/edu/harvard/iq/dataverse/ingest/IngestFrequencyTest.java index 96e314324ab..ca64bcc794f 100644 --- a/src/test/java/edu/harvard/iq/dataverse/ingest/IngestFrequencyTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/ingest/IngestFrequencyTest.java @@ -99,7 +99,7 @@ private DataFile readFileCalcFreq(String fileName, String type ) { TabularDataIngest tabDataIngest = null; try { - tabDataIngest = ingestPlugin.read(fileInputStream, null); + tabDataIngest = ingestPlugin.read(fileInputStream, false, null); } catch (IOException ingestEx) { tabDataIngest = null; System.out.println("Caught an exception trying to ingest file " + fileName + ": " + ingestEx.getLocalizedMessage()); diff --git a/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/CSVFileReaderTest.java b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/CSVFileReaderTest.java index fc066ef195e..9afb35918a4 100644 --- a/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/CSVFileReaderTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/CSVFileReaderTest.java @@ -52,7 +52,7 @@ public void testRead() { try (BufferedInputStream stream = new BufferedInputStream( new FileInputStream(testFile))) { CSVFileReader instance = new CSVFileReader(new CSVFileReaderSpi(), ','); - File outFile = instance.read(stream, null).getTabDelimitedFile(); + File outFile = instance.read(stream, false, null).getTabDelimitedFile(); result = new BufferedReader(new FileReader(outFile)); logger.fine("Final pass: " + outFile.getPath()); } catch (IOException ex) { @@ -104,7 +104,7 @@ public void testVariables() { try
(BufferedInputStream stream = new BufferedInputStream( new FileInputStream(testFile))) { CSVFileReader instance = new CSVFileReader(new CSVFileReaderSpi(), ','); - result = instance.read(stream, null).getDataTable(); + result = instance.read(stream, false, null).getDataTable(); } catch (IOException ex) { fail("" + ex); } @@ -154,7 +154,7 @@ public void testSubset() { new FileInputStream(testFile))) { CSVFileReader instance = new CSVFileReader(new CSVFileReaderSpi(), ','); - ingestResult = instance.read(stream, null); + ingestResult = instance.read(stream, false, null); generatedTabFile = ingestResult.getTabDelimitedFile(); generatedDataTable = ingestResult.getDataTable(); @@ -195,7 +195,7 @@ public void testSubset() { fail("Failed to open generated tab-delimited file for reading" + ioex); } - Double[] columnVector = TabularSubsetGenerator.subsetDoubleVector(generatedTabInputStream, i, generatedDataTable.getCaseQuantity().intValue()); + Double[] columnVector = TabularSubsetGenerator.subsetDoubleVector(generatedTabInputStream, i, generatedDataTable.getCaseQuantity().intValue(), false); assertArrayEquals(floatVectors[vectorCount++], columnVector, "column " + i + ":"); } @@ -229,7 +229,7 @@ public void testSubset() { fail("Failed to open generated tab-delimited file for reading" + ioex); } - Long[] columnVector = TabularSubsetGenerator.subsetLongVector(generatedTabInputStream, i, generatedDataTable.getCaseQuantity().intValue()); + Long[] columnVector = TabularSubsetGenerator.subsetLongVector(generatedTabInputStream, i, generatedDataTable.getCaseQuantity().intValue(), false); assertArrayEquals(longVectors[vectorCount++], columnVector, "column " + i + ":"); } @@ -256,7 +256,7 @@ public void testSubset() { fail("Failed to open generated tab-delimited file for reading" + ioex); } - String[] columnVector = TabularSubsetGenerator.subsetStringVector(generatedTabInputStream, i, generatedDataTable.getCaseQuantity().intValue()); + String[] columnVector = TabularSubsetGenerator.subsetStringVector(generatedTabInputStream, i, generatedDataTable.getCaseQuantity().intValue(), false); assertArrayEquals(stringVectors[vectorCount++], columnVector, "column " + i + ":"); } @@ -298,7 +298,7 @@ public void testVariableUNFs() { new FileInputStream(testFile))) { CSVFileReader instance = new CSVFileReader(new CSVFileReaderSpi(), ','); - ingestResult = instance.read(stream, null); + ingestResult = instance.read(stream, false, null); generatedTabFile = ingestResult.getTabDelimitedFile(); generatedDataTable = ingestResult.getDataTable(); @@ -327,7 +327,7 @@ public void testVariableUNFs() { fail("Failed to open generated tab-delimited file for reading" + ioex); } - Double[] columnVector = TabularSubsetGenerator.subsetDoubleVector(generatedTabInputStream, i, generatedDataTable.getCaseQuantity().intValue()); + Double[] columnVector = TabularSubsetGenerator.subsetDoubleVector(generatedTabInputStream, i, generatedDataTable.getCaseQuantity().intValue(), false); try { unf = UNFUtil.calculateUNF(columnVector); } catch (IOException | UnfException ioex) { @@ -345,7 +345,7 @@ public void testVariableUNFs() { fail("Failed to open generated tab-delimited file for reading" + ioex); } - Long[] columnVector = TabularSubsetGenerator.subsetLongVector(generatedTabInputStream, i, generatedDataTable.getCaseQuantity().intValue()); + Long[] columnVector = TabularSubsetGenerator.subsetLongVector(generatedTabInputStream, i, generatedDataTable.getCaseQuantity().intValue(), false); try { unf = UNFUtil.calculateUNF(columnVector); @@ -363,7 +363,7 
@@ public void testVariableUNFs() { fail("Failed to open generated tab-delimited file for reading" + ioex); } - String[] columnVector = TabularSubsetGenerator.subsetStringVector(generatedTabInputStream, i, generatedDataTable.getCaseQuantity().intValue()); + String[] columnVector = TabularSubsetGenerator.subsetStringVector(generatedTabInputStream, i, generatedDataTable.getCaseQuantity().intValue(), false); String[] dateFormats = null; @@ -401,7 +401,7 @@ public void testVariableUNFs() { public void testBrokenCSV() { String brokenFile = "src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/BrokenCSV.csv"; try { - new CSVFileReader(new CSVFileReaderSpi(), ',').read(null, null); + new CSVFileReader(new CSVFileReaderSpi(), ',').read(null, false, null); fail("IOException not thrown on null csv"); } catch (NullPointerException ex) { String expMessage = null; @@ -412,7 +412,7 @@ public void testBrokenCSV() { } try (BufferedInputStream stream = new BufferedInputStream( new FileInputStream(brokenFile))) { - new CSVFileReader(new CSVFileReaderSpi(), ',').read(stream, null); + new CSVFileReader(new CSVFileReaderSpi(), ',').read(stream, false, null); fail("IOException was not thrown when collumns do not align."); } catch (IOException ex) { String expMessage = BundleUtil.getStringFromBundle("ingest.csv.recordMismatch", diff --git a/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTAFileReaderTest.java b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTAFileReaderTest.java index 113e9be6b54..8af36d6466d 100644 --- a/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTAFileReaderTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTAFileReaderTest.java @@ -16,7 +16,7 @@ public class DTAFileReaderTest { @Test public void testOs() throws IOException { - TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("scripts/search/data/tabular/50by1000.dta"))), nullDataFile); + TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("scripts/search/data/tabular/50by1000.dta"))), false, nullDataFile); assertEquals("application/x-stata", result.getDataTable().getOriginalFileFormat()); assertEquals("rel_8_or_9", result.getDataTable().getOriginalFormatVersion()); assertEquals(50, result.getDataTable().getDataVariables().size()); diff --git a/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/NewDTAFileReaderTest.java b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/NewDTAFileReaderTest.java index c963346b05e..0f14054f472 100644 --- a/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/NewDTAFileReaderTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/NewDTAFileReaderTest.java @@ -25,7 +25,7 @@ public void testAuto() throws IOException { instance = new NewDTAFileReader(null, 117); // From https://www.stata-press.com/data/r13/auto.dta // `strings` shows "
117" - TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("scripts/search/data/tabular/stata13-auto.dta"))), nullDataFile); + TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("scripts/search/data/tabular/stata13-auto.dta"))), false, nullDataFile); assertEquals("application/x-stata", result.getDataTable().getOriginalFileFormat()); assertEquals("STATA 13", result.getDataTable().getOriginalFormatVersion()); assertEquals(12, result.getDataTable().getDataVariables().size()); @@ -39,7 +39,7 @@ public void testAuto() throws IOException { @Test public void testStrl() throws IOException { instance = new NewDTAFileReader(null, 118); - TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File(base + "strl.dta"))), nullDataFile); + TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File(base + "strl.dta"))), false, nullDataFile); DataTable table = result.getDataTable(); assertEquals("application/x-stata", table.getOriginalFileFormat()); assertEquals("STATA 14", table.getOriginalFormatVersion()); @@ -58,7 +58,7 @@ public void testStrl() throws IOException { @Test public void testDates() throws IOException { instance = new NewDTAFileReader(null, 118); - TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File(base + "dates.dta"))), nullDataFile); + TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File(base + "dates.dta"))), false, nullDataFile); DataTable table = result.getDataTable(); assertEquals("application/x-stata", table.getOriginalFileFormat()); assertEquals("STATA 14", table.getOriginalFormatVersion()); @@ -77,7 +77,7 @@ public void testDates() throws IOException { @Test void testNull() { instance = new NewDTAFileReader(null, 117); - assertThrows(IOException.class, () -> instance.read(null, new File(""))); + assertThrows(IOException.class, () -> instance.read(null, false, new File(""))); } // TODO: Can we create a small file to check into the code base that exercises the value-label names non-zero offset issue? 
@@ -87,7 +87,7 @@ public void testFirstCategoryNonZeroOffset() throws IOException { instance = new NewDTAFileReader(null, 117); // https://dataverse.harvard.edu/file.xhtml?fileId=2865667 Stata 13 HouseImputingCivilRightsInfo.dta md5=7dd144f27cdb9f8d1c3f4eb9c4744c42 - TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("/tmp/HouseImputingCivilRightsInfo.dta"))), nullDataFile); + TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("/tmp/HouseImputingCivilRightsInfo.dta"))), false, nullDataFile); assertEquals("application/x-stata", result.getDataTable().getOriginalFileFormat()); assertEquals("STATA 13", result.getDataTable().getOriginalFormatVersion()); assertEquals(5, result.getDataTable().getDataVariables().size()); @@ -107,7 +107,7 @@ public void testFirstCategoryNonZeroOffset() throws IOException { public void testFirstCategoryNonZeroOffset1() throws IOException { instance = new NewDTAFileReader(null, 118); // https://dataverse.harvard.edu/file.xhtml?fileId=3140457 Stata 14: 2018_04_06_Aggregated_dataset_v2.dta - TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("/tmp/2018_04_06_Aggregated_dataset_v2.dta"))), nullDataFile); + TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("/tmp/2018_04_06_Aggregated_dataset_v2.dta"))), false, nullDataFile); assertEquals("application/x-stata", result.getDataTable().getOriginalFileFormat()); assertEquals("STATA 14", result.getDataTable().getOriginalFormatVersion()); assertEquals(227, result.getDataTable().getDataVariables().size()); @@ -136,7 +136,7 @@ public void test33k() throws IOException { @Test public void testCharacteristics() throws IOException { instance = new NewDTAFileReader(null, 117); - TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("/tmp/15aa6802ee5-5d2ed1bf55a5.dta"))), nullDataFile); + TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("/tmp/15aa6802ee5-5d2ed1bf55a5.dta"))), false, nullDataFile); assertEquals("application/x-stata", result.getDataTable().getOriginalFileFormat()); assertEquals("STATA 13", result.getDataTable().getOriginalFormatVersion()); assertEquals(441, result.getDataTable().getDataVariables().size());
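Finally, the fourth boolean argument added at every TabularSubsetGenerator call site in this patch implies a simple contract: when the stored tab file carries a variable name header, the subsetter must discard line 1 before collecting column values. Below is a minimal, independent illustration of that contract; it is a sketch of the idea, not the TabularSubsetGenerator implementation:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;

public class SubsetSketch {
    // Extract one column from tab-delimited data, optionally skipping a
    // variable name header line - the contract implied by the new boolean.
    static String[] subsetStringColumn(BufferedReader in, int column,
                                       int caseCount, boolean skipHeader) throws IOException {
        if (skipHeader) {
            in.readLine(); // throw away the variable name line
        }
        String[] result = new String[caseCount];
        for (int i = 0; i < caseCount; i++) {
            result[i] = in.readLine().split("\t", -1)[column];
        }
        return result;
    }

    public static void main(String[] args) throws IOException {
        String tab = "make\tmpg\nAMC Concord\t22\nAMC Pacer\t17\n";
        String[] col = subsetStringColumn(
                new BufferedReader(new StringReader(tab)), 0, 2, true);
        for (String value : col) {
            System.out.println(value); // AMC Concord, AMC Pacer
        }
    }
}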