diff --git a/pom.xml b/pom.xml index 993e11a7..76b89361 100644 --- a/pom.xml +++ b/pom.xml @@ -43,7 +43,7 @@ 2.8.0 lib/tika - 6.5.2 + 6.7.0 lib/droid 1.28.0 lib/jhove diff --git a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/ArchiveContentIdentifier.java b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/ArchiveContentIdentifier.java deleted file mode 100644 index 7c71baec..00000000 --- a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/ArchiveContentIdentifier.java +++ /dev/null @@ -1,247 +0,0 @@ -/** - * This file has been modified by Harvard University, June, 2017, for the purposes of incorporating - * into the FITS application. The original can be found here: https://github.com/digital-preservation/droid - * - * Copyright (c) 2016, The National Archives - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following - * conditions are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * * Neither the name of the The National Archives nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ -package edu.harvard.hul.ois.fits.tools.droid; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.util.Map; -import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import uk.gov.nationalarchives.droid.command.action.CommandExecutionException; -import uk.gov.nationalarchives.droid.container.ContainerSignatureDefinitions; -import uk.gov.nationalarchives.droid.core.BinarySignatureIdentifier; -import uk.gov.nationalarchives.droid.core.interfaces.IdentificationRequest; -import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResultCollection; -import uk.gov.nationalarchives.droid.profile.referencedata.Format; - -/** - * Parent class for Containers. - * - * @author G.Seaman - * - */ -public abstract class ArchiveContentIdentifier { - - protected String slash; - protected String slash1; - protected BinarySignatureIdentifier binarySignatureIdentifier; - protected ContainerSignatureDefinitions containerSignatureDefinitions; - protected File tmpDir; - protected String path; - private Boolean expandWebArchives; - private final Map puidFormatMap; - - private static final Logger logger = LoggerFactory.getLogger(ArchiveContentIdentifier.class); - - /** - * Initialization of instance values must be explicitly called by all children. 
- * @param binarySignatureIdentifier binary signature identifier - * @param containerSignatureDefinitions container signatures - * @param path current archive path - * @param slash local path element delimiter - * @param slash1 local first container prefix delimiter - * @param expandWebArchives optionally expand (W)ARC files - * @param puidFormatMap map of puids to formats - */ - public ArchiveContentIdentifier( - final BinarySignatureIdentifier binarySignatureIdentifier, - final ContainerSignatureDefinitions containerSignatureDefinitions, - final String path, - final String slash, - final String slash1, - final Boolean expandWebArchives, - final Map puidFormatMap) { - - synchronized (this) { - setBinarySignatureIdentifier(binarySignatureIdentifier); - setContainerSignatureDefinitions(containerSignatureDefinitions); - setPath(path); - setSlash(slash); - setSlash1(slash1); - setExpandWebArchives(expandWebArchives); - if (getTmpDir() == null) { - setTmpDir(new File(System.getProperty("java.io.tmpdir"))); - } - this.puidFormatMap = puidFormatMap; - } - } - /** - * @return local path element delimiter - */ - protected String getSlash() { - return slash; - } - /** - * @param newSlash path element delimiter - */ - protected void setSlash(String newSlash) { - this.slash = newSlash; - } - /** - * @return container element delimiter - */ - protected String getSlash1() { - return slash1; - } - /** - * @param newSlash1 container element delimiter - */ - protected void setSlash1(String newSlash1) { - this.slash1 = newSlash1; - } - /** - * @return binary signature identifier - */ - protected BinarySignatureIdentifier getBinarySignatureIdentifier() { - return binarySignatureIdentifier; - } - /** - * @param bis binary signature identifier - */ - protected void setBinarySignatureIdentifier(BinarySignatureIdentifier bis) { - this.binarySignatureIdentifier = bis; - } - /** - * @return container signatures - */ - protected ContainerSignatureDefinitions 
getContainerSignatureDefinitions() { - return containerSignatureDefinitions; - } - /** - * @param csd container signatures - */ - protected void setContainerSignatureDefinitions(ContainerSignatureDefinitions csd) { - this.containerSignatureDefinitions = csd; - } - - /** - * @return temporary file directory - */ - protected File getTmpDir() { - return tmpDir; - } - /** - * @param tmpDir temporary file directory - */ - protected void setTmpDir(File tmpDir) { - this.tmpDir = tmpDir; - } - /** - * @return archive path - */ - protected String getPath() { - return path; - } - /** - * @param path archive path - */ - protected void setPath(String path) { - this.path = path; - } - /** - * @return whether to expand (W)ARCs - */ - protected Boolean getExpandWebArchives() { - return expandWebArchives; - } - /** - * @param ewa whether to expand (W)ARCs - */ - protected void setExpandWebArchives(Boolean ewa) { - this.expandWebArchives = ewa; - } - - /** - * - * @param prefix String describing container-type - * @param filename Name of file - * @return URI for container - */ - protected String makeContainerURI(String prefix, String filename) { - return prefix + ":" + getSlash1() + getPath() + filename + "!" 
+ getSlash(); - } - - /** - * @param request The request - * @param in The container input stream - * @param newPath Path for the Container file - * @param aggregator Aggregates ZIP file container information - * @throws CommandExecutionException When an exception happens during execution - */ - protected void expandContainer( - IdentificationRequest request, InputStream in, String newPath, ContainerAggregator aggregator) - throws CommandExecutionException, UnsupportedZipFeatureException { - - try { - request.open(in); - IdentificationResultCollection results = - getBinarySignatureIdentifier().matchBinarySignatures(request); - - if (results.getResults().isEmpty()) { - results = binarySignatureIdentifier.matchExtensions(request, true); - } - - final ResultPrinter resultPrinter = new ResultPrinter( - getBinarySignatureIdentifier(), - getContainerSignatureDefinitions(), - newPath, - getSlash(), - getSlash1(), - true, - getExpandWebArchives(), - aggregator, - puidFormatMap); - - resultPrinter.print(results, request); - request.close(); - } catch (UnsupportedZipFeatureException e) { - // output: org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException: unsupported feature - // encryption used in entry Book_pdfx1a.pdf - throw e; - } catch (IOException ioe) { - logger.warn(ioe + " " + newPath); - } finally { - try { - // make sure no temp files are left behind - request.close(); - } catch (IOException ioe) { - logger.warn("Failed to close temp file for Container request:" + ioe); - // not a lot we can do here - warning msg already given and deleteOnExit set - } - } - } -} diff --git a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/CollectingResultHandler.java b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/CollectingResultHandler.java new file mode 100644 index 00000000..58a90489 --- /dev/null +++ b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/CollectingResultHandler.java @@ -0,0 +1,79 @@ +// +// Copyright (c) 2023 by The President and 
Fellows of Harvard College +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. You may obtain a copy of the License at: +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the License is +// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permission and limitations under the License. +// + +package edu.harvard.hul.ois.fits.tools.droid; + +import java.util.ArrayList; +import java.util.List; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import uk.gov.nationalarchives.droid.core.interfaces.IdentificationException; +import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResult; +import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResultCollection; +import uk.gov.nationalarchives.droid.core.interfaces.ResourceId; +import uk.gov.nationalarchives.droid.core.interfaces.ResultHandler; +import uk.gov.nationalarchives.droid.core.interfaces.filter.Filter; + +/** + * Droid calls this class whenever it identifies a file, and we collect the results. This is necessary because it's + * the only way to get the results from archive contents. Droid only returns the result for the specified file directly, + * and will not return the results for any files that are contained within the specified file. + *

+ * This class is NOT THREAD SAFE. You must use a different instance per thread, and you must call {@link #reset()} + * between files. + */ +class CollectingResultHandler implements ResultHandler { + + private static final Logger log = LoggerFactory.getLogger(CollectingResultHandler.class); + + private final List results = new ArrayList<>(); + + /** + * Clears the accumulated results in preparation for processing a new file. + */ + public void reset() { + results.clear(); + } + + /** + * @return the accumulated identification results + */ + public List getResults() { + return List.copyOf(results); + } + + @Override + public ResourceId handle(IdentificationResultCollection identificationResultCollection) { + results.add(identificationResultCollection); + return new ResourceId(DroidId.nextId(), ""); + } + + @Override + public ResourceId handleDirectory(IdentificationResult identificationResult, ResourceId resourceId, boolean b) { + return new ResourceId(DroidId.nextId(), ""); + } + + @Override + public void handleError(IdentificationException e) { + log.warn("DROID identification error", e); + } + + @Override + public void deleteCascade(Long aLong) {} + + @Override + public void commit() {} + + @Override + public void init() {} + + @Override + public void setResultsFilter(Filter filter) {} +} diff --git a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/ContainerAggregator.java b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/ContainerAggregator.java deleted file mode 100644 index 98ff538f..00000000 --- a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/ContainerAggregator.java +++ /dev/null @@ -1,163 +0,0 @@ -// -// Copyright (c) 2017 by The President and Fellows of Harvard College -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
You may obtain a copy of the License at: -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software distributed under the License is -// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permission and limitations under the License. -// - -package edu.harvard.hul.ois.fits.tools.droid; - -import java.util.Collections; -import java.util.Map; -import java.util.TreeMap; -import java.util.zip.ZipEntry; - -/** - * This class aggregates data from Droid about container (ZIP) type files. - * - * @author dan179 - */ -public class ContainerAggregator { - - // Maps format type to number of these types of files with a ZIP file - private final Map formatToCount; - - // Aggregated original size of all files contained within the ZIP file. - private long originalSize; - - // Aggregated compressed size of all files contained within the ZIP file. - // If ZIP is not compressed then should equal the original size. - private long compressedSize; - - private boolean isEncrypted = false; - - private static final String UNKNOWN_FORMAT = "Unknown"; - - public ContainerAggregator() { - formatToCount = new TreeMap<>(); // order entries by key for the sake of XMLUnit tests - } - - /** - * Aggregated original size of files with a ZIP file. - * - * @return Original size of all files within ZIP file in bytes. - */ - public long getOriginalSize() { - return originalSize; - } - - /** - * Increment the calculated original size of the examined ZIP file by the original size of a contained file. - */ - public void incrementOriginalSize(long originalSize) { - this.originalSize += originalSize; - } - - /** - * Aggregated compressed size of files with a ZIP file. - * - * @return Compressed size of all files within ZIP file in bytes. 
- */ - public long getCompressedSize() { - return compressedSize; - } - - /** - * Increment the compressed size of the examined ZIP file by the original size of a contained file. - */ - public void incrementCompressedSize(long compressedSize) { - this.compressedSize += compressedSize; - } - - /** - * Add a format type to this collection and increment count for this type. - */ - public void addFormat(String format) { - if (format != null) { - Integer cnt = formatToCount.get(format); - if (cnt == null) { - formatToCount.put(format, 1); - } else { - cnt++; - formatToCount.put(format, cnt); - } - } - } - - public void incrementUnknownFormat() { - Integer cnt = formatToCount.get(UNKNOWN_FORMAT); - if (cnt == null) { - formatToCount.put(UNKNOWN_FORMAT, 1); - } else { - cnt++; - formatToCount.put(UNKNOWN_FORMAT, cnt); - } - } - - /** - * A Map of format type to number of each format type. - * - * @return Format to count mapping - */ - public Map getFormatCounts() { - return Collections.unmodifiableMap(formatToCount); - } - - /** - * Total number of all format types added to this collection. - * - * @return Total number for formats added to this collection. - */ - public int getTotalEntriesCount() { - int total = 0; - for (Integer val : formatToCount.values()) { - total += val; - } - return total; - } - - /** - * The compression method as defined the Java ZipEntry. Currently only values for 'stored' (uncompressed) - * and 'deflate' (compressed) are used. - * - * @return The value corresponding to - * @see java.util.zip.ZipEntry - */ - public int getCompressionMethod() { - return getCompressedSize() < getOriginalSize() ? ZipEntry.DEFLATED : ZipEntry.STORED; - } - - /** - * Whether the container being examined is encrypted. - */ - public boolean isEncrypted() { - return isEncrypted; - } - - /** - * Sets whether this container being examined is encrypted. 
- */ - public void setEncrypted(boolean isEncrypted) { - this.isEncrypted = isEncrypted; - } - - @Override - public String toString() { - StringBuilder builder = new StringBuilder(); - builder.append("ContainerAggregator [formatToCount="); - builder.append(formatToCount); - builder.append("]"); - builder.append(" total count: "); - builder.append(getTotalEntriesCount()); - builder.append(", originalSize: "); - builder.append(originalSize); - builder.append(", compressedSize: "); - builder.append(compressedSize); - builder.append(", isEncrypted: "); - builder.append(isEncrypted); - return builder.toString(); - } -} diff --git a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/Droid.java b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/Droid.java index a6733e0f..15537745 100644 --- a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/Droid.java +++ b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/Droid.java @@ -11,41 +11,19 @@ package edu.harvard.hul.ois.fits.tools.droid; import edu.harvard.hul.ois.fits.Fits; -import edu.harvard.hul.ois.fits.FitsMetadataValues; import edu.harvard.hul.ois.fits.exceptions.FitsToolException; import edu.harvard.hul.ois.fits.tools.ToolBase; import edu.harvard.hul.ois.fits.tools.ToolInfo; import edu.harvard.hul.ois.fits.tools.ToolOutput; import java.io.File; -import java.io.IOException; import java.io.PrintWriter; import java.io.StringWriter; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.time.Duration; +import java.time.Instant; import org.apache.commons.configuration.XMLConfiguration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import uk.gov.nationalarchives.droid.command.action.VersionCommand; -import uk.gov.nationalarchives.droid.container.ContainerFileIdentificationRequestFactory; -import uk.gov.nationalarchives.droid.container.ContainerSignatureDefinitions; -import uk.gov.nationalarchives.droid.container.ContainerSignatureFileReader; -import 
uk.gov.nationalarchives.droid.container.ole2.Ole2Identifier; -import uk.gov.nationalarchives.droid.container.ole2.Ole2IdentifierEngine; -import uk.gov.nationalarchives.droid.container.zip.ZipIdentifier; -import uk.gov.nationalarchives.droid.container.zip.ZipIdentifierEngine; -import uk.gov.nationalarchives.droid.core.BinarySignatureIdentifier; -import uk.gov.nationalarchives.droid.core.SignatureParseException; -import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResult; -import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResultCollection; -import uk.gov.nationalarchives.droid.core.interfaces.archive.ArchiveFormatResolver; -import uk.gov.nationalarchives.droid.core.interfaces.archive.ArchiveFormatResolverImpl; -import uk.gov.nationalarchives.droid.core.interfaces.archive.ContainerIdentifierFactory; -import uk.gov.nationalarchives.droid.core.interfaces.archive.ContainerIdentifierFactoryImpl; -import uk.gov.nationalarchives.droid.profile.referencedata.Format; -import uk.gov.nationalarchives.droid.signature.SaxSignatureFileParser; -import uk.gov.nationalarchives.droid.signature.SignatureParser; /** The principal glue class for invoking DROID under FITS. 
*/ @@ -53,17 +31,8 @@ public class Droid extends ToolBase { private boolean enabled = true; private final Fits fits; - private final List includeExts; - private long kbReadLimit; - private static File sigFile; - private static final BinarySignatureIdentifier sigIdentifier = new BinarySignatureIdentifier(); - private static final ContainerIdentifierFactory containerIdentifierFactory = new ContainerIdentifierFactoryImpl(); - private static final ArchiveFormatResolver containerFormatResolver = new ArchiveFormatResolverImpl(); - private static ContainerSignatureDefinitions containerSignatureDefinitions; - private static final Map puidFormatMap = new HashMap<>(2500); - - private static final List CONTAINER_TYPE_MIMETYPES = Arrays.asList("application/zip"); + private final DroidWrapper droidWrapper; private static final Logger logger = LoggerFactory.getLogger(Droid.class); @@ -74,118 +43,33 @@ public Droid(Fits fits) throws FitsToolException { info = new ToolInfo("Droid", getDroidVersion(), null); try { - String droid_conf = Fits.FITS_TOOLS_DIR + "droid" + File.separator; XMLConfiguration config = fits.getConfig(); - // only need a single Droid signature file. 
- if (sigFile == null) { - synchronized (this) { - if (sigFile == null) { - sigFile = new File(droid_conf + config.getString("droid_sigfile")); - sigIdentifier.setSignatureFile(sigFile.getAbsolutePath()); - sigIdentifier.init(); - - // The following is necessary to init the code that identifies formats like docx, xlsx, etc - SignatureParser sigParser = new SaxSignatureFileParser(sigFile.toURI()); - sigParser.formats(format -> { - puidFormatMap.put(format.getPuid(), format); - }); - - String containerSigFile = droid_conf + config.getString("droid_container_sigfile"); - ContainerSignatureFileReader signatureReader = - new ContainerSignatureFileReader(containerSigFile); - - containerSignatureDefinitions = signatureReader.getDefinitions(); - - ZipIdentifierEngine zipIdentifierEngine = new ZipIdentifierEngine(); - zipIdentifierEngine.setRequestFactory(new ContainerFileIdentificationRequestFactory()); - - ZipIdentifier zipIdentifier = new ZipIdentifier(); - zipIdentifier.setContainerType("ZIP"); - zipIdentifier.setContainerIdentifierFactory(containerIdentifierFactory); - zipIdentifier.setContainerFormatResolver(containerFormatResolver); - zipIdentifier.setDroidCore(sigIdentifier); - zipIdentifier.setIdentifierEngine(zipIdentifierEngine); - zipIdentifier.setSignatureReader(signatureReader); - zipIdentifier.init(); - - Ole2IdentifierEngine ole2IdentifierEngine = new Ole2IdentifierEngine(); - ole2IdentifierEngine.setRequestFactory(new ContainerFileIdentificationRequestFactory()); - - Ole2Identifier ole2Identifier = new Ole2Identifier(); - ole2Identifier.setContainerType("OLE2"); - ole2Identifier.setContainerIdentifierFactory(containerIdentifierFactory); - ole2Identifier.setContainerFormatResolver(containerFormatResolver); - ole2Identifier.setDroidCore(sigIdentifier); - ole2Identifier.setIdentifierEngine(ole2IdentifierEngine); - ole2Identifier.setSignatureReader(signatureReader); - ole2Identifier.init(); - } - } - } - includeExts = (List) (List) 
config.getList("droid_read_limit[@include-exts]"); - String limit = config.getString("droid_read_limit[@read-limit-kb]"); - kbReadLimit = -1L; - if (limit != null) { - try { - kbReadLimit = Long.parseLong(limit); - } catch (NumberFormatException nfe) { - throw new FitsToolException( - "Invalid long value in fits.xml droid_read_limit[@read-limit-kb]: " + limit, nfe); - } - } + droidWrapper = DroidWrapperFactory.getOrCreateFactory(DroidConfig.fromFitsConfig(config)) + .createInstance(); } catch (Throwable e) { - throw new FitsToolException("Error initilizing DROID", e); + throw new FitsToolException("Error initializing DROID", e); } } @Override public ToolOutput extractInfo(File file) throws FitsToolException { - logger.debug("Droid.extractInfo starting on " + file.getName()); - long startTime = System.currentTimeMillis(); - IdentificationResultCollection results; - ContainerAggregator aggregator = null; + logger.debug("Droid.extractInfo starting on {}", file.getName()); + Instant startTime = Instant.now(); + + DroidResult result; + try { - DroidQuery droidQuery = new DroidQuery( - sigIdentifier, - containerIdentifierFactory, - containerFormatResolver, - puidFormatMap, - containerSignatureDefinitions, - includeExts, - kbReadLimit, - file); - // the following will almost always return a single result - results = droidQuery.queryFile(); - for (IdentificationResult res : results.getResults()) { - String mimeType = res.getMimeType(); - - if (FitsMetadataValues.getInstance().normalizeMimeType(mimeType) != null) { - mimeType = FitsMetadataValues.getInstance().normalizeMimeType(mimeType); - } - - String fileName = file.getName(); - int lastDot = fileName.lastIndexOf('.'); - String extension = ""; - if (lastDot > -1) { - extension = fileName.substring(lastDot + 1); - } - - if (CONTAINER_TYPE_MIMETYPES.contains(mimeType) && "zip".equals(extension)) { - aggregator = droidQuery.queryContainerData(results); - } - } - - } catch (IOException e) { + result = 
droidWrapper.analyze(file.toPath()); + } catch (Exception e) { throw new FitsToolException("DROID can't query file " + file.getAbsolutePath(), e); - } catch (SignatureParseException e) { - throw new FitsToolException("Problem with DROID signature file"); } - DroidToolOutputter outputter = new DroidToolOutputter(this, results, fits, aggregator); + + DroidToolOutputter outputter = new DroidToolOutputter(this, fits, result); ToolOutput output = outputter.toToolOutput(); - duration = System.currentTimeMillis() - startTime; + duration = Duration.between(startTime, Instant.now()).toMillis(); runStatus = RunStatus.SUCCESSFUL; - logger.debug("Droid.extractInfo finished on " + file.getName()); + logger.debug("Droid.extractInfo finished on {}", file.getName()); return output; } diff --git a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidConfig.java b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidConfig.java new file mode 100644 index 00000000..7a52b09c --- /dev/null +++ b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidConfig.java @@ -0,0 +1,203 @@ +// +// Copyright (c) 2023 by The President and Fellows of Harvard College +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. You may obtain a copy of the License at: +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the License is +// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permission and limitations under the License. 
+// + +package edu.harvard.hul.ois.fits.tools.droid; + +import edu.harvard.hul.ois.fits.Fits; +import edu.harvard.hul.ois.fits.exceptions.FitsToolException; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import org.apache.commons.configuration.XMLConfiguration; + +class DroidConfig { + + private Path sigFile; + private Path containerSigFile; + private Path tempDir; + private Set extsToLimitBytesRead = Collections.emptySet(); + private long byteReadLimit = -1; + private boolean processZip = true; + private boolean processTar = true; + private boolean processGzip = true; + private boolean processRar = true; + private boolean process7zip = true; + private boolean processIso = true; + private boolean processBzip2 = true; + private boolean processArc = true; + private boolean processWarc = true; + + public static DroidConfig fromFitsConfig(XMLConfiguration fitsConfig) throws FitsToolException { + var droidConfig = new DroidConfig(); + + Path droidConfigDir = Paths.get(Fits.FITS_TOOLS_DIR + "droid"); + droidConfig.setSigFile(droidConfigDir.resolve(fitsConfig.getString("droid_sigfile"))); + droidConfig.setContainerSigFile(droidConfigDir.resolve(fitsConfig.getString("droid_container_sigfile"))); + + String tempStr = fitsConfig.getString("process.tmpdir", System.getProperty("java.io.tmpdir")); + droidConfig.setTempDir(tempStr == null ? 
null : Paths.get(tempStr)); + + droidConfig.setExtsToLimitBytesRead( + new HashSet<>((List) (List) fitsConfig.getList("droid_read_limit[@include-exts]"))); + String limit = fitsConfig.getString("droid_read_limit[@read-limit-kb]"); + long kbReadLimit = -1L; + if (limit != null) { + try { + kbReadLimit = Long.parseLong(limit); + } catch (NumberFormatException nfe) { + throw new FitsToolException( + "Invalid long value in fits.xml droid_read_limit[@read-limit-kb]: " + limit, nfe); + } + } + + droidConfig.setByteReadLimit(kbReadLimit == -1 ? -1 : 1024 * kbReadLimit); + + droidConfig.setProcessZip(fitsConfig.getBoolean("droid.process.zip", true)); + droidConfig.setProcessTar(fitsConfig.getBoolean("droid.process.tar", true)); + droidConfig.setProcessGzip(fitsConfig.getBoolean("droid.process.gzip", true)); + droidConfig.setProcessArc(fitsConfig.getBoolean("droid.process.arc", true)); + droidConfig.setProcessWarc(fitsConfig.getBoolean("droid.process.warc", true)); + droidConfig.setProcessBzip2(fitsConfig.getBoolean("droid.process.bzip2", true)); + droidConfig.setProcess7zip(fitsConfig.getBoolean("droid.process.seven-zip", true)); + droidConfig.setProcessIso(fitsConfig.getBoolean("droid.process.iso", true)); + droidConfig.setProcessRar(fitsConfig.getBoolean("droid.process.rar", true)); + + return droidConfig; + } + + public Path getSigFile() { + return sigFile; + } + + public DroidConfig setSigFile(Path sigFile) { + this.sigFile = sigFile; + return this; + } + + public Path getContainerSigFile() { + return containerSigFile; + } + + public DroidConfig setContainerSigFile(Path containerSigFile) { + this.containerSigFile = containerSigFile; + return this; + } + + public Path getTempDir() { + return tempDir; + } + + public DroidConfig setTempDir(Path tempDir) { + this.tempDir = tempDir; + return this; + } + + public Set getExtsToLimitBytesRead() { + return extsToLimitBytesRead; + } + + public DroidConfig setExtsToLimitBytesRead(Set extsToLimitBytesRead) { + 
this.extsToLimitBytesRead = extsToLimitBytesRead; + return this; + } + + public long getByteReadLimit() { + return byteReadLimit; + } + + public DroidConfig setByteReadLimit(long byteReadLimit) { + this.byteReadLimit = byteReadLimit; + return this; + } + + public boolean isProcessZip() { + return processZip; + } + + public DroidConfig setProcessZip(boolean processZip) { + this.processZip = processZip; + return this; + } + + public boolean isProcessTar() { + return processTar; + } + + public DroidConfig setProcessTar(boolean processTar) { + this.processTar = processTar; + return this; + } + + public boolean isProcessGzip() { + return processGzip; + } + + public DroidConfig setProcessGzip(boolean processGzip) { + this.processGzip = processGzip; + return this; + } + + public boolean isProcessRar() { + return processRar; + } + + public DroidConfig setProcessRar(boolean processRar) { + this.processRar = processRar; + return this; + } + + public boolean isProcess7zip() { + return process7zip; + } + + public DroidConfig setProcess7zip(boolean process7zip) { + this.process7zip = process7zip; + return this; + } + + public boolean isProcessIso() { + return processIso; + } + + public DroidConfig setProcessIso(boolean processIso) { + this.processIso = processIso; + return this; + } + + public boolean isProcessBzip2() { + return processBzip2; + } + + public DroidConfig setProcessBzip2(boolean processBzip2) { + this.processBzip2 = processBzip2; + return this; + } + + public boolean isProcessArc() { + return processArc; + } + + public DroidConfig setProcessArc(boolean processArc) { + this.processArc = processArc; + return this; + } + + public boolean isProcessWarc() { + return processWarc; + } + + public DroidConfig setProcessWarc(boolean processWarc) { + this.processWarc = processWarc; + return this; + } +} diff --git a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidId.java b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidId.java new file mode 100644 index 
00000000..3146c073 --- /dev/null +++ b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidId.java @@ -0,0 +1,33 @@ +// +// Copyright (c) 2023 by The President and Fellows of Harvard College +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. You may obtain a copy of the License at: +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the License is +// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permission and limitations under the License. +// + +package edu.harvard.hul.ois.fits.tools.droid; + +import java.util.concurrent.atomic.AtomicLong; + +/** + * Every file Droid process needs an ID. These ids are normally assigned by adding the resource to a db, but FITS + * doesn't use a DB. Instead, we just generate the id here. + */ +final class DroidId { + + private static final AtomicLong ID = new AtomicLong(1); + + private DroidId() { + // noop + } + + /** + * @return new id + */ + public static long nextId() { + return ID.getAndIncrement(); + } +} diff --git a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidQuery.java b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidQuery.java deleted file mode 100644 index 0e7d26bf..00000000 --- a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidQuery.java +++ /dev/null @@ -1,203 +0,0 @@ -// -// Copyright (c) 2016 by The President and Fellows of Harvard College -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
You may obtain a copy of the License at: -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software distributed under the License is -// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permission and limitations under the License. -// - -/* Droid 6.1 has no nicely packaged way to make simple queries. This - * class attempts to fill that gap for FITS, in a way that will let it - * be lifted for other uses and perhaps incorporated into Droid itself. - */ -package edu.harvard.hul.ois.fits.tools.droid; - -import edu.harvard.hul.ois.fits.exceptions.FitsToolException; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.List; -import java.util.Map; -import uk.gov.nationalarchives.droid.command.action.CommandExecutionException; -import uk.gov.nationalarchives.droid.container.ContainerSignatureDefinitions; -import uk.gov.nationalarchives.droid.core.BinarySignatureIdentifier; -import uk.gov.nationalarchives.droid.core.SignatureParseException; -import uk.gov.nationalarchives.droid.core.interfaces.IdentificationRequest; -import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResult; -import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResultCollection; -import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResultImpl; -import uk.gov.nationalarchives.droid.core.interfaces.RequestIdentifier; -import uk.gov.nationalarchives.droid.core.interfaces.archive.ArchiveFormatResolver; -import uk.gov.nationalarchives.droid.core.interfaces.archive.ContainerIdentifier; -import uk.gov.nationalarchives.droid.core.interfaces.archive.ContainerIdentifierFactory; -import uk.gov.nationalarchives.droid.core.interfaces.resource.FileSystemIdentificationRequest; -import 
uk.gov.nationalarchives.droid.core.interfaces.resource.RequestMetaData; -import uk.gov.nationalarchives.droid.profile.referencedata.Format; - -public class DroidQuery { - - private final BinarySignatureIdentifier sigIdentifier; - private final ContainerIdentifierFactory containerIdentifierFactory; - private final ArchiveFormatResolver containerFormatResolver; - private final Map puidFormatMap; - private final ContainerSignatureDefinitions containerSignatureDefinitions; - // Certain file types (possibly really large file), we only want to examine the beginning of the file. - private long bytesToRead = -1; - private final List fileExtensions; // file extensions for files on which to apply file read limit - private final File file; // input file that is being processed - - /** - * Create a DroidQuery object. This can be retained for any number of - * different queries. - * - * @param sigIdentifier BinarySignatureIdentifier for a Droid signature file - * @param containerIdentifierFactory container identifier - * @param containerFormatResolver container format resolver - * @param puidFormatMap map of puids to formats - * @param containerSignatureDefinitions container sig definitions - * @param includeExts File extensions to include for possibly limiting number of bytes to read of file to process. - * @param kbReadLimit Number of bytes to process in KB from the beginning of the file. -1 indicates read entire file. - * @param file The file to be processed by DROID. - * @throws SignatureParseException If there is a problem processing the DROID signature file. 
- */ - public DroidQuery( - BinarySignatureIdentifier sigIdentifier, - ContainerIdentifierFactory containerIdentifierFactory, - ArchiveFormatResolver containerFormatResolver, - Map puidFormatMap, - ContainerSignatureDefinitions containerSignatureDefinitions, - List includeExts, - long kbReadLimit, - File file) - throws SignatureParseException, FileNotFoundException { - this.sigIdentifier = sigIdentifier; - this.containerIdentifierFactory = containerIdentifierFactory; - this.containerFormatResolver = containerFormatResolver; - this.puidFormatMap = puidFormatMap; - this.containerSignatureDefinitions = containerSignatureDefinitions; - this.fileExtensions = includeExts; - if (kbReadLimit > 0) { - this.bytesToRead = (kbReadLimit * 1024) - 1; - } - this.file = file; - } - - /** - * Process the file by DROID. - * @return A collection of results from DROID. Usually a single result. - * @throws IOException - */ - IdentificationResultCollection queryFile() throws IOException { - - // For certain file types, set max. number of bytes at beginning of file to process. - // See https://groups.google.com/forum/#!msg/droid-list/HqN6lKOATJk/i-qTEI-XEwAJ;context-place=forum/droid-list - // which indicates minimum number of bytes required to identify certain input file types. 
- long bytesToExamine = file.length(); - String filename = file.getName(); - int lastDot = filename.lastIndexOf('.'); - if (lastDot > 0 && filename.length() > lastDot) { - String fileExtension = filename.substring(++lastDot).toLowerCase(); // examine extension past the last dot - if (fileExtensions != null && fileExtensions.contains(fileExtension) && bytesToRead > 0) { - bytesToExamine = Math.min(file.length(), bytesToRead); - } - } - RequestMetaData metadata = new RequestMetaData(bytesToExamine, file.lastModified(), file.getName()); - RequestIdentifier identifier = new RequestIdentifier(file.toURI()); - FileSystemIdentificationRequest req = null; - try { - req = new FileSystemIdentificationRequest(metadata, identifier); - req.open(file.toPath()); - - // This logic is based on - // https://github.com/digital-preservation/droid/blob/master/droid-results/src/main/java/uk/gov/nationalarchives/droid/submitter/SubmissionGateway.java - - IdentificationResultCollection results = sigIdentifier.matchBinarySignatures(req); - IdentificationResultCollection containerResults = handleContainer(req, results); - - if (containerResults != null) { - results = containerResults; - } - - sigIdentifier.removeLowerPriorityHits(results); - if (results.getResults().isEmpty()) { - results = sigIdentifier.matchExtensions(req, false); - } - - return results; - } finally { - if (req != null) { - req.close(); - } - } - } - - private IdentificationResultCollection handleContainer( - IdentificationRequest request, IdentificationResultCollection results) throws IOException { - String containerFormat = getContainerFormat(results); - - if (containerFormat != null) { - ContainerIdentifier containerIdentifier = containerIdentifierFactory.getIdentifier(containerFormat); - IdentificationResultCollection containerResults = containerIdentifier.submit(request); - sigIdentifier.removeLowerPriorityHits(containerResults); - - // container results only have the PUID filled in - for (IdentificationResult result 
: containerResults.getResults()) { - IdentificationResultImpl impl = (IdentificationResultImpl) result; - Format format = puidFormatMap.get(result.getPuid()); - if (format != null) { - impl.setName(format.getName()); - impl.setMimeType(format.getMimeType()); - impl.setVersion(format.getVersion()); - } - } - - return containerResults.getResults().isEmpty() ? null : containerResults; - } - - return null; - } - - private String getContainerFormat(IdentificationResultCollection results) { - for (IdentificationResult result : results.getResults()) { - final String format = containerFormatResolver.forPuid(result.getPuid()); - if (format != null) { - return format; - } - } - - return null; - } - - /** - * Provides additional results from DROID for processing ZIP files. - * - * @param results This is the same value returned from the call to queryFile(). - * @return Aggregated data of all files contained within the ZIP file. - * @throws IOException If the file cannot be read. - * @throws FitsToolException If the file is not a ZIP file. 
- */ - ContainerAggregator queryContainerData(IdentificationResultCollection results) - throws IOException, FitsToolException { - - RequestMetaData metadata = new RequestMetaData(bytesToRead, file.lastModified(), file.getName()); - RequestIdentifier identifier = new RequestIdentifier(file.toURI()); - FileSystemIdentificationRequest request = null; - request = new FileSystemIdentificationRequest(metadata, identifier); - request.open(file.toPath()); - - ZipArchiveContentIdentifier zipArchiveIdentifier = new ZipArchiveContentIdentifier( - this.sigIdentifier, containerSignatureDefinitions, "", File.separator, File.separator, puidFormatMap); - try { - ContainerAggregator aggregator = zipArchiveIdentifier.identify(results.getUri(), request); - return aggregator; - } catch (CommandExecutionException e) { - throw new FitsToolException("DROID can't execute zipArchiveIdentifier", e); - } finally { - if (request != null) { - request.close(); - } - } - } -} diff --git a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidResult.java b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidResult.java new file mode 100644 index 00000000..0c6b30a6 --- /dev/null +++ b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidResult.java @@ -0,0 +1,61 @@ +// +// Copyright (c) 2023 by The President and Fellows of Harvard College +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. You may obtain a copy of the License at: +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the License is +// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permission and limitations under the License. 
+//
+
+package edu.harvard.hul.ois.fits.tools.droid;
+
+import java.nio.file.Path;
+import java.util.List;
+import java.util.Objects;
+import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResultCollection;
+
+/**
+ * Encapsulates the identification results of a file and any files that the file contains.
+ * The inner results are held as an unmodifiable copy of the list handed to the constructor.
+ */
+class DroidResult {
+
+    private final Path file;
+    private final IdentificationResultCollection primaryResult;
+    private final List<IdentificationResultCollection> innerResults;
+
+    /**
+     * @param file the file that was analyzed
+     * @param primaryResult the primary identification result
+     * @param innerResults the identification results of any files the primary file contained; the list is copied,
+     *                     so later changes to the supplied list are not reflected in this object
+     * @throws NullPointerException if any argument is null, or if innerResults contains a null element
+     */
+    public DroidResult(
+            Path file,
+            IdentificationResultCollection primaryResult,
+            List<IdentificationResultCollection> innerResults) {
+        this.file = Objects.requireNonNull(file, "file cannot be null");
+        this.primaryResult = Objects.requireNonNull(primaryResult, "primaryResult cannot be null");
+        // List.copyOf yields an unmodifiable snapshot and rejects null elements.
+        this.innerResults = List.copyOf(Objects.requireNonNull(innerResults, "innerResults cannot be null"));
+    }
+
+    /**
+     * @return the file that was analyzed, never null
+     */
+    public Path getFile() {
+        return file;
+    }
+
+    /**
+     * @return the primary identification result, never null
+     */
+    public IdentificationResultCollection getPrimaryResult() {
+        return primaryResult;
+    }
+
+    /**
+     * @return the identification results for any files contained within the primary file, or an empty list;
+     *         the returned list is unmodifiable
+     */
+    public List<IdentificationResultCollection> getInnerResults() {
+        return innerResults;
+    }
+}
diff --git a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidToolOutputter.java b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidToolOutputter.java
index cf09b6d0..6bd0a206 100644
--- a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidToolOutputter.java
+++ b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidToolOutputter.java
@@ -18,17 +18,20 @@
 import java.io.IOException;
 import java.io.StringReader;
 import java.io.StringWriter;
-import java.util.HashMap;
+import
java.io.UncheckedIOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Collections; import java.util.List; import java.util.Map; -import java.util.zip.ZipEntry; +import java.util.TreeMap; +import java.util.function.Function; +import java.util.stream.Collectors; import org.jdom2.Attribute; import org.jdom2.Document; import org.jdom2.Element; import org.jdom2.Namespace; import org.jdom2.input.SAXBuilder; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResult; import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResultCollection; @@ -38,47 +41,62 @@ * * @author Gary McGath */ -public class DroidToolOutputter { +class DroidToolOutputter { private static final Namespace fitsNS = Namespace.getNamespace(Fits.XML_NAMESPACE); - private static final Map COMPRESSION_METHOD_TO_STRING_VALUE; - private static final Logger logger = LoggerFactory.getLogger(DroidToolOutputter.class); + private static final String UNKNOWN_FORMAT = "Unknown"; + private static final String ZIP_PUID = "x-fmt/263"; - private final IdentificationResultCollection results; private final ToolBase toolBase; private final Fits fits; - private final ContainerAggregator aggregator; // could be null!!! + private final DroidResult result; - static { - COMPRESSION_METHOD_TO_STRING_VALUE = new HashMap<>(); - COMPRESSION_METHOD_TO_STRING_VALUE.put(ZipEntry.STORED, "stored"); - COMPRESSION_METHOD_TO_STRING_VALUE.put(ZipEntry.DEFLATED, "deflate"); - } + // The following fields are only relevant when the file was an archive containing other files. They're stored at + // the class level because we need the values for both the fits output and the raw output and this avoids computing + // them twice. 
+ private final Map countsByFormat; + private final long originalSize; + private final long fileSize; + private final String compressionMethod; - public DroidToolOutputter( - ToolBase toolBase, IdentificationResultCollection results, Fits fits, ContainerAggregator aggregator) { + public DroidToolOutputter(ToolBase toolBase, Fits fits, DroidResult result) { this.toolBase = toolBase; - this.results = results; this.fits = fits; - this.aggregator = aggregator; + this.result = result; + + if (!result.getInnerResults().isEmpty()) { + countsByFormat = countByFormat(result.getInnerResults()); + originalSize = calculateTotalSize(result.getInnerResults()); + fileSize = fileSize(result.getFile()); + if (isZip(result.getPrimaryResult())) { + compressionMethod = zipCompressionMethod(fileSize, originalSize); + } else { + compressionMethod = null; + } + } else { + countsByFormat = Collections.emptyMap(); + originalSize = -1; + fileSize = -1; + compressionMethod = null; + } } /** Produce a JDOM document with fits as its root element. This * will contain just identification, not metadata elements. */ public ToolOutput toToolOutput() throws FitsToolException { - List resList = results.getResults(); + List resList = result.getPrimaryResult().getResults(); + Document fitsXml = createToolData(); Document rawOut = buildRawData(resList); - ToolOutput output = new ToolOutput(toolBase, fitsXml, rawOut, fits); - return output; + return new ToolOutput(toolBase, fitsXml, rawOut, fits); } /** Create a base tool data document and add elements * for each format. 
*/ - private Document createToolData() throws FitsToolException { - List resList = results.getResults(); + private Document createToolData() { + List resList = result.getPrimaryResult().getResults(); Element fitsElem = new Element("fits", fitsNS); Document toolDoc = new Document(fitsElem); Element idElem = new Element("identification", fitsNS); @@ -93,7 +111,6 @@ private Document createToolData() throws FitsToolException { mimeType = FitsMetadataValues.getInstance().normalizeMimeType(mimeType); } - // maybe this block should be moved to mapFormatName() ??? if (formatName.equals("Digital Negative (DNG)")) { mimeType = "image/x-adobe-dng"; } @@ -102,11 +119,9 @@ private Document createToolData() throws FitsToolException { version = mapVersion(version); Element identityElem = new Element("identity", fitsNS); - Attribute attr = null; - if (formatName != null) { - attr = new Attribute("format", formatName); - identityElem.setAttribute(attr); - } + Attribute attr = new Attribute("format", formatName); + identityElem.setAttribute(attr); + if (mimeType != null) { attr = new Attribute("mimetype", mimeType); identityElem.setAttribute(attr); @@ -133,46 +148,106 @@ private Document createToolData() throws FitsToolException { } } + List innerResults = result.getInnerResults(); + // The only time there will be a metadata section from DROID is when // there is an aggregator for ZIP files and there are file entries. 
- if (aggregator != null && aggregator.getTotalEntriesCount() > 0) { + if (!innerResults.isEmpty()) { Element metadataElem = new Element("metadata", fitsNS); fitsElem.addContent(metadataElem); Element containerElem = new Element("container", fitsNS); metadataElem.addContent(containerElem); Element origSizeElem = new Element("originalSize", fitsNS); - origSizeElem.addContent(String.valueOf(aggregator.getOriginalSize())); + origSizeElem.addContent(String.valueOf(originalSize)); containerElem.addContent(origSizeElem); - Element compressionMethodElem = new Element("compressionMethod", fitsNS); - compressionMethodElem.addContent(COMPRESSION_METHOD_TO_STRING_VALUE.get(aggregator.getCompressionMethod())); - containerElem.addContent(compressionMethodElem); + if (compressionMethod != null) { + Element compressionMethodElem = new Element("compressionMethod", fitsNS); + compressionMethodElem.addContent(compressionMethod); + containerElem.addContent(compressionMethodElem); + } Element entriesElem = new Element("entries", fitsNS); - Attribute totalEntriesCountAttr = - new Attribute("totalEntries", String.valueOf(aggregator.getTotalEntriesCount())); + Attribute totalEntriesCountAttr = new Attribute("totalEntries", String.valueOf(innerResults.size())); entriesElem.setAttribute(totalEntriesCountAttr); containerElem.addContent(entriesElem); - for (Map.Entry formatEntry : - aggregator.getFormatCounts().entrySet()) { + countsByFormat.forEach((format, count) -> { Element entryElem = new Element("format", fitsNS); - Attribute nameAttr = new Attribute("name", formatEntry.getKey()); + Attribute nameAttr = new Attribute("name", format); entryElem.setAttribute(nameAttr); - Attribute numberAttr = new Attribute("number", String.valueOf(formatEntry.getValue())); + Attribute numberAttr = new Attribute("number", String.valueOf(count)); entryElem.setAttribute(numberAttr); entriesElem.addContent(entryElem); - } + }); } return toolDoc; } - public static String mapFormatName(String formatName) { + 
/** + * Groups and counts the results by format name. + * + * @param innerResults the identification results of files within an archive + * @return a map of format names to the number of occurrences of that format + */ + private Map countByFormat(List innerResults) { + return innerResults.stream() + .map(r -> { + if (r.getResults().isEmpty()) { + return UNKNOWN_FORMAT; + } + return mapFormatName(r.getResults().get(0).getName()); + }) + .collect(Collectors.groupingBy(Function.identity(), TreeMap::new, Collectors.counting())); + } + + /** + * Sums the combined file size based on the file size reported in the identification results. + * + * @param innerResults the identification results of files within an archive + * @return total file size + */ + private long calculateTotalSize(List innerResults) { + return innerResults.stream() + .map(IdentificationResultCollection::getFileLength) + .reduce(0L, Long::sum); + } + + /** + * @return the file size of the target file on disk + */ + private long fileSize(Path file) { + try { + return Files.size(file); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + /** + * @return true if the target file was determined to be a zip file + */ + private boolean isZip(IdentificationResultCollection identificationResult) { + return identificationResult.getResults().stream() + .map(IdentificationResult::getPuid) + .anyMatch(ZIP_PUID::equals); + } + + /** + * @param fileSize the size of the file on disk + * @param originalSize the reported size of all of the components of a file, this will be different from the file + * size if the file is compressed + * @return the zip compression method + */ + private String zipCompressionMethod(long fileSize, long originalSize) { + return fileSize < originalSize ? 
"deflate" : "stored"; + } + private static String mapFormatName(String formatName) { if (formatName == null || formatName.length() == 0) { return FitsMetadataValues.DEFAULT_FORMAT; } else if (formatName.startsWith("JPEG2000") || formatName.startsWith("JP2 (JPEG 2000")) { @@ -199,7 +274,6 @@ public static String mapFormatName(String formatName) { } private String mapVersion(String version) { - if (version == null || version.length() == 0) { return version; } else if (version.equals("1987a")) { @@ -217,7 +291,6 @@ private String mapVersion(String version) { * @throws SAXException */ private Document buildRawData(List resList) throws FitsToolException { - StringWriter out = new StringWriter(); out.write(""); @@ -241,30 +314,33 @@ private Document buildRawData(List resList) throws FitsToo out.write(""); } - if (aggregator != null && aggregator.getTotalEntriesCount() > 0) { + var innerResults = result.getInnerResults(); + + if (!innerResults.isEmpty()) { out.write(""); out.write("\n"); out.write(""); out.write("\n"); - for (Map.Entry entry : aggregator.getFormatCounts().entrySet()) { + countsByFormat.forEach((format, count) -> { out.write(""); out.write("\n"); - } + }); out.write(""); out.write("\n"); diff --git a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidWrapper.java b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidWrapper.java new file mode 100644 index 00000000..aed00412 --- /dev/null +++ b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidWrapper.java @@ -0,0 +1,142 @@ +// +// Copyright (c) 2023 by The President and Fellows of Harvard College +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
You may obtain a copy of the License at: +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the License is +// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permission and limitations under the License. +// + +package edu.harvard.hul.ois.fits.tools.droid; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import org.apache.commons.io.FilenameUtils; +import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResultCollection; +import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResultImpl; +import uk.gov.nationalarchives.droid.core.interfaces.RequestIdentifier; +import uk.gov.nationalarchives.droid.core.interfaces.resource.FileSystemIdentificationRequest; +import uk.gov.nationalarchives.droid.core.interfaces.resource.RequestMetaData; +import uk.gov.nationalarchives.droid.profile.referencedata.Format; +import uk.gov.nationalarchives.droid.submitter.SubmissionGateway; + +/** + * Submits a file to Droid to identify and returns the identification results for the submitted file and any files that + * it contains. + *

+ * The intended use is to construct one instance of this class per thread using the {@link DroidWrapperFactory}. This + * allows the reuse of expensive, thread-safe components. + *

+ * This class is NOT THREAD SAFE.
+ */
+class DroidWrapper {
+
+    private final SubmissionGateway submissionGateway;
+    private final CollectingResultHandler resultHandler;
+    private final Map<String, Format> puidFormatMap;
+    private final Set<String> extsToLimitBytesRead;
+    private final long byteReadLimit;
+
+    /**
+     * @param submissionGateway the Droid entry point
+     * @param resultHandler the handler for collecting identification results
+     * @param puidFormatMap the map of puids to formats
+     * @param extsToLimitBytesRead set of file extensions where the number of bytes read should be restricted; the
+     *                             extension of an analyzed file is lower-cased before it is looked up in this set
+     * @param byteReadLimit the max number of bytes to read of files with byte restrictions; a value of 0 or less
+     *                      disables the restriction
+     * @throws NullPointerException if any of the object arguments is null
+     */
+    public DroidWrapper(
+            SubmissionGateway submissionGateway,
+            CollectingResultHandler resultHandler,
+            Map<String, Format> puidFormatMap,
+            Set<String> extsToLimitBytesRead,
+            long byteReadLimit) {
+        this.submissionGateway = Objects.requireNonNull(submissionGateway, "submissionGateway cannot be null");
+        this.resultHandler = Objects.requireNonNull(resultHandler, "resultHandler cannot be null");
+        this.puidFormatMap = Objects.requireNonNull(puidFormatMap, "puidFormatMap cannot be null");
+        this.extsToLimitBytesRead = Objects.requireNonNull(extsToLimitBytesRead, "extsToLimitBytesRead cannot be null");
+        this.byteReadLimit = byteReadLimit;
+    }
+
+    /**
+     * Submits a file to be analyzed by Droid, and returns the identification results of the file and any files that
+     * it contains.
+     * <p>
+     * Recursion of archive formats is restricted to a depth of 1.
+     *
+     * @param file the file to analyze
+     * @return the identification results; the first collected result is reported as the primary result and all
+     *         remaining ones as inner results
+     * @throws IOException if the file's size or modification time cannot be read, or the request cannot be
+     *                     opened or closed
+     * @throws InterruptedException if the calling thread is interrupted while waiting for Droid
+     * @throws ExecutionException if the submitted identification task fails
+     * @throws IllegalStateException if Droid finishes without producing any identification result
+     */
+    public DroidResult analyze(Path file) throws IOException, InterruptedException, ExecutionException {
+        var bytesToRead = Files.size(file);
+        var filename = file.getFileName().toString();
+        // Lower-case the extension so that e.g. "VIDEO.MOV" is limited exactly like "video.mov".
+        // NOTE(review): assumes the configured extensions are lower case - confirm against DroidConfig.
+        var ext = FilenameUtils.getExtension(filename).toLowerCase(java.util.Locale.ROOT);
+
+        if (byteReadLimit > 0 && extsToLimitBytesRead.contains(ext)) {
+            bytesToRead = Math.min(byteReadLimit, bytesToRead);
+        }
+
+        var meta =
+                new RequestMetaData(bytesToRead, Files.getLastModifiedTime(file).toMillis(), filename);
+        var id = new RequestIdentifier(file.toUri());
+        id.setParentId(DroidId.nextId());
+        id.setParentPrefix("");
+        var request = new FileSystemIdentificationRequest(meta, id);
+
+        try {
+            request.open(file);
+
+            // The handler is shared across calls, so drop whatever the previous file left behind.
+            resultHandler.reset();
+            submissionGateway.submit(request).get();
+            submissionGateway.awaitFinished();
+
+            var results = resultHandler.getResults();
+            if (results.isEmpty()) {
+                // Fail with a meaningful message instead of an IndexOutOfBoundsException from get(0) below.
+                throw new IllegalStateException("Droid did not produce an identification result for " + file);
+            }
+
+            results.forEach(this::augmentContainerResults);
+
+            List<IdentificationResultCollection> innerResults =
+                    results.size() == 1 ? Collections.emptyList() : results.subList(1, results.size());
+
+            return new DroidResult(file, results.get(0), innerResults);
+        } finally {
+            request.close();
+        }
+    }
+
+    /**
+     * Closes the object and any underlying resources
+     *
+     * @throws IOException if closing the submission gateway fails
+     */
+    public void close() throws IOException {
+        submissionGateway.close();
+    }
+
+    /**
+     * Modifies the result objects to include name, mime type and version. This is necessary because, for some
+     * reason Droid does not include this information for files that were identified by container signature.
+     * Only results without a mime type whose puid is present in the puid map are modified.
+     *
+     * @param result the result to modify
+     */
+    private void augmentContainerResults(IdentificationResultCollection result) {
+        result.getResults().stream().filter(r -> r.getMimeType() == null).forEach(r -> {
+            var format = puidFormatMap.get(r.getPuid());
+            if (format != null) {
+                var ri = (IdentificationResultImpl) r;
+                ri.setName(format.getName());
+                ri.setMimeType(format.getMimeType());
+                ri.setVersion(format.getVersion());
+            }
+        });
+    }
+}
diff --git a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidWrapperFactory.java b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidWrapperFactory.java
new file mode 100644
index 00000000..5f76d0e7
--- /dev/null
+++ b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/DroidWrapperFactory.java
@@ -0,0 +1,319 @@
+//
+// Copyright (c) 2023 by The President and Fellows of Harvard College
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License. You may obtain a copy of the License at:
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the License is
+// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permission and limitations under the License.
+// + +package edu.harvard.hul.ois.fits.tools.droid; + +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.Executors; +import java.util.concurrent.ThreadFactory; +import uk.gov.nationalarchives.droid.container.ContainerFileIdentificationRequestFactory; +import uk.gov.nationalarchives.droid.container.ContainerSignatureFileReader; +import uk.gov.nationalarchives.droid.container.ole2.Ole2Identifier; +import uk.gov.nationalarchives.droid.container.ole2.Ole2IdentifierEngine; +import uk.gov.nationalarchives.droid.container.zip.ZipIdentifier; +import uk.gov.nationalarchives.droid.container.zip.ZipIdentifierEngine; +import uk.gov.nationalarchives.droid.core.BinarySignatureIdentifier; +import uk.gov.nationalarchives.droid.core.SignatureParseException; +import uk.gov.nationalarchives.droid.core.interfaces.RequestIdentifier; +import uk.gov.nationalarchives.droid.core.interfaces.archive.ArcArchiveHandler; +import uk.gov.nationalarchives.droid.core.interfaces.archive.ArchiveFormatResolverImpl; +import uk.gov.nationalarchives.droid.core.interfaces.archive.ArchiveHandlerFactoryImpl; +import uk.gov.nationalarchives.droid.core.interfaces.archive.BZipArchiveHandler; +import uk.gov.nationalarchives.droid.core.interfaces.archive.BZipRequestFactory; +import uk.gov.nationalarchives.droid.core.interfaces.archive.ContainerIdentifierFactoryImpl; +import uk.gov.nationalarchives.droid.core.interfaces.archive.FatArchiveHandler; +import uk.gov.nationalarchives.droid.core.interfaces.archive.FatEntryRequestFactory; +import uk.gov.nationalarchives.droid.core.interfaces.archive.GZipArchiveHandler; +import uk.gov.nationalarchives.droid.core.interfaces.archive.GZipRequestFactory; +import uk.gov.nationalarchives.droid.core.interfaces.archive.ISOEntryRequestFactory; +import uk.gov.nationalarchives.droid.core.interfaces.archive.ISOImageArchiveHandler; +import uk.gov.nationalarchives.droid.core.interfaces.archive.RarArchiveHandler; +import 
uk.gov.nationalarchives.droid.core.interfaces.archive.RarEntryRequestFactory; +import uk.gov.nationalarchives.droid.core.interfaces.archive.SevenZipArchiveHandler; +import uk.gov.nationalarchives.droid.core.interfaces.archive.SevenZipRequestFactory; +import uk.gov.nationalarchives.droid.core.interfaces.archive.TarArchiveHandler; +import uk.gov.nationalarchives.droid.core.interfaces.archive.TarEntryRequestFactory; +import uk.gov.nationalarchives.droid.core.interfaces.archive.TrueVfsArchiveHandler; +import uk.gov.nationalarchives.droid.core.interfaces.archive.WarcArchiveHandler; +import uk.gov.nationalarchives.droid.core.interfaces.archive.WebArchiveEntryRequestFactory; +import uk.gov.nationalarchives.droid.core.interfaces.archive.ZipEntryRequestFactory; +import uk.gov.nationalarchives.droid.core.interfaces.control.PauseAspect; +import uk.gov.nationalarchives.droid.core.interfaces.signature.SignatureFileException; +import uk.gov.nationalarchives.droid.profile.referencedata.Format; +import uk.gov.nationalarchives.droid.signature.SaxSignatureFileParser; +import uk.gov.nationalarchives.droid.signature.SignatureParser; +import uk.gov.nationalarchives.droid.submitter.SubmissionQueue; +import uk.gov.nationalarchives.droid.submitter.SubmissionQueueData; + +/** + * Factory for generating {@link DroidWrapper} instances. This is necessary because {@link DroidWrapper} is not thread + * safe, but many of the components that it uses are expensive and can be shared between instances. The setup in this + * class is based on Droid's spring-result.xml. + */ +class DroidWrapperFactory { + + private static DroidWrapperFactory instance; + + /** + * Creates a new DroidWrapperFactory instance if one does not exist, or returns the existing instance if one does. 
+ * + * @param config the droid config + * @return the DroidWrapperFactory + * @throws SignatureParseException + * @throws SignatureFileException + */ + public static synchronized DroidWrapperFactory getOrCreateFactory(DroidConfig config) + throws SignatureParseException, SignatureFileException { + if (instance == null) { + instance = new DroidWrapperFactory(Objects.requireNonNull(config, "config cannot be null")); + } + return instance; + } + + private final DroidConfig config; + private final Map puidFormatMap; + private final BinarySignatureIdentifier droid; + private final ContainerSignatureFileReader signatureFileReader; + private final ContainerIdentifierFactoryImpl containerIdentifierFactory; + private final ArchiveFormatResolverImpl containerPuidResolver; + private final ZipIdentifier zipContainerHandler; + private final Ole2Identifier ole2ContainerHandler; + private final ArchiveFormatResolverImpl archivePuidResolver; + private final ZipEntryRequestFactory zipFactory; + private final TarEntryRequestFactory tarFactory; + private final SevenZipRequestFactory sevenZipFactory; + private final BZipRequestFactory bzipFactory; + private final GZipRequestFactory gzipFactory; + private final WebArchiveEntryRequestFactory arcFactory; + private final WebArchiveEntryRequestFactory warcFactory; + private final ISOEntryRequestFactory isoFactory; + private final RarEntryRequestFactory rarFactory; + private final FatEntryRequestFactory fatFactory; + + private DroidWrapperFactory(DroidConfig config) throws SignatureParseException, SignatureFileException { + this.config = config; + + // The following is necessary to init the code that identifies formats like docx, xlsx, etc + puidFormatMap = new HashMap<>(); + SignatureParser sigParser = + new SaxSignatureFileParser(config.getSigFile().toUri()); + sigParser.formats(format -> { + puidFormatMap.put(format.getPuid(), format); + }); + + droid = new BinarySignatureIdentifier(); + 
droid.setSignatureFile(config.getSigFile().toAbsolutePath().toString()); + droid.init(); + + signatureFileReader = new ContainerSignatureFileReader(); + signatureFileReader.setFilePath( + config.getContainerSigFile().toAbsolutePath().toString()); + + containerIdentifierFactory = new ContainerIdentifierFactoryImpl(); + containerPuidResolver = new ArchiveFormatResolverImpl(); + var containerFileIdentificationRequestFactory = new ContainerFileIdentificationRequestFactory(); + + var zipIdentifierEngine = new ZipIdentifierEngine(); + zipIdentifierEngine.setRequestFactory(containerFileIdentificationRequestFactory); + + var ole2IdentifierEngine = new Ole2IdentifierEngine(); + ole2IdentifierEngine.setRequestFactory(containerFileIdentificationRequestFactory); + + zipContainerHandler = new ZipIdentifier(); + zipContainerHandler.setContainerType("ZIP"); + zipContainerHandler.setContainerIdentifierFactory(containerIdentifierFactory); + zipContainerHandler.setContainerFormatResolver(containerPuidResolver); + zipContainerHandler.setDroidCore(droid); + zipContainerHandler.setIdentifierEngine(zipIdentifierEngine); + zipContainerHandler.setSignatureReader(signatureFileReader); + zipContainerHandler.init(); + + ole2ContainerHandler = new Ole2Identifier(); + ole2ContainerHandler.setContainerType("OLE2"); + ole2ContainerHandler.setContainerIdentifierFactory(containerIdentifierFactory); + ole2ContainerHandler.setContainerFormatResolver(containerPuidResolver); + ole2ContainerHandler.setDroidCore(droid); + ole2ContainerHandler.setIdentifierEngine(ole2IdentifierEngine); + ole2ContainerHandler.setSignatureReader(signatureFileReader); + ole2ContainerHandler.init(); + + archivePuidResolver = new ArchiveFormatResolverImpl(); + archivePuidResolver.setPuids(Map.of( + "ZIP", "x-fmt/263", + "TAR", "x-fmt/265", + "GZ", "x-fmt/266", + "ARC", "x-fmt/219, fmt/410", + "WARC", "fmt/289, fmt/1281, fmt/1355", + "BZ", "x-fmt/267, x-fmt/268", + "7Z", "fmt/484", + "ISO", "fmt/468, fmt/1739", + "RAR", 
"x-fmt/264, fmt/411", + "FAT", "fmt/1087")); + + zipFactory = new ZipEntryRequestFactory(); + zipFactory.setTempDirLocation(config.getTempDir()); + tarFactory = new TarEntryRequestFactory(); + tarFactory.setTempDirLocation(config.getTempDir()); + sevenZipFactory = new SevenZipRequestFactory(); + sevenZipFactory.setTempDirLocation(config.getTempDir()); + bzipFactory = new BZipRequestFactory(); + bzipFactory.setTempDirLocation(config.getTempDir()); + gzipFactory = new GZipRequestFactory(); + gzipFactory.setTempDirLocation(config.getTempDir()); + arcFactory = new WebArchiveEntryRequestFactory(); + arcFactory.setTempDirLocation(config.getTempDir()); + warcFactory = new WebArchiveEntryRequestFactory(); + warcFactory.setTempDirLocation(config.getTempDir()); + isoFactory = new ISOEntryRequestFactory(); + isoFactory.setTempDirLocation(config.getTempDir()); + rarFactory = new RarEntryRequestFactory(); + rarFactory.setTempDirLocation(config.getTempDir()); + fatFactory = new FatEntryRequestFactory(); + fatFactory.setTempDirLocation(config.getTempDir()); + } + + /** + * Creates a new {@link DroidWrapper} instance. {@link DroidWrapper} is NOT THREAD SAFE. + * + * @return {@link DroidWrapper} + */ + public DroidWrapper createInstance() { + var submissionGateway = new RecursionRestrictedSubmissionGateway(); + submissionGateway.setDroidCore(droid); + submissionGateway.setContainerFormatResolver(containerPuidResolver); + submissionGateway.setContainerIdentifierFactory(containerIdentifierFactory); + submissionGateway.setArchiveFormatResolver(archivePuidResolver); + submissionGateway.setPauseAspect(new PauseAspect()); + submissionGateway.setSubmissionQueue(new NoOpSubmissionQueue()); + + // We need these threads to be daemon threads so that an application that uses FITS can exit. FITS has not + // historically required that users shut it down, so without this user application would hang. 
+ submissionGateway.setExecutorService(Executors.newSingleThreadExecutor(new ThreadFactory() { + private final ThreadFactory delegate = Executors.defaultThreadFactory(); + + @Override + public Thread newThread(Runnable runnable) { + var thread = delegate.newThread(runnable); + thread.setDaemon(true); + return thread; + } + })); + + submissionGateway.setProcessZip(config.isProcessZip()); + submissionGateway.setProcessTar(config.isProcessTar()); + submissionGateway.setProcessGzip(config.isProcessGzip()); + submissionGateway.setProcessArc(config.isProcessArc()); + submissionGateway.setProcessWarc(config.isProcessWarc()); + submissionGateway.setProcessBzip2(config.isProcessBzip2()); + submissionGateway.setProcess7zip(config.isProcess7zip()); + submissionGateway.setProcessIso(config.isProcessIso()); + submissionGateway.setProcessRar(config.isProcessRar()); + + var resultHandler = new CollectingResultHandler(); + + submissionGateway.setResultHandler(resultHandler); + + var zipHandler = new TrueVfsArchiveHandler(); + zipHandler.setDroidCore(submissionGateway); + zipHandler.setResultHandler(resultHandler); + zipHandler.setFactory(zipFactory); + + var tarHandler = new TarArchiveHandler(); + tarHandler.setDroidCore(submissionGateway); + tarHandler.setResultHandler(resultHandler); + tarHandler.setFactory(tarFactory); + + var sevenZipHandler = new SevenZipArchiveHandler(); + sevenZipHandler.setDroid(submissionGateway); + sevenZipHandler.setResultHandler(resultHandler); + sevenZipHandler.setFactory(sevenZipFactory); + + var bzipHandler = new BZipArchiveHandler(); + bzipHandler.setDroidCore(submissionGateway); + bzipHandler.setResultHandler(resultHandler); + bzipHandler.setFactory(bzipFactory); + + var gzHandler = new GZipArchiveHandler(); + gzHandler.setDroidCore(submissionGateway); + gzHandler.setFactory(gzipFactory); + + var arcHandler = new ArcArchiveHandler(); + arcHandler.setDroidCore(submissionGateway); + arcHandler.setResultHandler(resultHandler); + 
arcHandler.setFactory(arcFactory); + + var warcHandler = new WarcArchiveHandler(); + warcHandler.setDroidCore(submissionGateway); + warcHandler.setResultHandler(resultHandler); + warcHandler.setFactory(warcFactory); + + var isoHandler = new ISOImageArchiveHandler(); + isoHandler.setDroid(submissionGateway); + isoHandler.setResultHandler(resultHandler); + isoHandler.setFactory(isoFactory); + + var rarHandler = new RarArchiveHandler(); + rarHandler.setDroid(submissionGateway); + rarHandler.setResultHandler(resultHandler); + rarHandler.setIdentificationRequestFactory(rarFactory); + + var fatHandler = new FatArchiveHandler(); + fatHandler.setDroid(submissionGateway); + fatHandler.setResultHandler(resultHandler); + fatHandler.setFactory(fatFactory); + + var archiveHandlerLocator = new ArchiveHandlerFactoryImpl(); + archiveHandlerLocator.setHandlers(Map.of( + "ZIP", zipHandler, + "TAR", tarHandler, + "GZ", gzHandler, + "ARC", arcHandler, + "WARC", warcHandler, + "BZ", bzipHandler, + "7Z", sevenZipHandler, + "ISO", isoHandler, + "RAR", rarHandler, + "FAT", fatHandler)); + + submissionGateway.setArchiveHandlerFactory(archiveHandlerLocator); + + return new DroidWrapper( + submissionGateway, + resultHandler, + puidFormatMap, + config.getExtsToLimitBytesRead(), + config.getByteReadLimit()); + } + + private static class NoOpSubmissionQueue implements SubmissionQueue { + @Override + public void add(RequestIdentifier request) { + // noop + } + + @Override + public void remove(RequestIdentifier request) { + // noop + } + + @Override + public void save() { + // noop + } + + @Override + public SubmissionQueueData list() { + // noop + return null; + } + } +} diff --git a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/RecursionRestrictedSubmissionGateway.java b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/RecursionRestrictedSubmissionGateway.java new file mode 100644 index 00000000..126cce8f --- /dev/null +++ 
b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/RecursionRestrictedSubmissionGateway.java @@ -0,0 +1,48 @@ +// +// Copyright (c) 2023 by The President and Fellows of Harvard College +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. You may obtain a copy of the License at: +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the License is +// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permission and limitations under the License. +// + +package edu.harvard.hul.ois.fits.tools.droid; + +import java.net.URI; +import java.util.Optional; +import java.util.concurrent.Future; +import org.apache.commons.lang.StringUtils; +import uk.gov.nationalarchives.droid.core.interfaces.AsynchDroid; +import uk.gov.nationalarchives.droid.core.interfaces.IdentificationRequest; +import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResultCollection; +import uk.gov.nationalarchives.droid.core.interfaces.RequestIdentifier; +import uk.gov.nationalarchives.droid.submitter.SubmissionGateway; + +/** + * When identifying the contents of an archive, Droid will recurse into nested archives. However, FITS prefers not + * to recurse into inner archives, and this class is used to restrict the recursion. + *

+ * Unfortunately, I had to implement this as a subclass of {@link SubmissionGateway} because {@link AsynchDroid} does + * not define a close() method. + */ +class RecursionRestrictedSubmissionGateway extends SubmissionGateway { + + @Override + public Future submit(IdentificationRequest identificationRequest) { + // Droid inserts "!/" every time it enters an archive, so by counting the occurrences of that string we can + // limit the recursion. + var url = Optional.ofNullable(identificationRequest.getIdentifier()) + .map(RequestIdentifier::getUri) + .map(URI::toString) + .orElse(""); + var depth = StringUtils.countMatches(url, "!/"); + + if (depth > 1) { + return null; + } + + return super.submit(identificationRequest); + } +} diff --git a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/ResultPrinter.java b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/ResultPrinter.java deleted file mode 100644 index 413edd41..00000000 --- a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/ResultPrinter.java +++ /dev/null @@ -1,261 +0,0 @@ -/** - * This file has been modified by Harvard University, June, 2017, for the purposes of incorporating - * into the FITS application. The original can be found here: https://github.com/digital-preservation/droid - * - * Copyright (c) 2016, The National Archives - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following - * conditions are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * * Neither the name of the The National Archives nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ -package edu.harvard.hul.ois.fits.tools.droid; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import uk.gov.nationalarchives.droid.command.action.CommandExecutionException; -import uk.gov.nationalarchives.droid.command.container.Ole2ContainerContentIdentifier; -import uk.gov.nationalarchives.droid.command.container.ZipContainerContentIdentifier; -import uk.gov.nationalarchives.droid.container.ContainerFileIdentificationRequestFactory; -import uk.gov.nationalarchives.droid.container.ContainerSignatureDefinitions; -import uk.gov.nationalarchives.droid.container.TriggerPuid; -import uk.gov.nationalarchives.droid.container.ole2.Ole2IdentifierEngine; -import uk.gov.nationalarchives.droid.container.zip.ZipIdentifierEngine; -import uk.gov.nationalarchives.droid.core.BinarySignatureIdentifier; -import uk.gov.nationalarchives.droid.core.interfaces.IdentificationRequest; -import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResult; -import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResultCollection; -import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResultImpl; -import uk.gov.nationalarchives.droid.core.interfaces.archive.IdentificationRequestFactory; -import uk.gov.nationalarchives.droid.profile.referencedata.Format; - -/** - * File identification results printer. 
- * - * NB: This class is called recursively when archive files are opened - * - * @author rbrennan - */ -public class ResultPrinter { - - private static final String R_SLASH = "/"; - private static final String L_BRACKET = "("; - private static final String R_BRACKET = ")"; - private static final String SPACE = " "; - - private final BinarySignatureIdentifier binarySignatureIdentifier; - private final ContainerSignatureDefinitions containerSignatureDefinitions; - private List triggerPuids; - private IdentificationRequestFactory requestFactory; - private final String path; - private final String slash; - private final String slash1; - private final String wrongSlash; - private final boolean archives; - private final boolean webArchives; - private final String OLE2_CONTAINER = "OLE2"; - private final String ZIP_CONTAINER = "ZIP"; - private final String ZIP_ARCHIVE = "x-fmt/263"; - private final String JIP_ARCHIVE = "x-fmt/412"; - private final String TAR_ARCHIVE = "x-fmt/265"; - private final String GZIP_ARCHIVE = "x-fmt/266"; - private final String ARC_ARCHIVE = "x-fmt/219"; - private final String OTHERARC_ARCHIVE = "fmt/410"; - private final String WARC_ARCHIVE = "fmt/289"; - - private final ContainerAggregator aggregator; - private final Map puidFormatMap; - - private static final Logger logger = LoggerFactory.getLogger(ResultPrinter.class); - - /** - * Store signature files. - * - * @param binarySignatureIdentifier binary signature identifier - * @param containerSignatureDefinitions container signatures - * @param path current file/container path - * @param slash local path element delimiter - * @param slash1 local first container prefix delimiter - * @param archives Should archives be examined? - * @param webArchives Should web archives be examined? 
- * @param aggregator - * @param puidFormatMap map of puids to formats - */ - public ResultPrinter( - final BinarySignatureIdentifier binarySignatureIdentifier, - final ContainerSignatureDefinitions containerSignatureDefinitions, - final String path, - final String slash, - final String slash1, - boolean archives, - boolean webArchives, - final ContainerAggregator aggregator, - final Map puidFormatMap) { - - this.binarySignatureIdentifier = binarySignatureIdentifier; - this.containerSignatureDefinitions = containerSignatureDefinitions; - this.path = path; - this.slash = slash; - this.slash1 = slash1; - this.wrongSlash = this.slash.equals(R_SLASH) ? "\\" : R_SLASH; - this.archives = archives; - this.webArchives = webArchives; - if (containerSignatureDefinitions != null) { - triggerPuids = containerSignatureDefinitions.getTiggerPuids(); - } - this.aggregator = aggregator; - this.puidFormatMap = puidFormatMap; - } - - /** - * Output identification for this file. - * - * @param results identification Results - * @param request identification Request - * - * @throws CommandExecutionException if unexpected container type encountered - */ - public void print(final IdentificationResultCollection results, final IdentificationRequest request) - throws CommandExecutionException { - - final String fileName = (path + request.getFileName()).replace(wrongSlash, slash); - final IdentificationResultCollection containerResults = getContainerResults(results, request, fileName); - - IdentificationResultCollection finalResults = new IdentificationResultCollection(request); - boolean container = false; - if (containerResults.getResults().size() > 0) { - container = true; - finalResults = containerResults; - } else if (results.getResults().size() > 0) { - finalResults = results; - } - if (finalResults.getResults().size() > 0) { - binarySignatureIdentifier.removeLowerPriorityHits(finalResults); - } - if (finalResults.getResults().size() > 0) { - int cnt = 0; - for (IdentificationResult 
identResult : finalResults.getResults()) { - if (+cnt > 1) { - logger.warn("Count: " + cnt); - } - String formatName = identResult.getName(); - String puid = identResult.getPuid(); - if (!container && JIP_ARCHIVE.equals(puid)) { - puid = ZIP_ARCHIVE; - } - - String normalizedFormat = DroidToolOutputter.mapFormatName(formatName); - String output = String.format( - "fileName: %s,\n mimeType: %s,\n formatName: %s,\n normalizedFormat: %s,\n puid: %s", - fileName, identResult.getMimeType(), formatName, normalizedFormat, puid); - logger.debug(output); - // add a single format type - aggregator.addFormat(normalizedFormat); - } - } else { - aggregator.incrementUnknownFormat(); - logger.debug(fileName + " -- Unknown filetype"); - } - } - - private IdentificationResultCollection getContainerResults( - final IdentificationResultCollection results, final IdentificationRequest request, final String fileName) - throws CommandExecutionException { - - IdentificationResultCollection containerResults = new IdentificationResultCollection(request); - - if (results.getResults().size() > 0 && containerSignatureDefinitions != null) { - int cnt = 0; - for (IdentificationResult identResult : results.getResults()) { - if (+cnt > 1) { - logger.info("IdentificationResult count: " + cnt); - } - String filePuid = identResult.getPuid(); - if (filePuid != null) { - TriggerPuid containerPuid = getTriggerPuidByPuid(filePuid); - if (containerPuid != null) { - - requestFactory = new ContainerFileIdentificationRequestFactory(); - String containerType = containerPuid.getContainerType(); - - if (OLE2_CONTAINER.equals(containerType)) { - try { - Ole2ContainerContentIdentifier ole2Identifier = new Ole2ContainerContentIdentifier(); - ole2Identifier.init(containerSignatureDefinitions, containerType); - Ole2IdentifierEngine ole2IdentifierEngine = new Ole2IdentifierEngine(); - ole2IdentifierEngine.setRequestFactory(requestFactory); - ole2Identifier.setIdentifierEngine(ole2IdentifierEngine); - containerResults 
= - ole2Identifier.process(request.getSourceInputStream(), containerResults); - } catch (IOException e) { // carry on after container i/o problems - logger.warn(e + SPACE + L_BRACKET + fileName + R_BRACKET); - } - } else if (ZIP_CONTAINER.equals(containerType)) { - try { - ZipContainerContentIdentifier zipIdentifier = new ZipContainerContentIdentifier(); - zipIdentifier.init(containerSignatureDefinitions, containerType); - ZipIdentifierEngine zipIdentifierEngine = new ZipIdentifierEngine(); - zipIdentifierEngine.setRequestFactory(requestFactory); - zipIdentifier.setIdentifierEngine(zipIdentifierEngine); - containerResults = - zipIdentifier.process(request.getSourceInputStream(), containerResults); - } catch (IOException e) { // carry on after container i/o problems - logger.warn(e + SPACE + L_BRACKET + fileName + R_BRACKET); - } - } else { - throw new CommandExecutionException("Unknown container type: " + containerPuid); - } - } - } - } - } - - // container results only have the PUID filled in - for (IdentificationResult result : containerResults.getResults()) { - IdentificationResultImpl impl = (IdentificationResultImpl) result; - Format format = puidFormatMap.get(result.getPuid()); - if (format != null) { - impl.setName(format.getName()); - impl.setMimeType(format.getMimeType()); - impl.setVersion(format.getVersion()); - } - } - - return containerResults; - } - - private TriggerPuid getTriggerPuidByPuid(final String puid) { - for (final TriggerPuid tp : triggerPuids) { - if (tp.getPuid().equals(puid)) { - return tp; - } - } - return null; - } -} diff --git a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/ZipArchiveContentIdentifier.java b/src/main/java/edu/harvard/hul/ois/fits/tools/droid/ZipArchiveContentIdentifier.java deleted file mode 100644 index adad010a..00000000 --- a/src/main/java/edu/harvard/hul/ois/fits/tools/droid/ZipArchiveContentIdentifier.java +++ /dev/null @@ -1,157 +0,0 @@ -/** - * This file has been modified by Harvard University, June, 
2017, for the purposes of incorporating - * into the FITS application. The original can be found here: https://github.com/digital-preservation/droid - * - * Copyright (c) 2016, The National Archives - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following - * conditions are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * * Neither the name of the The National Archives nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ -package edu.harvard.hul.ois.fits.tools.droid; - -import java.io.IOException; -import java.io.InputStream; -import java.net.URI; -import java.util.Map; -import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException; -import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; -import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import uk.gov.nationalarchives.droid.command.action.CommandExecutionException; -import uk.gov.nationalarchives.droid.container.ContainerSignatureDefinitions; -import uk.gov.nationalarchives.droid.core.BinarySignatureIdentifier; -import uk.gov.nationalarchives.droid.core.interfaces.IdentificationRequest; -import uk.gov.nationalarchives.droid.core.interfaces.RequestIdentifier; -import uk.gov.nationalarchives.droid.core.interfaces.resource.RequestMetaData; -import uk.gov.nationalarchives.droid.core.interfaces.resource.ZipEntryIdentificationRequest; -import uk.gov.nationalarchives.droid.profile.referencedata.Format; - -/** - * Identifier for files held in a ZIP archive. 
- * - * @author rbrennan - */ -public class ZipArchiveContentIdentifier extends ArchiveContentIdentifier { - - private static final Logger logger = LoggerFactory.getLogger(ZipArchiveContentIdentifier.class); - - /** - * - * @param binarySignatureIdentifier binary signature identifier - * @param containerSignatureDefinitions container signatures - * @param path current archive path - * @param slash local path element delimiter - * @param slash1 local first container prefix delimiter - * @param puidFormatMap map of puids to formats - */ - public ZipArchiveContentIdentifier( - final BinarySignatureIdentifier binarySignatureIdentifier, - final ContainerSignatureDefinitions containerSignatureDefinitions, - final String path, - final String slash, - final String slash1, - final Map puidFormatMap) { - - super(binarySignatureIdentifier, containerSignatureDefinitions, path, slash, slash1, false, puidFormatMap); - } - - /** - * @param uri The URI of the file to identify - * @param request The Identification Request - * @return The aggregated data of the examined ZIP file - * @throws CommandExecutionException When an exception happens during execution - * @throws CommandExecutionException When an exception happens during archive file access - */ - public ContainerAggregator identify(final URI uri, final IdentificationRequest request) - throws CommandExecutionException { - - final String newPath = makeContainerURI("zip", request.getFileName()); - setSlash1(""); - InputStream zipIn = null; - ContainerAggregator aggregator = new ContainerAggregator(); - try { - zipIn = request.getSourceInputStream(); - final ZipArchiveInputStream in = new ZipArchiveInputStream(zipIn); - try { - ZipArchiveEntry entry = null; - Integer compressionMethod = null; - while ((entry = in.getNextZipEntry()) != null) { - final String name = entry.getName(); - if (!entry.isDirectory()) { - final RequestMetaData metaData = new RequestMetaData(entry.getSize(), 2L, name); - final RequestIdentifier identifier 
= new RequestIdentifier(uri); - final ZipEntryIdentificationRequest zipRequest = new ZipEntryIdentificationRequest( - metaData, identifier, getTmpDir().toPath(), false); - - if (compressionMethod != null && !compressionMethod.equals(entry.getMethod())) { - logger.warn("Different compression method: " + compressionMethod + ", entry method: " - + entry.getMethod()); - } - - compressionMethod = entry.getMethod(); // throws UnsupportedZipFeatureException - expandContainer(zipRequest, in, newPath, aggregator); // zipRequest.size() is uncompressed - logger.debug("zipRequest size(): " + zipRequest.size() + " -- entry.getCompressedSize(): " - + entry.getCompressedSize() + " -- entry.getSize(): " + entry.getSize()); - if (entry.getCompressedSize() > 0) { - aggregator.incrementCompressedSize(entry.getCompressedSize()); - } - // in some situations the value returned is -1 - if (entry.getSize() > 0) { - aggregator.incrementOriginalSize(entry.getSize()); - } else if (zipRequest.size() > 0) { - aggregator.incrementOriginalSize(zipRequest.size()); - } - } - } - } catch (UnsupportedZipFeatureException e) { - // For now this indicates that we're attempting (and failing) to read from an encrypted ZIP file. 
- aggregator.setEncrypted(true); - } finally { - if (in != null) { - in.close(); - } - // shows collection of files within ZIP file - logger.debug("--------------"); - logger.debug("{}", aggregator); - logger.debug("--------------"); - } - } catch (IOException ioe) { - logger.warn(ioe + " (" + newPath + ")"); // continue after corrupt archive - } finally { - if (zipIn != null) { - try { - zipIn.close(); - } catch (IOException ioe) { - throw new CommandExecutionException(ioe.getMessage(), ioe); - } - } - } - return aggregator; - } -} diff --git a/src/test/java/edu/harvard/hul/ois/fits/junit/DocMDXmlUnitTest.java b/src/test/java/edu/harvard/hul/ois/fits/junit/DocMDXmlUnitTest.java index b4c97f30..7d598003 100644 --- a/src/test/java/edu/harvard/hul/ois/fits/junit/DocMDXmlUnitTest.java +++ b/src/test/java/edu/harvard/hul/ois/fits/junit/DocMDXmlUnitTest.java @@ -137,9 +137,9 @@ public void testEpubOutput() throws Exception { // process multiple files to examine different types of output String[] inputFilenames = { - "Winnie-the-Pooh-protected.epub", // not properly identified as epub mimetype + "Winnie-the-Pooh-protected.epub", "GeographyofBliss_oneChapter.epub", - "aliceDynamic_images_metadata_tableOfContents.epub", // not properly identified as epub mimetype + "aliceDynamic_images_metadata_tableOfContents.epub", // Missing mimetype file; DROID cannot id it "epub30-test-font-embedding-obfuscation.epub", "Calibre_hasTable_of_Contents.epub" }; diff --git a/src/test/java/edu/harvard/hul/ois/fits/junit/ZipDisabledXmlUnitTest.java b/src/test/java/edu/harvard/hul/ois/fits/junit/ZipDisabledXmlUnitTest.java new file mode 100644 index 00000000..942a1ea1 --- /dev/null +++ b/src/test/java/edu/harvard/hul/ois/fits/junit/ZipDisabledXmlUnitTest.java @@ -0,0 +1,41 @@ +/* + * Copyright 2016 Harvard University Library + * + * This file is part of FITS (File Information Tool Set). 
+ * + * FITS is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * FITS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with FITS. If not, see . + */ +package edu.harvard.hul.ois.fits.junit; + +import edu.harvard.hul.ois.fits.tests.AbstractXmlUnitTest; +import org.junit.Test; + +/** + * These tests compare actual FITS output with expected output on ZIP files. + * These tests should be run with <display-tool-output>false</display-tool-output> in fits.xml. + * + * @author dan179 + */ +public class ZipDisabledXmlUnitTest extends AbstractXmlUnitTest { + + @Override + protected String fitsConfigFile() { + return "fits_archives_disabled.xml"; + } + + @Test + public void testCompressedZipFile() throws Exception { + testFile("assorted-files.zip", fits, OutputType.DEFAULT); + } +} diff --git a/src/test/java/edu/harvard/hul/ois/fits/tests/AbstractXmlUnitTest.java b/src/test/java/edu/harvard/hul/ois/fits/tests/AbstractXmlUnitTest.java index 8cda3b9c..f59070d3 100644 --- a/src/test/java/edu/harvard/hul/ois/fits/tests/AbstractXmlUnitTest.java +++ b/src/test/java/edu/harvard/hul/ois/fits/tests/AbstractXmlUnitTest.java @@ -201,7 +201,10 @@ protected void writeAndValidate(FitsOutput fitsOut, String inputFilename, Output } // Read in the expected XML file - String expectedFile = OUTPUT_DIR + inputFilename + namePart + EXPECTED_OUTPUT_FILE_SUFFIX; + String expectedFile = OUTPUT_DIR + inputFilename + namePart + "_" + className + ACTUAL_OUTPUT_FILE_SUFFIX; + if (Files.notExists(Paths.get(expectedFile))) { + 
expectedFile = OUTPUT_DIR + inputFilename + namePart + EXPECTED_OUTPUT_FILE_SUFFIX; + } String expectedXmlStr = FileUtils.readFileToString(new File(expectedFile), StandardCharsets.UTF_8); if (overwrite) { diff --git a/testfiles/output/aliceDynamic_images_metadata_tableOfContents.epub_XmlUnitExpectedOutput.xml b/testfiles/output/aliceDynamic_images_metadata_tableOfContents.epub_XmlUnitExpectedOutput.xml index d82ecca6..ae4e233f 100644 --- a/testfiles/output/aliceDynamic_images_metadata_tableOfContents.epub_XmlUnitExpectedOutput.xml +++ b/testfiles/output/aliceDynamic_images_metadata_tableOfContents.epub_XmlUnitExpectedOutput.xml @@ -16,18 +16,31 @@ - - yes - fiction - urn:uuid:1a16ce38-82bd-4e9b-861e-773c2e787a50 - en-GB - Lewis Carroll - Alice's Adventures in Wonderland + + 1409497 + deflate + + + + + + + + + en-GB + + + yes + fiction + urn:uuid:1a16ce38-82bd-4e9b-861e-773c2e787a50 + en-GB + Lewis Carroll + Alice's Adventures in Wonderland diff --git a/testfiles/output/assorted-files.zip-default_ZipDisabledXmlUnitTest_XmlUnitExpectedOutput.xml b/testfiles/output/assorted-files.zip-default_ZipDisabledXmlUnitTest_XmlUnitExpectedOutput.xml new file mode 100644 index 00000000..3deddbfd --- /dev/null +++ b/testfiles/output/assorted-files.zip-default_ZipDisabledXmlUnitTest_XmlUnitExpectedOutput.xml @@ -0,0 +1,40 @@ + + + + + + + + + + 1.0 + x-fmt/263 + + + + /fits/testfiles/input/assorted-files.zip + assorted-files.zip + 30400659 + 381dd28336fef8e188ebec5c6c29c596 + 1666562273086 + + + + + + + + + + + + + + + + + + + + + diff --git a/testfiles/properties/fits_archives_disabled.xml b/testfiles/properties/fits_archives_disabled.xml new file mode 100644 index 00000000..910c7579 --- /dev/null +++ b/testfiles/properties/fits_archives_disabled.xml @@ -0,0 +1,69 @@ + + + + + + + + + + + + + + + + + + + + + + + + false + false + true + false + xml/fits_output.xsd + http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd + http://hul.harvard.edu/ois/xml/ns/fits/fits_output + true 
+ true + + + + + + + + + 20 + + + + DROID_SignatureFile_V109_Alt.xml + container-signature-20221102.xml + + + + + + + + false + false + false + false + false + false + false + false + false + + + + diff --git a/xml/fits.xml b/xml/fits.xml index 8f5a9397..7a695b91 100644 --- a/xml/fits.xml +++ b/xml/fits.xml @@ -50,5 +50,20 @@ - + + + + + true + true + true + true + true + true + true + true + true + + +