diff --git a/src/main/java/de/gwdg/metadataqa/marc/analysis/BLClassifier.java b/src/main/java/de/gwdg/metadataqa/marc/analysis/BLClassifier.java index fc9bed6db..47a4ca949 100644 --- a/src/main/java/de/gwdg/metadataqa/marc/analysis/BLClassifier.java +++ b/src/main/java/de/gwdg/metadataqa/marc/analysis/BLClassifier.java @@ -70,7 +70,7 @@ private boolean satisfy(BibliographicRecord marcRecord, UseCase useCase) { } } } - logger.log(Level.INFO, "failed for {} ({} -- {} -- {})", + logger.log(Level.INFO, "failed for {0} ({1} -- {2} -- {3})", new Object[]{useCase.name(), useCase.getUseCase(), useCase.getEncoding(), useCase.getDataElelemntsNormalized()}); return false; } diff --git a/src/main/java/de/gwdg/metadataqa/marc/analysis/ClassificationAnalyzer.java b/src/main/java/de/gwdg/metadataqa/marc/analysis/ClassificationAnalyzer.java index 77a1a2a39..1b636e3b1 100644 --- a/src/main/java/de/gwdg/metadataqa/marc/analysis/ClassificationAnalyzer.java +++ b/src/main/java/de/gwdg/metadataqa/marc/analysis/ClassificationAnalyzer.java @@ -206,7 +206,7 @@ private int processFieldsWithSchemePica(int total, List fieldsW updateSchemaSubfieldStatistics(field, currentSchema); count++; } else { - logger.log(Level.SEVERE, "undetected subfield in record {} {}", new Object[]{marcRecord.getId(), field.toString()}); + logger.log(Level.SEVERE, "undetected subfield in record {0} {1}", new Object[]{marcRecord.getId(), field.toString()}); } } registerSchemas(schemas); diff --git a/src/main/java/de/gwdg/metadataqa/marc/analysis/ThompsonTraillAnalysis.java b/src/main/java/de/gwdg/metadataqa/marc/analysis/ThompsonTraillAnalysis.java index ee1be5d86..0703ba48b 100644 --- a/src/main/java/de/gwdg/metadataqa/marc/analysis/ThompsonTraillAnalysis.java +++ b/src/main/java/de/gwdg/metadataqa/marc/analysis/ThompsonTraillAnalysis.java @@ -207,7 +207,7 @@ else if (field.getInd2().equals("2")) else if (field.getInd2().equals("7")) { List subfield2 = field.getSubfield("2"); if (subfield2 == null) { - 
logger.log(Level.SEVERE, "Error in {}: ind2 = 7, but there is no $2", marcRecord.getControl001().getContent()); + logger.log(Level.SEVERE, "Error in {0}: ind2 = 7, but there is no $2", marcRecord.getControl001().getContent()); } else switch (field.getSubfield("2").get(0).getValue()) { case "fast": ttScores.count(ThompsonTraillFields.FAST); break; diff --git a/src/main/java/de/gwdg/metadataqa/marc/analysis/completeness/RecordCompleteness.java b/src/main/java/de/gwdg/metadataqa/marc/analysis/completeness/RecordCompleteness.java index fe942c78e..76b04ba20 100644 --- a/src/main/java/de/gwdg/metadataqa/marc/analysis/completeness/RecordCompleteness.java +++ b/src/main/java/de/gwdg/metadataqa/marc/analysis/completeness/RecordCompleteness.java @@ -223,7 +223,7 @@ private String getPackageName(DataField field) { else { packageName = plugin.getPackageName(field); if (StringUtils.isBlank(packageName)) { - logger.log(Level.WARNING, "{} has no package. /{}", new Object[]{field, field.getDefinition().getClass()}); + logger.log(Level.WARNING, "{0} has no package. 
/{1}", new Object[]{field, field.getDefinition().getClass()}); packageName = TagCategory.OTHER.getPackageName(); } completenessDAO.getPackageNameCache().put(field.getDefinition(), packageName); diff --git a/src/main/java/de/gwdg/metadataqa/marc/cli/ClassificationAnalysis.java b/src/main/java/de/gwdg/metadataqa/marc/cli/ClassificationAnalysis.java index ac5019e68..9b3cdc095 100644 --- a/src/main/java/de/gwdg/metadataqa/marc/cli/ClassificationAnalysis.java +++ b/src/main/java/de/gwdg/metadataqa/marc/cli/ClassificationAnalysis.java @@ -116,7 +116,7 @@ private void printToFile(File file, String message) { private File prepareReportFile(String outputDir, String fileName) { File reportFile = new File(outputDir, fileName); if (reportFile.exists() && !reportFile.delete()) - logger.log(Level.SEVERE, "File {} hasn't been deleted", reportFile.getAbsolutePath()); + logger.log(Level.SEVERE, "File {0} hasn't been deleted", reportFile.getAbsolutePath()); return reportFile; } @@ -155,7 +155,7 @@ private void printClassificationsCollocation() { Integer total = statistics.recordCountWithClassification(); logger.info("total: " + total); if (!total1.equals(total)) - logger.log(Level.SEVERE, "total from hasClassifications ({}) != from collation ({})", new Object[]{total1, total}); + logger.log(Level.SEVERE, "total from hasClassifications ({0}) != from collation ({1})", new Object[]{total1, total}); statistics.getCollocationHistogram() .entrySet() diff --git a/src/main/java/de/gwdg/metadataqa/marc/cli/Completeness.java b/src/main/java/de/gwdg/metadataqa/marc/cli/Completeness.java index 47ff5f68d..195e8170c 100644 --- a/src/main/java/de/gwdg/metadataqa/marc/cli/Completeness.java +++ b/src/main/java/de/gwdg/metadataqa/marc/cli/Completeness.java @@ -23,7 +23,6 @@ import org.apache.commons.cli.ParseException; import org.marc4j.marc.Record; -import java.io.File; import java.io.IOException; import java.io.Serializable; import java.nio.file.Files; @@ -52,18 +51,20 @@ public class Completeness 
extends QACli implements BibliographicInputProcessor, private CompletenessPlugin plugin; private RecordFilter recordFilter; private RecordIgnorator recordIgnorator; - private File idCollectorFile; public Completeness(String[] args) throws ParseException { parameters = new CompletenessParameters(args); plugin = CompletenessFactory.create(parameters); recordFilter = parameters.getRecordFilter(); recordIgnorator = parameters.getRecordIgnorator(); - initializeGroups(parameters.getGroupBy(), parameters.isPica()); readyToProcess = true; + initializeGroups(parameters.getGroupBy(), parameters.isPica()); if (doGroups()) { - idCollectorFile = prepareReportFile(parameters.getOutputDir(), "id-groupid.csv"); - printToFile(idCollectorFile, CsvUtils.createCsv("id", "groupId")); + initializeMeta(parameters); + if (saveGroupIds) { + idCollectorFile = prepareReportFile(parameters.getOutputDir(), "id-groupid.csv"); + printToFile(idCollectorFile, CsvUtils.createCsv("id", "groupId")); + } } } @@ -109,12 +110,11 @@ public void processRecord(BibliographicRecord bibliographicRecord, int recordNum RecordCompleteness recordCompleteness = new RecordCompleteness(bibliographicRecord, parameters, completenessDAO, plugin, groupBy); recordCompleteness.process(); - if (doGroups()) { - for (String id : recordCompleteness.getGroupIds()) { + if (saveGroupIds) + saveGroupIds(bibliographicRecord.getId(true), recordCompleteness.getGroupIds()); + if (doGroups()) + for (String id : recordCompleteness.getGroupIds()) Utils.count(id, completenessDAO.getGroupCounter()); - printToFile(idCollectorFile, CsvUtils.createCsv(bibliographicRecord.getId(true), id)); - } - } for (String path : recordCompleteness.getRecordFrequency().keySet()) { if (groupBy != null) { @@ -406,12 +406,12 @@ private String formatCardinality(String marcPath, if (groupId != null) { histogram = completenessDAO.getGrouppedFieldHistogram().get(groupId).get(marcPath); if 
(!completenessDAO.getGrouppedFieldHistogram().get(groupId).containsKey(marcPath)) { - logger.log(Level.WARNING,"Field {} is not registered in histogram", marcPath); + logger.log(Level.WARNING,"Field {0} is not registered in histogram", marcPath); } } else { histogram = completenessDAO.getFieldHistogram().get(marcPath); if (!completenessDAO.getFieldHistogram().containsKey(marcPath)) { - logger.log(Level.WARNING,"Field {} is not registered in histogram", marcPath); + logger.log(Level.WARNING,"Field {0} is not registered in histogram", marcPath); } } BasicStatistics statistics = new BasicStatistics(histogram); diff --git a/src/main/java/de/gwdg/metadataqa/marc/cli/QACli.java b/src/main/java/de/gwdg/metadataqa/marc/cli/QACli.java index 2d7451906..36d92535e 100644 --- a/src/main/java/de/gwdg/metadataqa/marc/cli/QACli.java +++ b/src/main/java/de/gwdg/metadataqa/marc/cli/QACli.java @@ -2,15 +2,27 @@ import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; +import de.gwdg.metadataqa.marc.CsvUtils; +import de.gwdg.metadataqa.marc.Utils; import de.gwdg.metadataqa.marc.cli.parameters.CommonParameters; +import de.gwdg.metadataqa.marc.dao.record.BibliographicRecord; import de.gwdg.metadataqa.marc.utils.BibiographicPath; +import de.gwdg.metadataqa.marc.utils.pica.path.PicaPath; import de.gwdg.metadataqa.marc.utils.pica.path.PicaPathParser; +import org.apache.avro.generic.GenericData; import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.StringUtils; import java.io.File; import java.io.IOException; +import java.net.URISyntaxException; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.nio.file.Paths; +import java.nio.file.attribute.BasicFileAttributes; +import java.nio.file.attribute.FileTime; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; @@ -23,6 +35,10 @@ public abstract class QACli { 
private static final Logger logger = Logger.getLogger(QACli.class.getCanonicalName()); public static final String ALL = "0"; protected BibiographicPath groupBy = null; + protected File idCollectorFile; + private FileTime jarModifiedTime; + private boolean isJarModifiedTimeDetected = false; + protected boolean saveGroupIds = true; protected void initializeGroups(String groupBy, boolean isPica) { if (groupBy != null) { @@ -46,6 +62,21 @@ protected void saveParameters(String fileName, T pa } } + protected Set getGroupIds(CommonParameters parameters, BibliographicRecord bibliographicRecord) { + if (this.groupBy != null) { + List idLists = parameters.isPica() ? bibliographicRecord.select((PicaPath) groupBy) : null; // TODO: MARC21 + return QACli.extractGroupIds(idLists); + } + return new HashSet<>(); + } + + protected void saveGroupIds(String recordId, Set groupIds) { + if (doGroups() && !groupIds.isEmpty()) + for (String groupId : groupIds) + printToFile(idCollectorFile, CsvUtils.createCsv(recordId, groupId)); + } + + public static Set extractGroupIds(List idLists) { Set groupIds = new HashSet<>(); groupIds.add(ALL); @@ -62,6 +93,56 @@ public boolean doGroups() { return groupBy != null; } + protected boolean isJarNewerThan(String outputDir, String fileName) { + try { + initializeJarModifiedTime(); + File reportFile = new File(outputDir, fileName); + if (!reportFile.exists()) + return true; + if (reportFile.isFile()) { + FileTime groupModifiedTime = Files.readAttributes(reportFile.toPath(), BasicFileAttributes.class).lastModifiedTime(); + return (jarModifiedTime == null || jarModifiedTime.compareTo(groupModifiedTime) > 0); + } + } catch (IOException e) { + logger.severe("Error during prepareReportFile: " + e); + } + return false; + } + + protected void initializeMeta(CommonParameters parameters) { + File idCollectorMeta = new File(parameters.getOutputDir(), "id-groupid.meta.txt"); + String currentFileList = getFilesWithDate(parameters.getArgs()); + if 
(isJarNewerThan(parameters.getOutputDir(), "id-groupid.csv")) { + if (!idCollectorMeta.delete()) + logger.severe("id-groupid.meta.txt has not been deleted."); + } else { + if (idCollectorMeta.exists()) { + try { + String storedFileList = FileUtils.readFileToString(idCollectorMeta, StandardCharsets.UTF_8).trim(); + saveGroupIds = ! currentFileList.equals(storedFileList); + if (!idCollectorMeta.delete()) + logger.severe("id-groupid.meta.txt has not been deleted."); + } catch (IOException e) { + logger.severe(e.getLocalizedMessage()); + } + } + } + printToFile(idCollectorMeta, currentFileList); + } + + private String getFilesWithDate(String[] fileNames) { + List filesWithDate = new ArrayList<>(); + for (String fileName : fileNames) { + try { + FileTime modifiedTime = Files.readAttributes(new File(fileName).toPath(), BasicFileAttributes.class).lastModifiedTime(); + filesWithDate.add(fileName + ":" + modifiedTime.toString()); + } catch (IOException e) { + logger.warning(e.getLocalizedMessage()); + } + } + return StringUtils.join(filesWithDate, ","); + } + protected File prepareReportFile(String outputDir, String fileName) { File reportFile = new File(outputDir, fileName); if (reportFile.exists() && !reportFile.delete()) @@ -81,4 +162,18 @@ protected void printToFile(File file, String content) { logger.log(Level.SEVERE, "printToFile", e); } } + + private void initializeJarModifiedTime() { + if (!isJarModifiedTimeDetected) { + try { + File currentJar = new File(this.getClass().getProtectionDomain().getCodeSource().getLocation().toURI()); + if (currentJar.isFile()) { + jarModifiedTime = Files.readAttributes(currentJar.toPath(), BasicFileAttributes.class).lastModifiedTime(); + } + } catch (URISyntaxException | IOException e) { + throw new RuntimeException(e); + } + isJarModifiedTimeDetected = true; + } + } } diff --git a/src/main/java/de/gwdg/metadataqa/marc/cli/ValidatorCli.java b/src/main/java/de/gwdg/metadataqa/marc/cli/ValidatorCli.java index b23cac04e..4fbba7e8d 100644 
--- a/src/main/java/de/gwdg/metadataqa/marc/cli/ValidatorCli.java +++ b/src/main/java/de/gwdg/metadataqa/marc/cli/ValidatorCli.java @@ -23,9 +23,13 @@ import java.io.File; import java.io.IOException; import java.io.Serializable; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.nio.file.attribute.BasicFileAttributes; +import java.nio.file.attribute.FileTime; import java.util.*; import java.util.logging.Level; import java.util.logging.Logger; @@ -46,8 +50,6 @@ public class ValidatorCli extends QACli implements BibliographicInputProcessor, private final ValidatorParameters parameters; private final Map hashedIndex = new HashMap<>(); - private final Map> isbnCollector = new TreeMap<>(); - private final Map> issnCollector = new TreeMap<>(); private File detailsFile = null; private File summaryFile = null; private File collectorFile = null; @@ -79,6 +81,14 @@ public ValidatorCli(ValidatorParameters parameters) { .withSchemaType(parameters.getSchemaType()) ; initializeGroups(parameters.getGroupBy(), parameters.isPica()); + if (doGroups()) { + initializeMeta(parameters); + if (saveGroupIds) { + logger.info("saveGroupIds!"); + idCollectorFile = prepareReportFile(parameters.getOutputDir(), "id-groupid.csv"); + printToFile(idCollectorFile, CsvUtils.createCsv("id", "groupId")); + } + } } public static void main(String[] args) { @@ -172,11 +182,9 @@ public void processRecord(BibliographicRecord bibliographicRecord, int i) { return; } - Set groupIds = new HashSet<>(); - if (groupBy != null) { - List idLists = parameters.isPica() ? 
bibliographicRecord.select((PicaPath) groupBy) : null; // TODO: MARC21 - groupIds = extractGroupIds(idLists); - } + Set groupIds = getGroupIds(parameters, bibliographicRecord); + if (saveGroupIds) + saveGroupIds(bibliographicRecord.getId(true), groupIds); Validator validator = new Validator(validatorConfiguration); boolean isValid = validator.validate(bibliographicRecord); diff --git a/src/main/java/de/gwdg/metadataqa/marc/cli/utils/RecordIterator.java b/src/main/java/de/gwdg/metadataqa/marc/cli/utils/RecordIterator.java index 43a984d15..2516956f2 100644 --- a/src/main/java/de/gwdg/metadataqa/marc/cli/utils/RecordIterator.java +++ b/src/main/java/de/gwdg/metadataqa/marc/cli/utils/RecordIterator.java @@ -112,7 +112,7 @@ private void processFile(String inputFileName) { MarcReader reader = getMarcFileReader(processor.getParameters(), path); processContent(reader, fileName); if (processor.getParameters().doLog()) - logger.log(Level.INFO, "Finished processing file. Processed {} records.", decimalFormat.format(i)); + logger.log(Level.INFO, "Finished processing file. Processed {0} records.", decimalFormat.format(i)); } catch (SolrServerException ex) { if (processor.getParameters().doLog())