Skip to content

Commit

Permalink
Creation of id-groupid.csv required for validation #253
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed May 4, 2023
1 parent 52ffd20 commit d74ce95
Show file tree
Hide file tree
Showing 9 changed files with 129 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ private boolean satisfy(BibliographicRecord marcRecord, UseCase useCase) {
}
}
}
logger.log(Level.INFO, "failed for {} ({} -- {} -- {})",
logger.log(Level.INFO, "failed for {0} ({1} -- {2} -- {3})",
new Object[]{useCase.name(), useCase.getUseCase(), useCase.getEncoding(), useCase.getDataElelemntsNormalized()});
return false;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ private int processFieldsWithSchemePica(int total, List<FieldWithScheme> fieldsW
updateSchemaSubfieldStatistics(field, currentSchema);
count++;
} else {
logger.log(Level.SEVERE, "undetected subfield in record {} {}", new Object[]{marcRecord.getId(), field.toString()});
logger.log(Level.SEVERE, "undetected subfield in record {0} {1}", new Object[]{marcRecord.getId(), field.toString()});
}
}
registerSchemas(schemas);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ else if (field.getInd2().equals("2"))
else if (field.getInd2().equals("7")) {
List<MarcSubfield> subfield2 = field.getSubfield("2");
if (subfield2 == null) {
logger.log(Level.SEVERE, "Error in {}: ind2 = 7, but there is no $2", marcRecord.getControl001().getContent());
logger.log(Level.SEVERE, "Error in {0}: ind2 = 7, but there is no $2", marcRecord.getControl001().getContent());
} else
switch (field.getSubfield("2").get(0).getValue()) {
case "fast": ttScores.count(ThompsonTraillFields.FAST); break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ private String getPackageName(DataField field) {
else {
packageName = plugin.getPackageName(field);
if (StringUtils.isBlank(packageName)) {
logger.log(Level.WARNING, "{} has no package. /{}", new Object[]{field, field.getDefinition().getClass()});
logger.log(Level.WARNING, "{0} has no package. /{1}", new Object[]{field, field.getDefinition().getClass()});
packageName = TagCategory.OTHER.getPackageName();
}
completenessDAO.getPackageNameCache().put(field.getDefinition(), packageName);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ private void printToFile(File file, String message) {
private File prepareReportFile(String outputDir, String fileName) {
File reportFile = new File(outputDir, fileName);
if (reportFile.exists() && !reportFile.delete())
logger.log(Level.SEVERE, "File {} hasn't been deleted", reportFile.getAbsolutePath());
logger.log(Level.SEVERE, "File {0} hasn't been deleted", reportFile.getAbsolutePath());
return reportFile;
}

Expand Down Expand Up @@ -155,7 +155,7 @@ private void printClassificationsCollocation() {
Integer total = statistics.recordCountWithClassification();
logger.info("total: " + total);
if (!total1.equals(total))
logger.log(Level.SEVERE, "total from hasClassifications ({}) != from collation ({})", new Object[]{total1, total});
logger.log(Level.SEVERE, "total from hasClassifications ({0}) != from collation ({1})", new Object[]{total1, total});

statistics.getCollocationHistogram()
.entrySet()
Expand Down
24 changes: 12 additions & 12 deletions src/main/java/de/gwdg/metadataqa/marc/cli/Completeness.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import org.apache.commons.cli.ParseException;
import org.marc4j.marc.Record;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.nio.file.Files;
Expand Down Expand Up @@ -52,18 +51,20 @@ public class Completeness extends QACli implements BibliographicInputProcessor,
private CompletenessPlugin plugin;
private RecordFilter recordFilter;
private RecordIgnorator recordIgnorator;
private File idCollectorFile;

public Completeness(String[] args) throws ParseException {
parameters = new CompletenessParameters(args);
plugin = CompletenessFactory.create(parameters);
recordFilter = parameters.getRecordFilter();
recordIgnorator = parameters.getRecordIgnorator();
initializeGroups(parameters.getGroupBy(), parameters.isPica());
readyToProcess = true;
initializeGroups(parameters.getGroupBy(), parameters.isPica());
if (doGroups()) {
idCollectorFile = prepareReportFile(parameters.getOutputDir(), "id-groupid.csv");
printToFile(idCollectorFile, CsvUtils.createCsv("id", "groupId"));
initializeMeta(parameters);
if (saveGroupIds) {
idCollectorFile = prepareReportFile(parameters.getOutputDir(), "id-groupid.csv");
printToFile(idCollectorFile, CsvUtils.createCsv("id", "groupId"));
}
}
}

Expand Down Expand Up @@ -109,12 +110,11 @@ public void processRecord(BibliographicRecord bibliographicRecord, int recordNum
RecordCompleteness recordCompleteness = new RecordCompleteness(bibliographicRecord, parameters, completenessDAO, plugin, groupBy);
recordCompleteness.process();

if (doGroups()) {
for (String id : recordCompleteness.getGroupIds()) {
if (saveGroupIds)
saveGroupIds(bibliographicRecord.getId(true), recordCompleteness.getGroupIds());
if (doGroups())
for (String id : recordCompleteness.getGroupIds())
Utils.count(id, completenessDAO.getGroupCounter());
printToFile(idCollectorFile, CsvUtils.createCsv(bibliographicRecord.getId(true), id));
}
}

for (String path : recordCompleteness.getRecordFrequency().keySet()) {
if (groupBy != null) {
Expand Down Expand Up @@ -406,12 +406,12 @@ private String formatCardinality(String marcPath,
if (groupId != null) {
histogram = completenessDAO.getGrouppedFieldHistogram().get(groupId).get(marcPath);
if (!completenessDAO.getGrouppedFieldHistogram().get(groupId).containsKey(marcPath)) {
logger.log(Level.WARNING,"Field {} is not registered in histogram", marcPath);
logger.log(Level.WARNING,"Field {0} is not registered in histogram", marcPath);
}
} else {
histogram = completenessDAO.getFieldHistogram().get(marcPath);
if (!completenessDAO.getFieldHistogram().containsKey(marcPath)) {
logger.log(Level.WARNING,"Field {} is not registered in histogram", marcPath);
logger.log(Level.WARNING,"Field {0} is not registered in histogram", marcPath);
}
}
BasicStatistics statistics = new BasicStatistics(histogram);
Expand Down
95 changes: 95 additions & 0 deletions src/main/java/de/gwdg/metadataqa/marc/cli/QACli.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,27 @@

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import de.gwdg.metadataqa.marc.CsvUtils;
import de.gwdg.metadataqa.marc.Utils;
import de.gwdg.metadataqa.marc.cli.parameters.CommonParameters;
import de.gwdg.metadataqa.marc.dao.record.BibliographicRecord;
import de.gwdg.metadataqa.marc.utils.BibiographicPath;
import de.gwdg.metadataqa.marc.utils.pica.path.PicaPath;
import de.gwdg.metadataqa.marc.utils.pica.path.PicaPathParser;
import org.apache.avro.generic.GenericData;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;

import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.FileTime;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
Expand All @@ -23,6 +35,10 @@ public abstract class QACli {
private static final Logger logger = Logger.getLogger(QACli.class.getCanonicalName());
public static final String ALL = "0";
protected BibiographicPath groupBy = null;
protected File idCollectorFile;
private FileTime jarModifiedTime;
private boolean isJarModifiedTimeDetected = false;
protected boolean saveGroupIds = true;

protected void initializeGroups(String groupBy, boolean isPica) {
if (groupBy != null) {
Expand All @@ -46,6 +62,21 @@ protected <T extends CommonParameters> void saveParameters(String fileName, T pa
}
}

/**
 * Extracts the group identifiers of a record according to the path configured
 * by initializeGroups(). Returns an empty set when grouping is disabled
 * (groupBy == null).
 * NOTE(review): only the PICA branch is implemented; for non-PICA input a null
 * list is handed to extractGroupIds() (TODO: MARC21 in the original).
 */
protected Set<String> getGroupIds(CommonParameters parameters, BibliographicRecord bibliographicRecord) {
  if (this.groupBy == null)
    return new HashSet<>();
  List<String> idLists = null;
  if (parameters.isPica())
    idLists = bibliographicRecord.select((PicaPath) groupBy); // TODO: MARC21
  return QACli.extractGroupIds(idLists);
}

/**
 * Appends one "recordId,groupId" CSV line per group to the id collector file.
 * No-op when grouping is disabled or the record belongs to no group.
 */
protected void saveGroupIds(String recordId, Set<String> groupIds) {
  if (!doGroups() || groupIds.isEmpty())
    return;
  for (String groupId : groupIds) {
    printToFile(idCollectorFile, CsvUtils.createCsv(recordId, groupId));
  }
}


public static Set<String> extractGroupIds(List<String> idLists) {
Set<String> groupIds = new HashSet<>();
groupIds.add(ALL);
Expand All @@ -62,6 +93,56 @@ public boolean doGroups() {
return groupBy != null;
}

/**
 * Tells whether the application JAR was modified more recently than
 * outputDir/fileName.
 *
 * @return true if the file does not exist yet, or if the JAR's modification
 *         time is later than the file's (a null, i.e. undetectable, JAR time
 *         also counts as "newer"); false if the path exists but is not a
 *         regular file, or on I/O error.
 */
protected boolean isJarNewerThan(String outputDir, String fileName) {
  try {
    initializeJarModifiedTime();
    File reportFile = new File(outputDir, fileName);
    if (!reportFile.exists())
      return true;
    if (reportFile.isFile()) {
      FileTime reportModifiedTime = Files.readAttributes(reportFile.toPath(), BasicFileAttributes.class).lastModifiedTime();
      // Comparable.compareTo() only guarantees the SIGN of the result, not the
      // value 1, so the original "== 1" could silently miss a "newer" result.
      return (jarModifiedTime == null || jarModifiedTime.compareTo(reportModifiedTime) > 0);
    }
  } catch (IOException e) {
    // The original message named the wrong method ("prepareReportFile").
    logger.severe("Error during isJarNewerThan: " + e);
  }
  return false;
}

/**
 * Maintains id-groupid.meta.txt, a fingerprint of the input files (name plus
 * last-modified time) used to decide whether id-groupid.csv must be rebuilt.
 * If the JAR is newer than the CSV the stale meta file is removed and
 * saveGroupIds keeps its default; otherwise the stored fingerprint is compared
 * with the current one and saveGroupIds is set accordingly.
 * NOTE(review): "!equals" means IDs are re-saved only when the input list
 * CHANGED — confirm this is the intended condition.
 */
protected void initializeMeta(CommonParameters parameters) {
  File idCollectorMeta = new File(parameters.getOutputDir(), "id-groupid.meta.txt");
  String currentFileList = getFilesWithDate(parameters.getArgs());
  if (isJarNewerThan(parameters.getOutputDir(), "id-groupid.csv")) {
    // File.delete() also returns false when the file does not exist, which
    // previously produced a spurious SEVERE log on every fresh run; only
    // report a failure if there actually was something to delete.
    if (idCollectorMeta.exists() && !idCollectorMeta.delete())
      logger.severe("id-groupid.meta.txt has not been deleted.");
  } else {
    if (idCollectorMeta.exists()) {
      try {
        String storedFileList = FileUtils.readFileToString(idCollectorMeta, StandardCharsets.UTF_8).trim();
        saveGroupIds = !currentFileList.equals(storedFileList);
        if (!idCollectorMeta.delete())
          logger.severe("id-groupid.meta.txt has not been deleted.");
      } catch (IOException e) {
        logger.severe(e.getLocalizedMessage());
      }
    }
  }
  // Always persist the current fingerprint for the next run.
  printToFile(idCollectorMeta, currentFileList);
}

/**
 * Builds a comma-separated fingerprint of the input files: one
 * "&lt;fileName&gt;:&lt;lastModifiedTime&gt;" entry per file. Files whose attributes
 * cannot be read are logged at WARNING level and omitted from the result.
 */
private String getFilesWithDate(String[] fileNames) {
  List<String> entries = new ArrayList<>();
  for (String name : fileNames) {
    try {
      BasicFileAttributes attributes = Files.readAttributes(new File(name).toPath(), BasicFileAttributes.class);
      entries.add(name + ":" + attributes.lastModifiedTime());
    } catch (IOException e) {
      logger.warning(e.getLocalizedMessage());
    }
  }
  return String.join(",", entries);
}

protected File prepareReportFile(String outputDir, String fileName) {
File reportFile = new File(outputDir, fileName);
if (reportFile.exists() && !reportFile.delete())
Expand All @@ -81,4 +162,18 @@ protected void printToFile(File file, String content) {
logger.log(Level.SEVERE, "printToFile", e);
}
}

/**
 * Lazily detects the modification time of the JAR this class was loaded from.
 * Runs the detection at most once; jarModifiedTime stays null when the code
 * source is not a regular file (e.g. an exploded classes directory) or cannot
 * be determined at all.
 */
private void initializeJarModifiedTime() {
  if (!isJarModifiedTimeDetected) {
    try {
      // getCodeSource() (and its location) may be null, e.g. for classes
      // loaded by the bootstrap class loader; the original dereferenced it
      // unconditionally, risking a NullPointerException.
      java.security.CodeSource codeSource = this.getClass().getProtectionDomain().getCodeSource();
      if (codeSource != null && codeSource.getLocation() != null) {
        File currentJar = new File(codeSource.getLocation().toURI());
        if (currentJar.isFile()) {
          jarModifiedTime = Files.readAttributes(currentJar.toPath(), BasicFileAttributes.class).lastModifiedTime();
        }
      }
    } catch (URISyntaxException | IOException e) {
      throw new RuntimeException(e);
    }
    isJarModifiedTimeDetected = true;
  }
}
}
22 changes: 15 additions & 7 deletions src/main/java/de/gwdg/metadataqa/marc/cli/ValidatorCli.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,13 @@
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.FileTime;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
Expand All @@ -46,8 +50,6 @@ public class ValidatorCli extends QACli implements BibliographicInputProcessor,

private final ValidatorParameters parameters;
private final Map<Integer, Integer> hashedIndex = new HashMap<>();
private final Map<String, Set<String>> isbnCollector = new TreeMap<>();
private final Map<String, Set<String>> issnCollector = new TreeMap<>();
private File detailsFile = null;
private File summaryFile = null;
private File collectorFile = null;
Expand Down Expand Up @@ -79,6 +81,14 @@ public ValidatorCli(ValidatorParameters parameters) {
.withSchemaType(parameters.getSchemaType())
;
initializeGroups(parameters.getGroupBy(), parameters.isPica());
if (doGroups()) {
initializeMeta(parameters);
if (saveGroupIds) {
logger.info("saveGroupIds!");
idCollectorFile = prepareReportFile(parameters.getOutputDir(), "id-groupid.csv");
printToFile(idCollectorFile, CsvUtils.createCsv("id", "groupId"));
}
}
}

public static void main(String[] args) {
Expand Down Expand Up @@ -172,11 +182,9 @@ public void processRecord(BibliographicRecord bibliographicRecord, int i) {
return;
}

Set<String> groupIds = new HashSet<>();
if (groupBy != null) {
List<String> idLists = parameters.isPica() ? bibliographicRecord.select((PicaPath) groupBy) : null; // TODO: MARC21
groupIds = extractGroupIds(idLists);
}
Set<String> groupIds = getGroupIds(parameters, bibliographicRecord);
if (saveGroupIds)
saveGroupIds(bibliographicRecord.getId(true), groupIds);

Validator validator = new Validator(validatorConfiguration);
boolean isValid = validator.validate(bibliographicRecord);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ private void processFile(String inputFileName) {
MarcReader reader = getMarcFileReader(processor.getParameters(), path);
processContent(reader, fileName);
if (processor.getParameters().doLog())
logger.log(Level.INFO, "Finished processing file. Processed {} records.", decimalFormat.format(i));
logger.log(Level.INFO, "Finished processing file. Processed {0} records.", decimalFormat.format(i));

} catch (SolrServerException ex) {
if (processor.getParameters().doLog())
Expand Down

0 comments on commit d74ce95

Please sign in to comment.