Skip to content

Commit

Permalink
WarcDigest: Add raw() method to access the raw uncanonicalized string…
Browse files Browse the repository at this point in the history
… value

Putting this on WarcDigest itself means we don't have to duplicate every digest accessor with an "unchanged" variant.

#74
  • Loading branch information
ato committed Jun 28, 2023
1 parent a810738 commit c6e40dc
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 34 deletions.
63 changes: 45 additions & 18 deletions src/org/netpreserve/jwarc/WarcDigest.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,20 @@
import java.util.Objects;

public class WarcDigest {

private final String algorithm;
private final String value;
private final String raw;
private String algorithm;
private String value;

public WarcDigest(String digest) {
int i = digest.indexOf(':');
if (i == -1) {
throw new IllegalArgumentException("Invalid WARC-Digest");
}
this.algorithm = digest.substring(0, i);
this.value = base32Encode(digest.substring(i + 1), this.algorithm);
Objects.requireNonNull(digest);
raw = digest;
}

public WarcDigest(String algorithm, String value) {
this.algorithm = algorithm;
Objects.requireNonNull(algorithm);
Objects.requireNonNull(value);
raw = algorithm + ":" + value;
this.algorithm = canonicalizeAlgorithm(algorithm);
this.value = base32Encode(value, algorithm);
}

Expand All @@ -36,14 +35,35 @@ public WarcDigest(String algorithm, byte[] value) {
}

public WarcDigest(MessageDigest messageDigest) {
algorithm = messageDigest.getAlgorithm().replace("-", "").toLowerCase(Locale.US);
algorithm = canonicalizeAlgorithm(messageDigest.getAlgorithm());
value = base32Encode(messageDigest.digest());
raw = algorithm + ":" + value;
}

/**
 * Normalizes a digest algorithm name: every '-' is removed and the result
 * is lower-cased using {@link Locale#US}, so "SHA-1" becomes "sha1".
 *
 * @param algorithm the algorithm name as supplied by the caller or header
 * @return the normalized algorithm name
 */
private static String canonicalizeAlgorithm(String algorithm) {
    String withoutHyphens = algorithm.replace("-", "");
    return withoutHyphens.toLowerCase(Locale.US);
}

/**
 * Lazily splits {@code raw} into its algorithm and value parts.
 * Does nothing when {@code value} has already been set.
 *
 * The text before the first ':' becomes the canonicalized algorithm; the
 * text after it is passed through {@code base32Encode} together with that
 * canonicalized algorithm (presumably re-encoding the value as base32;
 * see that helper for the exact rules).
 *
 * @throws IllegalArgumentException if {@code raw} contains no ':'
 */
private void parse() {
    if (value == null) {
        final int colon = raw.indexOf(':');
        if (colon < 0) {
            throw new IllegalArgumentException("Invalid WARC-Digest");
        }
        String rawAlgorithm = raw.substring(0, colon);
        String rawValue = raw.substring(colon + 1);
        // Assign the algorithm first: the value conversion below reads it.
        this.algorithm = canonicalizeAlgorithm(rawAlgorithm);
        this.value = base32Encode(rawValue, this.algorithm);
    }
}

/**
 * Returns the digest algorithm name, parsing the raw digest string first
 * if that has not happened yet.
 *
 * @throws IllegalArgumentException if parsing is needed and the raw string
 *         contains no ':'
 */
public String algorithm() {
    parse(); // no-op once the fields are populated
    return this.algorithm;
}

/**
 * Returns the stored digest value, parsing the raw digest string first if
 * that has not happened yet. Internal counterpart of {@link #algorithm()}.
 *
 * @throws IllegalArgumentException if parsing is needed and the raw string
 *         contains no ':'
 */
private String value() {
    parse(); // no-op once the fields are populated
    return this.value;
}

/**
 * Returns the digest bytes from {@link #bytes()} encoded with
 * {@code hexEncode}.
 */
public String hex() {
    byte[] digestBytes = bytes();
    return hexEncode(digestBytes);
}
Expand All @@ -53,22 +73,29 @@ public String base16() {
}

public String base32() {
return value;
return value();
}

/**
 * Returns the digest bytes from {@link #bytes()} encoded with
 * {@code base64Encode}.
 */
public String base64() {
    byte[] digestBytes = bytes();
    return base64Encode(digestBytes);
}

public byte[] bytes() { return base32Decode(value); }
public byte[] bytes() { return base32Decode(value()); }

/**
 * Returns the digest string this object was created with, before any
 * canonicalization of the algorithm name or re-encoding of the value.
 * Unlike the other accessors, this never triggers parsing.
 *
 * @return the unmodified digest string
 */
public String raw() {
    return this.raw;
}

/**
 * Returns the same text as {@link #prefixedBase32()}.
 */
@Override
public String toString() {
    // Delegate rather than inline so the two representations cannot drift apart.
    return this.prefixedBase32();
}

public String prefixedBase32() {
return algorithm + ":" + value;
return algorithm() + ":" + value();
}

static String hexEncode(byte[] data) {
Expand Down Expand Up @@ -197,13 +224,13 @@ public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
WarcDigest digest = (WarcDigest) o;
return Objects.equals(algorithm, digest.algorithm) &&
Objects.equals(value, digest.value);
return Objects.equals(algorithm(), digest.algorithm()) &&
Objects.equals(value(), digest.value());
}

@Override
public int hashCode() {
return Objects.hash(algorithm, value);
return Objects.hash(algorithm(), value());
}

/**
Expand All @@ -223,6 +250,6 @@ public static MessageDigest getDigester(String algorithm) throws NoSuchAlgorithm
}

public MessageDigest getDigester() throws NoSuchAlgorithmException {
return getDigester(algorithm);
return getDigester(algorithm());
}
}
8 changes: 0 additions & 8 deletions src/org/netpreserve/jwarc/WarcTargetRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,6 @@ public URI targetURI() {
public Optional<WarcDigest> payloadDigest() {
    // Wrap the WARC-Payload-Digest header value, when one is returned, in a WarcDigest.
    return headers()
            .sole("WARC-Payload-Digest")
            .map(headerValue -> new WarcDigest(headerValue));
}

/**
* Return the payload digest value directly
*/
public Optional<String> payloadDigestUnchanged() {
return headers().sole("WARC-Payload-Digest");
}


/**
* A content-type that was identified by an independent check (not just what the server said).
Expand Down
9 changes: 3 additions & 6 deletions src/org/netpreserve/jwarc/cdx/CdxFields.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,9 @@ public static String format(byte field, WarcCaptureRecord record, boolean digest
try {
switch (field) {
case CHECKSUM:
if (digestUnchanged) {
return record.payloadDigestUnchanged().get();
}
else {
return record.payloadDigest().map(WarcDigest::base32).orElse("-");
}
return record.payloadDigest()
.map(digestUnchanged ? WarcDigest::raw : WarcDigest::base32)
.orElse("-");
case DATE:
return DATE_FORMAT.format(record.date());
case MIME_TYPE:
Expand Down
11 changes: 9 additions & 2 deletions test/org/netpreserve/jwarc/WarcDigestTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,20 @@
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.*;

public class WarcDigestTest {

private byte[] contentBytes = "hello world".getBytes();

@Test
public void testParsing() {
    // Mixed-case algorithm prefix: raw() must return it untouched, while
    // algorithm() and prefixedBase32() must report the lower-cased form.
    String header = "Sha1:FKXGYNOJJ7H3IFO35FPUBC445EPOQRXN";
    WarcDigest digest = new WarcDigest(header);

    assertEquals(header, digest.raw());
    assertEquals("sha1", digest.algorithm());
    assertEquals("sha1:FKXGYNOJJ7H3IFO35FPUBC445EPOQRXN", digest.prefixedBase32());
}

@Test
public void testSha1() throws NoSuchAlgorithmException {
MessageDigest md = MessageDigest.getInstance("SHA-1");
Expand Down

0 comments on commit c6e40dc

Please sign in to comment.