Skip to content

Commit

Permalink
feat: improve detection of non-UTF-8 file names
Browse files Browse the repository at this point in the history
- explicitly read ZIP archives as UTF-8 encoded
- report a (new) fatal error PKG-027 when a ZIP could not be read
  due to encoding issues (detected by matching the exception message)
- add tests

Fixes #1236
  • Loading branch information
rdeltour committed Nov 14, 2022
1 parent 62d67e7 commit 5248914
Show file tree
Hide file tree
Showing 10 changed files with 104 additions and 65 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ private void initialize()
severities.put(MessageId.PKG_024, Severity.INFO);
severities.put(MessageId.PKG_025, Severity.ERROR);
severities.put(MessageId.PKG_026, Severity.ERROR);
severities.put(MessageId.PKG_027, Severity.FATAL);

// Resources
severities.put(MessageId.RSC_001, Severity.ERROR);
Expand Down
1 change: 1 addition & 0 deletions src/main/java/com/adobe/epubcheck/messages/MessageId.java
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,7 @@ public enum MessageId implements Comparable<MessageId>
PKG_024("PKG-024"),
PKG_025("PKG-025"),
PKG_026("PKG-026"),
PKG_027("PKG-027"),

// Messages relating to resources
RSC_001("RSC-001"),
Expand Down
139 changes: 76 additions & 63 deletions src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,11 @@ public void check()
// Check the OCF Container file structure
// --------------------------------------
//
checkContainerStructure(state);
if (!checkContainerStructure(state))
{
return;
}
;
OCFContainer container = state.getContainer();

//
Expand Down Expand Up @@ -270,83 +274,92 @@ private boolean checkContainerFile(OCFCheckerState state)
return true;
}

private void checkContainerStructure(OCFCheckerState state)
private boolean checkContainerStructure(OCFCheckerState state)
{
// Get a container
Iterable<OCFResource> resourcesProvider;
try
{
// FIXME 2022 build resourcesProvider depending on MIME type
resourcesProvider = new OCFZipResources(context.url);
} catch (IOException e)
{
// FIXME 2022 see how to propagate fatal IOError
report.message(MessageId.PKG_008, EPUBLocation.of(context), e.getLocalizedMessage());
return;
}
// Map to store the container resource files
Map<String, OCFResource> resources = new HashMap<>();
// List to store the container resource directories
List<String> directories = new LinkedList<>();

// Loop through the entries
OCFFilenameChecker filenameChecker = new OCFFilenameChecker(state.context().build());
for (OCFResource resource : resourcesProvider)
{
Preconditions.checkNotNull(resource.getPath());
Preconditions.checkNotNull(resource.getProperties());
// Get a container
Iterable<OCFResource> resourcesProvider = new OCFZipResources(context.url);
// Map to store the container resource files
Map<String, OCFResource> resources = new HashMap<>();
// List to store the container resource directories
List<String> directories = new LinkedList<>();

// Loop through the entries
OCFFilenameChecker filenameChecker = new OCFFilenameChecker(state.context().build());
// FIXME catch IAE MALFORMED entries
for (OCFResource resource : resourcesProvider)
{
Preconditions.checkNotNull(resource.getPath());
Preconditions.checkNotNull(resource.getProperties());

// FIXME 2022 report symbolic links and continue
// FIXME 2022 report symbolic links and continue

// Check duplicate entries
if (resources.containsKey(resource.getPath().toLowerCase(Locale.ROOT)))
{
context.report.message(MessageId.OPF_060, EPUBLocation.of(context), resource.getPath());
}
// Check duplicate entries after NFC normalization
else if (resources.containsKey(
Normalizer.normalize(resource.getPath().toLowerCase(Locale.ROOT), Normalizer.Form.NFC)))
{
context.report.message(MessageId.OPF_061, EPUBLocation.of(context), resource.getPath());
}
// Check duplicate entries
if (resources.containsKey(resource.getPath().toLowerCase(Locale.ROOT)))
{
context.report.message(MessageId.OPF_060, EPUBLocation.of(context), resource.getPath());
}
// Check duplicate entries after NFC normalization
else if (resources.containsKey(
Normalizer.normalize(resource.getPath().toLowerCase(Locale.ROOT), Normalizer.Form.NFC)))
{
context.report.message(MessageId.OPF_061, EPUBLocation.of(context), resource.getPath());
}

// Store the resource in the data structure
if (resource.isDirectory())
{
// the container resource is a directory,
// store it for later checking of empty directories
directories.add(resource.getPath());
}
else
{
// Check file name requirements
filenameChecker.checkCompatiblyEscaped(resource.getPath());

// report entry metadata
reportFeatures(resource.getProperties());
// the container resource is a file,
// add the resource to the container model
resources.put(resource.getPath().toLowerCase(Locale.ROOT), resource);
state.addResource(resource);
// Store the resource in the data structure
if (resource.isDirectory())
{
// the container resource is a directory,
// store it for later checking of empty directories
directories.add(resource.getPath());
}
else
{
// Check file name requirements
filenameChecker.checkCompatiblyEscaped(resource.getPath());

// report entry metadata
reportFeatures(resource.getProperties());
// the container resource is a file,
// add the resource to the container model
resources.put(resource.getPath().toLowerCase(Locale.ROOT), resource);
state.addResource(resource);
}
}
}

// Report empty directories
for (String directory : directories)
{
boolean hasContents = false;
for (OCFResource resource : resources.values())
// Report empty directories
for (String directory : directories)
{
if (resource.getPath().startsWith(directory))
boolean hasContents = false;
for (OCFResource resource : resources.values())
{
if (resource.getPath().startsWith(directory))
{
hasContents = true;
break;
}
}
if (!hasContents)
{
hasContents = true;
break;
report.message(MessageId.PKG_014, EPUBLocation.of(context), directory);
}
}
if (!hasContents)
return true;
} catch (Exception e)
{
switch (e.getMessage())
{
report.message(MessageId.PKG_014, EPUBLocation.of(context), directory);
case "invalid CEN header (bad entry name)": // reported by OpenJDK
case "MALFORMED": // reported by Oracle JDK 1.8
report.message(MessageId.PKG_027, EPUBLocation.of(context), e.getLocalizedMessage());
break;
default:
report.message(MessageId.PKG_008, EPUBLocation.of(context), e.getLocalizedMessage());
break;
}
return false;
}
}

Expand Down
3 changes: 2 additions & 1 deletion src/main/java/com/adobe/epubcheck/ocf/OCFZipResources.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.util.Enumeration;
import java.util.Iterator;
Expand Down Expand Up @@ -32,7 +33,7 @@ public OCFZipResources(URL url) throws IOException
{
new IllegalArgumentException("Not a file URL: " + url);
}
this.zip = new ZipFile(file);
this.zip = new ZipFile(file, StandardCharsets.UTF_8);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,8 @@ PKG_023=Validating the EPUB against version 2.0, default validation profile will
PKG_024=Uncommon EPUB file extension.
PKG_024_SUG=For maximum compatibility, use ".epub".
PKG_025=Publication resource must not be located in the META-INF directory
PKG_026=Obfuscated resource must be a Font Core Media Type (was declared as "%1$s" in "%2$s").
PKG_026=Obfuscated resource must be a Font Core Media Type (was declared as "%1$s" in "%2$s").
PKG_027=Could not extract EPUB ZIP content, probably due to file names not encoded in UTF-8.

#Resources
RSC_001=File "%1$s" could not be found.
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
22 changes: 22 additions & 0 deletions src/test/resources/epub3/04-ocf/ocf.feature
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,28 @@ Feature: EPUB 3 — Open Container Format
Then error OPF-060 is reported
And no other errors or warnings are reported

@spec @xref:sec-zip-container-zipreqs
Scenario: Verify file names with non-ASCII UTF-8-encoded characters are allowed
When checking EPUB 'ocf-filename-utf8-valid.epub'
Then no errors or warnings are reported

@spec @xref:sec-zip-container-zipreqs
Scenario: Report file names that are not encoded as UTF-8
When checking EPUB 'ocf-filename-not-utf8-error.epub'
Then fatal error PKG-027 is reported
Then no errors or warnings are reported

@spec @xref:sec-zip-container-zipreqs
Scenario: Verify path names with non-ASCII UTF-8-encoded characters are allowed
When checking EPUB 'ocf-filepath-utf8-valid.epub'
Then no errors or warnings are reported

@spec @xref:sec-zip-container-zipreqs
Scenario: Report path names that are not encoded as UTF-8
When checking EPUB 'ocf-filepath-not-utf8-error.epub'
Then fatal error PKG-027 is reported
Then no errors or warnings are reported


### 4.2.3 OCF ZIP container media type identification

Expand Down

0 comments on commit 5248914

Please sign in to comment.