From cdceff95b787e8babf5e5e379839153120dbcdc3 Mon Sep 17 00:00:00 2001 From: Gabriel Vasile Date: Sun, 2 Jun 2024 23:31:14 +0900 Subject: [PATCH] zip: use []byte instead of string to prevent allocs (#537) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit goos: linux goarch: amd64 pkg: github.com/gabriel-vasile/mimetype cpu: Intel(R) Core(TM) i7-10510U CPU @ 1.80GHz │ master │ dev │ │ sec/op │ sec/op vs base │ Common/.xlsx-8 482.0n ± 4% 446.7n ± 3% -7.33% (p=0.000 n=10) Common/.pptx-8 1.999µ ± 3% 1.542µ ± 3% -22.89% (p=0.000 n=10) Common/.docx-8 1.501µ ± 2% 1.139µ ± 3% -24.13% (p=0.000 n=10) Common/.tar-8 1.134µ ± 3% 1.127µ ± 2% ~ (p=0.670 n=10) Common/.zip-8 882.4n ± 1% 778.6n ± 4% -11.75% (p=0.000 n=10) Common/.pdf-8 363.9n ± 2% 352.9n ± 2% -3.02% (p=0.005 n=10) Common/.jpg-8 508.9n ± 4% 504.3n ± 2% ~ (p=0.190 n=10) Common/.png-8 497.1n ± 3% 495.2n ± 2% ~ (p=0.971 n=10) Common/.gif-8 594.9n ± 5% 591.8n ± 2% ~ (p=0.481 n=10) Common/.xls-8 602.7n ± 4% 598.2n ± 11% ~ (p=0.436 n=10) Common/.webm-8 1.541µ ± 3% 1.549µ ± 3% ~ (p=0.645 n=10) Common/.csv-8 10.89µ ± 1% 10.96µ ± 1% ~ (p=0.225 n=10) geomean 981.7n 919.3n -6.36% │ master │ dev │ │ B/op │ B/op vs base │ Common/.xlsx-8 312.0 ± 0% 288.0 ± 0% -7.69% (p=0.000 n=10) Common/.pptx-8 592.0 ± 0% 288.0 ± 0% -51.35% (p=0.000 n=10) Common/.docx-8 504.0 ± 0% 288.0 ± 0% -42.86% (p=0.000 n=10) Common/.tar-8 200.0 ± 0% 200.0 ± 0% ~ (p=1.000 n=10) ¹ Common/.zip-8 224.0 ± 0% 192.0 ± 0% -14.29% (p=0.000 n=10) Common/.pdf-8 192.0 ± 0% 192.0 ± 0% ~ (p=1.000 n=10) ¹ Common/.jpg-8 192.0 ± 0% 192.0 ± 0% ~ (p=1.000 n=10) ¹ Common/.png-8 192.0 ± 0% 192.0 ± 0% ~ (p=1.000 n=10) ¹ Common/.gif-8 192.0 ± 0% 192.0 ± 0% ~ (p=1.000 n=10) ¹ Common/.xls-8 288.0 ± 0% 288.0 ± 0% ~ (p=1.000 n=10) ¹ Common/.webm-8 192.0 ± 0% 192.0 ± 0% ~ (p=1.000 n=10) ¹ Common/.csv-8 7.331Ki ± 0% 7.332Ki ± 0% ~ (p=0.179 n=10) geomean 339.6 299.3 -11.85% ¹ all samples are equal │ master │ dev │ │ allocs/op │ allocs/op vs base │ Common/.xlsx-8 4.000 ± 0% 3.000 ± 0% -25.00% (p=0.000 n=10) Common/.pptx-8 15.000 ± 0% 3.000 ± 0% -80.00% (p=0.000 n=10) Common/.docx-8 13.000 ± 0% 3.000 ± 0% -76.92% (p=0.000 n=10) Common/.tar-8 3.000 ± 0% 3.000 ± 0% ~ (p=1.000 n=10) ¹ Common/.zip-8 6.000 ± 0% 2.000 ± 0% -66.67% (p=0.000 n=10) Common/.pdf-8 2.000 ± 0% 2.000 ± 0% ~ (p=1.000 n=10) ¹ Common/.jpg-8 2.000 ± 0% 2.000 ± 0% ~ (p=1.000 n=10) ¹ Common/.png-8 2.000 ± 0% 2.000 ± 0% ~ (p=1.000 n=10) ¹ Common/.gif-8 2.000 ± 0% 2.000 ± 0% ~ (p=1.000 n=10) ¹ Common/.xls-8 3.000 ± 0% 3.000 ± 0% ~ (p=1.000 n=10) ¹ Common/.webm-8 2.000 ± 0% 2.000 ± 0% ~ (p=1.000 n=10) ¹ Common/.csv-8 33.00 ± 0% 33.00 ± 0% ~ (p=1.000 n=10) ¹ geomean 4.339 2.991 -31.05% ¹ all samples are equal --- internal/magic/ms_office.go | 70 ++++++++++++++++++------------------- internal/magic/zip.go | 13 ++++--- 2 files changed, 41 insertions(+), 42 deletions(-) diff --git a/internal/magic/ms_office.go b/internal/magic/ms_office.go index 5964ce59..a1180173 100644 --- a/internal/magic/ms_office.go +++ b/internal/magic/ms_office.go @@ -6,41 +6,41 @@ import ( ) var ( - xlsxSigFiles = []string{ - "xl/worksheets/", - "xl/drawings/", - "xl/theme/", - "xl/_rels/", - "xl/styles.xml", - "xl/workbook.xml", - "xl/sharedStrings.xml", - } - docxSigFiles = []string{ - "word/media/", - "word/_rels/document.xml.rels", - "word/document.xml", - "word/styles.xml", - "word/fontTable.xml", - "word/settings.xml", - "word/numbering.xml", - "word/header", - "word/footer", - } - pptxSigFiles = []string{ - "ppt/slides/", - "ppt/media/", - "ppt/slideLayouts/", - "ppt/theme/", - "ppt/slideMasters/", - "ppt/tags/", - "ppt/notesMasters/", - "ppt/_rels/", - "ppt/handoutMasters/", - "ppt/notesSlides/", - "ppt/presentation.xml", - "ppt/tableStyles.xml", - "ppt/presProps.xml", - "ppt/viewProps.xml", + xlsxSigFiles = [][]byte{ + []byte("xl/worksheets/"), + []byte("xl/drawings/"), + []byte("xl/theme/"), + []byte("xl/_rels/"), + []byte("xl/styles.xml"), + []byte("xl/workbook.xml"), + []byte("xl/sharedStrings.xml"), + } + docxSigFiles = [][]byte{ + []byte("word/media/"), + []byte("word/_rels/document.xml.rels"), + []byte("word/document.xml"), + []byte("word/styles.xml"), + []byte("word/fontTable.xml"), + []byte("word/settings.xml"), + []byte("word/numbering.xml"), + []byte("word/header"), + []byte("word/footer"), + } + pptxSigFiles = [][]byte{ + []byte("ppt/slides/"), + []byte("ppt/media/"), + []byte("ppt/slideLayouts/"), + []byte("ppt/theme/"), + []byte("ppt/slideMasters/"), + []byte("ppt/tags/"), + []byte("ppt/notesMasters/"), + []byte("ppt/_rels/"), + []byte("ppt/handoutMasters/"), + []byte("ppt/notesSlides/"), + []byte("ppt/presentation.xml"), + []byte("ppt/tableStyles.xml"), + []byte("ppt/presProps.xml"), + []byte("ppt/viewProps.xml"), } ) diff --git a/internal/magic/zip.go b/internal/magic/zip.go index dabee947..aaa27559 100644 --- a/internal/magic/zip.go +++ b/internal/magic/zip.go @@ -3,7 +3,6 @@ package magic import ( "bytes" "encoding/binary" - "strings" ) var ( @@ -43,7 +42,7 @@ func Zip(raw []byte, limit uint32) bool { // Jar matches a Java archive file. func Jar(raw []byte, limit uint32) bool { - return zipContains(raw, "META-INF/MANIFEST.MF") + return zipContains(raw, []byte("META-INF/MANIFEST.MF")) } // zipTokenizer holds the source zip file and scanned index. @@ -54,7 +53,7 @@ type zipTokenizer struct { // next returns the next file name from the zip headers. // https://web.archive.org/web/20191129114319/https://users.cs.jmu.edu/buchhofp/forensics/formats/pkzip.html -func (t *zipTokenizer) next() (fileName string) { +func (t *zipTokenizer) next() (fileName []byte) { if t.i > len(t.in) { return } @@ -74,15 +73,15 @@ func (t *zipTokenizer) next() (fileName string) { return } t.i += fNameOffset + fNameLen - return string(in[fNameOffset : fNameOffset+fNameLen]) + return in[fNameOffset : fNameOffset+fNameLen] } // zipContains returns true if the zip file headers from in contain any of the paths. -func zipContains(in []byte, paths ...string) bool { +func zipContains(in []byte, paths ...[]byte) bool { t := zipTokenizer{in: in} - for i, tok := 0, t.next(); tok != ""; i, tok = i+1, t.next() { + for tok := t.next(); len(tok) != 0; tok = t.next() { for p := range paths { - if strings.HasPrefix(tok, paths[p]) { + if bytes.HasPrefix(tok, paths[p]) { return true } }