From 0871c7dbc08e4549b3d6b3d9b1d1699f3e6293d6 Mon Sep 17 00:00:00 2001 From: Florian Loch Date: Tue, 5 Mar 2024 19:25:57 +0100 Subject: [PATCH] refactor: query should not return lines being prefixed with the k-proximity range in order to mimick upstream API --- README.md | 6 +++--- cmd/sync/main.go | 2 +- export.go | 43 ++++++++++++++++++++++++++++++++++++++++++- export_test.go | 10 +++++++--- lib.go | 6 +++++- lib_test.go | 28 ++++++++++++++++++++++++++++ sync.go | 38 +------------------------------------- 7 files changed, 87 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index d25aba7..52f656f 100644 --- a/README.md +++ b/README.md @@ -21,11 +21,11 @@ The API is really simple; one type, holding three methods, is exported (and addi ```go New(options ...CommonOption) *HIBP HIBP#Sync(options ...SyncOption) error // Syncs the local copy with the upstream database -HIBP#Export(w io.Writer, options ...ExportOption) error // Writes a continuous, decompressed and "free-of-etags" stream to the given io.Writer -HIBP#.Query("ABCDE") (io.ReadClose, error) // Returns the k-proximity API result as the upstream API would +HIBP#Export(w io.Writer, options ...ExportOption) error // Writes a continuous, decompressed and "free-of-etags" stream to the given io.Writer with the lines being prefix by the k-proximity range +HIBP#.Query("ABCDE") (io.ReadClose, error) // Returns the k-proximity API result as the upstream API would (without the k-proximity range as prefix) ``` -All operates operate on disk but, depending on the medium, should provide access times that are probably good enough for all scenarios. +All of them operate on disk but, depending on the medium, should provide access times that are probably good enough for all scenarios. A memory-based `tmpfs` will speed things up when necessary. diff --git a/cmd/sync/main.go b/cmd/sync/main.go index 9ae4bea..ae88ce0 100644 --- a/cmd/sync/main.go +++ b/cmd/sync/main.go @@ -69,7 +69,7 @@ func run(dataDir string) error { return nil } - h := hibp.New(hibp.WithDataDir(dataDir)) + h := hibp.New(hibp.WithDataDir(dataDir), hibp.WithNoCompression()) if err := h.Sync( hibp.SyncWithProgressFn(updateProgressBar), diff --git a/export.go b/export.go index e340923..1b65fad 100644 --- a/export.go +++ b/export.go @@ -1,6 +1,8 @@ package hibp import ( + "bufio" + "bytes" "fmt" "io" ) @@ -18,7 +20,17 @@ func export(from, to int64, store storage, w io.Writer) error { } defer dataReader.Close() - if _, err := io.Copy(w, dataReader); err != nil { + lines, err := io.ReadAll(dataReader) + if err != nil { + return fmt.Errorf("reading data for range %q: %w", rangePrefix, err) + } + + prefixedLines, err := prefixLines(lines, rangePrefix) + if err != nil { + return fmt.Errorf("prefixing lines for range %q: %w", rangePrefix, err) + } + + if _, err := w.Write(prefixedLines); err != nil { return fmt.Errorf("writing data for range %q: %w", rangePrefix, err) } @@ -42,3 +54,32 @@ func export(from, to int64, store storage, w io.Writer) error { return nil } + +func prefixLines(in []byte, prefix string) ([]byte, error) { + firstLine := true + + // Actually, we know that the size will be: len(in) + rows * len(prefix) + // But we do not know the number of rows - so starting from len(in) seems to be a good choice. + out := bytes.NewBuffer(make([]byte, 0, len(in))) + + scanner := bufio.NewScanner(bytes.NewReader(in)) + for scanner.Scan() { + if !firstLine { + if _, err := out.Write(lineSeparator); err != nil { + return nil, fmt.Errorf("adding line separator: %w", err) + } + } + + firstLine = false + + if _, err := out.Write([]byte(prefix)); err != nil { + return nil, fmt.Errorf("adding prefix: %w", err) + } + + if _, err := out.Write(scanner.Bytes()); err != nil { + return nil, fmt.Errorf("adding suffix and counter: %w", err) + } + } + + return out.Bytes(), nil +} diff --git a/export_test.go b/export_test.go index 732c90a..037d02c 100644 --- a/export_test.go +++ b/export_test.go @@ -12,9 +12,9 @@ func TestExport(t *testing.T) { ctrl := gomock.NewController(t) storageMock := NewMockstorage(ctrl) - storageMock.EXPECT().LoadData("00000").Return(io.NopCloser(bytes.NewReader([]byte("00000suffix:counter11\n00000suffix:counter12"))), nil) - storageMock.EXPECT().LoadData("00001").Return(io.NopCloser(bytes.NewReader([]byte("00001suffix:counter2"))), nil) - storageMock.EXPECT().LoadData("00002").Return(io.NopCloser(bytes.NewReader([]byte("00002suffix:counter3"))), nil) + storageMock.EXPECT().LoadData("00000").Return(io.NopCloser(bytes.NewReader([]byte("suffix:counter11\nsuffix:counter12"))), nil) + storageMock.EXPECT().LoadData("00001").Return(io.NopCloser(bytes.NewReader([]byte("suffix:counter2"))), nil) + storageMock.EXPECT().LoadData("00002").Return(io.NopCloser(bytes.NewReader([]byte("suffix:counter3"))), nil) buf := bytes.NewBuffer([]byte{}) @@ -22,6 +22,10 @@ func TestExport(t *testing.T) { t.Fatalf("unexpected error: %v", err) } + // We expect the lines to be prefixed with the range as this is what the response from the official + // HIBP API looks like. + // This has to be the case because `Export` iterates over all ranges; different from `Query` which only + // queries a single range. if buf.String() != "00000suffix:counter11\n00000suffix:counter12\n00001suffix:counter2\n00002suffix:counter3" { t.Fatalf("unexpected output: %q", buf.String()) } diff --git a/lib.go b/lib.go index fa423be..80ff857 100644 --- a/lib.go +++ b/lib.go @@ -91,7 +91,9 @@ func (h *HIBP) Sync(options ...SyncOption) error { } // Export writes the dataset to the given writer. -// The data is written in the same format as it is provided by the Have-I-Been-Pwned API itself. +// The data is written as a continuous stream with no indication of the "prefix boundaries", +// the format therefore differs from the official Have-I-Been-Pwned API and from `Query`, which is mimicking the API. +// Lines have the schema ":". func (h *HIBP) Export(w io.Writer) error { return export(0, defaultLastRange+1, h.store, w) } @@ -100,6 +102,8 @@ func (h *HIBP) Export(w io.Writer) error { // The function returns an io.ReadCloser that can be used to read the data, it should be closed as soon as possible // to release the read lock on the file. // It is the responsibility of the caller to close the returned io.ReadCloser. +// The resulting lines do NOT start with the prefix, they are following the schema ":". +// This is equivalent to the response of the official Have-I-Been-Pwned API. func (h *HIBP) Query(prefix string) (io.ReadCloser, error) { reader, err := h.store.LoadData(prefix) if err != nil { diff --git a/lib_test.go b/lib_test.go index 64df1bb..95311da 100644 --- a/lib_test.go +++ b/lib_test.go @@ -1,11 +1,39 @@ package hibp import ( + "bytes" + "go.uber.org/mock/gomock" "io" "math/rand" "testing" ) +func TestQuery(t *testing.T) { + ctrl := gomock.NewController(t) + storageMock := NewMockstorage(ctrl) + + storageMock.EXPECT().LoadData("00000").Return(io.NopCloser(bytes.NewReader([]byte("suffix:counter11\nsuffix:counter12"))), nil) + + i := HIBP{store: storageMock} + + reader, err := i.Query("00000") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + defer reader.Close() + + lines, err := io.ReadAll(reader) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // We expect the lines to not be prefixed with the range as this is what the response from the official + // HIBP API looks like. + if string(lines) != "suffix:counter11\nsuffix:counter12" { + t.Fatalf("unexpected output: %q", string(lines)) + } +} + func BenchmarkQuery(b *testing.B) { const lastRange = 0x0000A diff --git a/sync.go b/sync.go index 95fbeb8..de71d79 100644 --- a/sync.go +++ b/sync.go @@ -1,8 +1,6 @@ package hibp import ( - "bufio" - "bytes" "context" "errors" "fmt" @@ -51,12 +49,7 @@ func sync(ctx context.Context, from, to int64, client *hibpClient, store storage } if !resp.NotModified { - prefixedLines, err := prefixLines(resp.Data, rangePrefix) - if err != nil { - return fmt.Errorf("prefixing lines: %w", err) - } - - if err := store.Save(rangePrefix, resp.ETag, prefixedLines); err != nil { + if err := store.Save(rangePrefix, resp.ETag, resp.Data); err != nil { return fmt.Errorf("saving range: %w", err) } } @@ -98,35 +91,6 @@ func toRangeString(i int64) string { return fmt.Sprintf("%05X", i) } -func prefixLines(in []byte, prefix string) ([]byte, error) { - firstLine := true - - // Actually, we know that the size will be: len(in) + rows * len(prefix) - // But we do not know the number of rows - so starting from len(in) seems to be a good choice. - out := bytes.NewBuffer(make([]byte, 0, len(in))) - - scanner := bufio.NewScanner(bytes.NewReader(in)) - for scanner.Scan() { - if !firstLine { - if _, err := out.Write(lineSeparator); err != nil { - return nil, fmt.Errorf("adding line separator: %w", err) - } - } - - firstLine = false - - if _, err := out.Write([]byte(prefix)); err != nil { - return nil, fmt.Errorf("adding prefix: %w", err) - } - - if _, err := out.Write(scanner.Bytes()); err != nil { - return nil, fmt.Errorf("adding suffix and counter: %w", err) - } - } - - return out.Bytes(), nil -} - func lowestInFlight(inFlight mapset.Set[int64], to int64) int64 { lowest := int64(math.MaxInt64)