diff --git a/README.md b/README.md index d25aba7..b73d7ea 100644 --- a/README.md +++ b/README.md @@ -21,13 +21,22 @@ The API is really simple; one type, holding three methods, is exported (and addi ```go New(options ...CommonOption) *HIBP HIBP#Sync(options ...SyncOption) error // Syncs the local copy with the upstream database -HIBP#Export(w io.Writer, options ...ExportOption) error // Writes a continuous, decompressed and "free-of-etags" stream to the given io.Writer -HIBP#.Query("ABCDE") (io.ReadClose, error) // Returns the k-proximity API result as the upstream API would +HIBP#Export(w io.Writer, options ...ExportOption) error // Writes a continuous, decompressed and "free-of-etags" stream to the given io.Writer with the lines being prefixed by the k-proximity range +HIBP#Query("ABCDE") (io.ReadCloser, error) // Returns the k-proximity API result as the upstream API would (without the k-proximity range as prefix) ``` -All operates operate on disk but, depending on the medium, should provide access times that are probably good enough for all scenarios. +All of them operate on disk but, depending on the medium, should provide access times that are probably good enough for all scenarios. A memory-based `tmpfs` will speed things up when necessary. +**Attention:** +The [official API](https://haveibeenpwned.com/API/v3#PwnedPasswords) states the following regarding the format: + +> Each password is stored as both a SHA-1 and an NTLM hash of a UTF-8 encoded password. +> The downloadable source data delimits the hash and the password count with a colon (:) and each line with a CRLF. + +The crucial part is that lines end with `\r\n`. +In order to be compatible with the upstream API this library sticks to this... 
+ ## CLI diff --git a/export.go b/export.go index e340923..12f35d2 100644 --- a/export.go +++ b/export.go @@ -1,11 +1,15 @@ package hibp import ( + "bufio" + "bytes" "fmt" "io" ) -var lineSeparator = []byte("\n") +// The upstream Have-I-Been-Pwned API uses CRLF as line separator - so we are stuck with it, +// although it does not feel right. +var lineSeparator = []byte("\r\n") func export(from, to int64, store storage, w io.Writer) error { for i := from; i < to; i++ { @@ -18,7 +22,17 @@ func export(from, to int64, store storage, w io.Writer) error { } defer dataReader.Close() - if _, err := io.Copy(w, dataReader); err != nil { + lines, err := io.ReadAll(dataReader) + if err != nil { + return fmt.Errorf("reading data for range %q: %w", rangePrefix, err) + } + + prefixedLines, err := prefixLines(lines, rangePrefix) + if err != nil { + return fmt.Errorf("prefixing lines for range %q: %w", rangePrefix, err) + } + + if _, err := w.Write(prefixedLines); err != nil { return fmt.Errorf("writing data for range %q: %w", rangePrefix, err) } @@ -42,3 +56,39 @@ func export(from, to int64, store storage, w io.Writer) error { return nil } + +// prefixLines prepends prefix to every line of in (lines may end in LF or CRLF) and joins them with lineSeparator. +// The result carries no trailing line separator; a scan failure is returned instead of a truncated result. +func prefixLines(in []byte, prefix string) ([]byte, error) { + firstLine := true + + // Actually, we know that the size will be: len(in) + rows * len(prefix) + // But we do not know the number of rows - so starting from len(in) seems to be a good choice. 
+ out := bytes.NewBuffer(make([]byte, 0, len(in))) + + scanner := bufio.NewScanner(bytes.NewReader(in)) + for scanner.Scan() { + if !firstLine { + if _, err := out.Write(lineSeparator); err != nil { + return nil, fmt.Errorf("adding line separator: %w", err) + } + } + + firstLine = false + + if _, err := out.Write([]byte(prefix)); err != nil { + return nil, fmt.Errorf("adding prefix: %w", err) + } + + if _, err := out.Write(scanner.Bytes()); err != nil { + return nil, fmt.Errorf("adding suffix and counter: %w", err) + } + } + + // Scan stops silently on failure (e.g. an overlong line), which would otherwise truncate the output unnoticed. + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("scanning lines: %w", err) + } + + return out.Bytes(), nil +} diff --git a/export_test.go b/export_test.go index 732c90a..7871ad5 100644 --- a/export_test.go +++ b/export_test.go @@ -12,9 +12,9 @@ func TestExport(t *testing.T) { ctrl := gomock.NewController(t) storageMock := NewMockstorage(ctrl) - storageMock.EXPECT().LoadData("00000").Return(io.NopCloser(bytes.NewReader([]byte("00000suffix:counter11\n00000suffix:counter12"))), nil) - storageMock.EXPECT().LoadData("00001").Return(io.NopCloser(bytes.NewReader([]byte("00001suffix:counter2"))), nil) - storageMock.EXPECT().LoadData("00002").Return(io.NopCloser(bytes.NewReader([]byte("00002suffix:counter3"))), nil) + storageMock.EXPECT().LoadData("00000").Return(io.NopCloser(bytes.NewReader([]byte("suffix:counter11\r\nsuffix:counter12"))), nil) + storageMock.EXPECT().LoadData("00001").Return(io.NopCloser(bytes.NewReader([]byte("suffix:counter2"))), nil) + storageMock.EXPECT().LoadData("00002").Return(io.NopCloser(bytes.NewReader([]byte("suffix:counter3"))), nil) buf := bytes.NewBuffer([]byte{}) @@ -22,7 +22,11 @@ func TestExport(t *testing.T) { t.Fatalf("unexpected error: %v", err) } - if buf.String() != "00000suffix:counter11\n00000suffix:counter12\n00001suffix:counter2\n00002suffix:counter3" { + // We expect the lines to be prefixed with the range, unlike the per-range response of the official + // HIBP API. 
+ // This has to be the case because `Export` iterates over all ranges; different from `Query` which only + // queries a single range. + if buf.String() != "00000suffix:counter11\r\n00000suffix:counter12\r\n00001suffix:counter2\r\n00002suffix:counter3" { t.Fatalf("unexpected output: %q", buf.String()) } } diff --git a/lib.go b/lib.go index fa423be..80ff857 100644 --- a/lib.go +++ b/lib.go @@ -91,7 +91,9 @@ func (h *HIBP) Sync(options ...SyncOption) error { } // Export writes the dataset to the given writer. -// The data is written in the same format as it is provided by the Have-I-Been-Pwned API itself. +// The data is written as a continuous stream with no indication of the "prefix boundaries"; +// the format therefore differs from the official Have-I-Been-Pwned API and from `Query`, which mimics the API. +// Lines have the schema "<prefix><suffix>:<count>". func (h *HIBP) Export(w io.Writer) error { return export(0, defaultLastRange+1, h.store, w) } @@ -100,6 +102,8 @@ func (h *HIBP) Export(w io.Writer) error { // The function returns an io.ReadCloser that can be used to read the data, it should be closed as soon as possible // to release the read lock on the file. // It is the responsibility of the caller to close the returned io.ReadCloser. +// The resulting lines do NOT start with the prefix; they follow the schema "<suffix>:<count>". +// This is equivalent to the response of the official Have-I-Been-Pwned API. 
func (h *HIBP) Query(prefix string) (io.ReadCloser, error) { reader, err := h.store.LoadData(prefix) if err != nil { diff --git a/lib_test.go b/lib_test.go index 64df1bb..4189a04 100644 --- a/lib_test.go +++ b/lib_test.go @@ -1,11 +1,39 @@ package hibp import ( + "bytes" + "go.uber.org/mock/gomock" "io" "math/rand" "testing" ) +func TestQuery(t *testing.T) { + ctrl := gomock.NewController(t) + storageMock := NewMockstorage(ctrl) + + storageMock.EXPECT().LoadData("00000").Return(io.NopCloser(bytes.NewReader([]byte("suffix:counter11\r\nsuffix:counter12"))), nil) + + i := HIBP{store: storageMock} + + reader, err := i.Query("00000") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + defer reader.Close() + + lines, err := io.ReadAll(reader) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // We expect the lines to not be prefixed with the range as this is what the response from the official + // HIBP API looks like. + if string(lines) != "suffix:counter11\r\nsuffix:counter12" { + t.Fatalf("unexpected output: %q", string(lines)) + } +} + func BenchmarkQuery(b *testing.B) { const lastRange = 0x0000A diff --git a/sync.go b/sync.go index 95fbeb8..3befeab 100644 --- a/sync.go +++ b/sync.go @@ -1,8 +1,6 @@ package hibp import ( - "bufio" - "bytes" "context" "errors" "fmt" @@ -36,7 +34,13 @@ func sync(ctx context.Context, from, to int64, client *hibpClient, store storage pool.Submit(func() { rangePrefix := toRangeString(current) - err := func() error { + err := func() (innerErr error) { + defer func() { + if r := recover(); r != nil { + innerErr = fmt.Errorf("recovered panic: %v", r) + } + }() + inFlightSet.Add(current) // We basically ignore any error here because we can still process the range even if we can't load the etag @@ -51,12 +55,7 @@ func sync(ctx context.Context, from, to int64, client *hibpClient, store storage } if !resp.NotModified { - prefixedLines, err := prefixLines(resp.Data, rangePrefix) - if err != nil { - return 
fmt.Errorf("prefixing lines: %w", err) - } - - if err := store.Save(rangePrefix, resp.ETag, prefixedLines); err != nil { + if err := store.Save(rangePrefix, resp.ETag, resp.Data); err != nil { return fmt.Errorf("saving range: %w", err) } } @@ -98,35 +97,6 @@ func toRangeString(i int64) string { return fmt.Sprintf("%05X", i) } -func prefixLines(in []byte, prefix string) ([]byte, error) { - firstLine := true - - // Actually, we know that the size will be: len(in) + rows * len(prefix) - // But we do not know the number of rows - so starting from len(in) seems to be a good choice. - out := bytes.NewBuffer(make([]byte, 0, len(in))) - - scanner := bufio.NewScanner(bytes.NewReader(in)) - for scanner.Scan() { - if !firstLine { - if _, err := out.Write(lineSeparator); err != nil { - return nil, fmt.Errorf("adding line separator: %w", err) - } - } - - firstLine = false - - if _, err := out.Write([]byte(prefix)); err != nil { - return nil, fmt.Errorf("adding prefix: %w", err) - } - - if _, err := out.Write(scanner.Bytes()); err != nil { - return nil, fmt.Errorf("adding suffix and counter: %w", err) - } - } - - return out.Bytes(), nil -} - func lowestInFlight(inFlight mapset.Set[int64], to int64) int64 { lowest := int64(math.MaxInt64) diff --git a/sync_test.go b/sync_test.go index ceed45a..dddae7a 100644 --- a/sync_test.go +++ b/sync_test.go @@ -23,63 +23,63 @@ func TestSync(t *testing.T) { Get("/range/00000"). Reply(200). AddHeader("ETag", "etag"). - BodyString("suffix1") + BodyString("suffix1:1") gock.New(baseURL). Get("/range/00001"). MatchHeader("If-None-Match", "etag received earlier"). Reply(http.StatusNotModified). AddHeader("ETag", "etag received earlier"). - BodyString("suffix2") + BodyString("suffix2:2") gock.New(baseURL). Get("/range/00002"). Reply(200). AddHeader("ETag", "etag"). - BodyString("suffix31:2\nsuffix32:3") + BodyString("suffix31:2\r\nsuffix32:3") gock.New(baseURL). Get("/range/00003"). Reply(200). AddHeader("ETag", "etag"). 
- BodyString("suffix4") + BodyString("suffix4:4") gock.New(baseURL). Get("/range/00004"). Reply(200). AddHeader("ETag", "etag"). - BodyString("suffix5") + BodyString("suffix5:5") gock.New(baseURL). Get("/range/00005"). Reply(200). AddHeader("ETag", "etag"). - BodyString("suffix6") + BodyString("suffix6:6") gock.New(baseURL). Get("/range/00006"). Reply(200). AddHeader("ETag", "etag"). - BodyString("suffix7") + BodyString("suffix7:7") gock.New(baseURL). Get("/range/00007"). Reply(200). AddHeader("ETag", "etag"). - BodyString("suffix8") + BodyString("suffix8:8") gock.New(baseURL). Get("/range/00008"). Reply(200). AddHeader("ETag", "etag"). - BodyString("suffix9") + BodyString("suffix9:9") gock.New(baseURL). Get("/range/00009"). Reply(200). AddHeader("ETag", "etag"). - BodyString("suffix10") + BodyString("suffix10:10") gock.New(baseURL). Get("/range/0000A"). Reply(200). AddHeader("ETag", "etag"). - BodyString("suffix11") + BodyString("suffix11:11") gock.New(baseURL). Get("/range/0000B"). Reply(200). AddHeader("ETag", "etag"). 
- BodyString("suffix12") + BodyString("suffix12:12") client := &hibpClient{ endpoint: defaultEndpoint, @@ -90,29 +90,29 @@ func TestSync(t *testing.T) { storageMock := NewMockstorage(ctrl) storageMock.EXPECT().LoadETag("00000").Return("", nil) - storageMock.EXPECT().Save("00000", "etag", []byte("00000suffix1")).Return(nil) + storageMock.EXPECT().Save("00000", "etag", []byte("suffix1:1")).Return(nil) storageMock.EXPECT().LoadETag("00001").Return("etag received earlier", nil) // 00001 does not need to be written as its ETag has not changed storageMock.EXPECT().LoadETag("00002").Return("", nil) - storageMock.EXPECT().Save("00002", "etag", []byte("00002suffix31:2\n00002suffix32:3")).Return(nil) + storageMock.EXPECT().Save("00002", "etag", []byte("suffix31:2\r\nsuffix32:3")).Return(nil) storageMock.EXPECT().LoadETag("00003").Return("", nil) - storageMock.EXPECT().Save("00003", "etag", []byte("00003suffix4")).Return(nil) + storageMock.EXPECT().Save("00003", "etag", []byte("suffix4:4")).Return(nil) storageMock.EXPECT().LoadETag("00004").Return("", nil) - storageMock.EXPECT().Save("00004", "etag", []byte("00004suffix5")).Return(nil) + storageMock.EXPECT().Save("00004", "etag", []byte("suffix5:5")).Return(nil) storageMock.EXPECT().LoadETag("00005").Return("", nil) - storageMock.EXPECT().Save("00005", "etag", []byte("00005suffix6")).Return(nil) + storageMock.EXPECT().Save("00005", "etag", []byte("suffix6:6")).Return(nil) storageMock.EXPECT().LoadETag("00006").Return("", nil) - storageMock.EXPECT().Save("00006", "etag", []byte("00006suffix7")).Return(nil) + storageMock.EXPECT().Save("00006", "etag", []byte("suffix7:7")).Return(nil) storageMock.EXPECT().LoadETag("00007").Return("", nil) - storageMock.EXPECT().Save("00007", "etag", []byte("00007suffix8")).Return(nil) + storageMock.EXPECT().Save("00007", "etag", []byte("suffix8:8")).Return(nil) storageMock.EXPECT().LoadETag("00008").Return("", nil) - storageMock.EXPECT().Save("00008", "etag", []byte("00008suffix9")).Return(nil) 
+ storageMock.EXPECT().Save("00008", "etag", []byte("suffix9:9")).Return(nil) storageMock.EXPECT().LoadETag("00009").Return("", nil) - storageMock.EXPECT().Save("00009", "etag", []byte("00009suffix10")).Return(nil) + storageMock.EXPECT().Save("00009", "etag", []byte("suffix10:10")).Return(nil) storageMock.EXPECT().LoadETag("0000A").Return("", nil) - storageMock.EXPECT().Save("0000A", "etag", []byte("0000Asuffix11")).Return(nil) + storageMock.EXPECT().Save("0000A", "etag", []byte("suffix11:11")).Return(nil) storageMock.EXPECT().LoadETag("0000B").Return("", nil) - storageMock.EXPECT().Save("0000B", "etag", []byte("0000Bsuffix12")).Return(nil) + storageMock.EXPECT().Save("0000B", "etag", []byte("suffix12:12")).Return(nil) var callCounter atomic.Int64