Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: improve placeholder replacement of byte sizes #13508

Merged
merged 1 commit into from
Jul 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions pkg/pattern/tokenization/replacer.go
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,10 @@ restore: // should be faster than a defer
return false
}

// byteSizes is a lookup table, indexed by byte value, that marks the unit
// prefix letters accepted for byte sizes (kilo through peta, in both cases).
//
// 'b' and 'B' are deliberately left out: with the way byte size units are
// checked, including them would make suffixes like 'Bb' or 'bb' count as
// valid byte sizes. Plain bytes are also only accepted after integer numbers,
// so they are handled through special cases instead of this table.
var byteSizes = [256]bool{
	'k': true, 'K': true,
	'm': true, 'M': true,
	'g': true, 'G': true,
	't': true, 'T': true,
	'p': true, 'P': true,
}

// Only moves the head forward if it successfully matches a byte size
Expand All @@ -339,6 +343,22 @@ func (r *replacer) advanceBytesize(c1 byte) (matched bool) {
return false
}

// advanceSpacedBytesize tries to match a byte-size unit that is separated
// from the preceding number by a space (callers use it for inputs such as
// "3 GiB" or "2 b").
//
// It consumes one character with r.advance and returns false if none could
// be read. When canBeBytes is true, a lone 'b' or 'B' followed by a boundary
// counts as a match; failing that, the character is handed to
// r.advanceBytesize. If neither check succeeds, r.backtrack is called before
// returning false.
func (r *replacer) advanceSpacedBytesize(canBeBytes bool) (matched bool) {
	// Read the character that follows the space.
	unit, ok := r.advance()
	if !ok {
		return false
	}

	switch {
	case canBeBytes && (unit == 'b' || unit == 'B') && r.peekNextIsBoundary():
		// Plain bytes, e.g. "5 B"; only tried when the caller allows it.
		return true
	case r.advanceBytesize(unit):
		// A prefixed unit was recognised by advanceBytesize.
		return true
	}

	// No unit matched: undo the advance before reporting failure.
	r.backtrack()
	return false
}

func (r *replacer) advance() (c byte, advanced bool) {
if r.head >= len(r.source) {
return 0, false
Expand Down Expand Up @@ -394,6 +414,14 @@ func (r *replacer) handleHexOrUnit(hasMinusPrefix bool, n1, l1 uint, c1 byte) (e
c1 = r.peekFirstNonInt()
}

// Special case, this might be a byte size
if (c1 == 'b' || c1 == 'B') && r.peekNextIsBoundary() {
// We do not subsume a minus sign - byte sizes are unlikely to be
// negative, it's more likely this is a dash as a part of a range
r.emit(hasMinusPrefix, placeholderBytesize)
return true
}

// Maybe we are at the start of a hex string, either something like
// "[0-9]+[a-f]", "[0-9]+[A-F]", or "0x". We support both lower and upper
// case letters, but to avoid false positives, we want hex replacements to
Expand Down Expand Up @@ -489,6 +517,14 @@ func (r *replacer) handleNumberWithDecimal(hasMinusPrefix bool, n1 uint, l1 uint
return r.handlePotentialUnitWithDecimal(hasMinusPrefix, b2)
}

// This can be a byte size with a space, e.g. "3.14 GiB"
if b2 == ' ' && r.advanceSpacedBytesize(false) {
// We do not subsume a minus sign - byte sizes are unlikely to be
// negative, it's more likely this is a dash as a part of a range
r.emit(hasMinusPrefix, placeholderBytesize)
return true
}

// We have a decimal number followed by a non-dot boundary, so this is not
// an IP or a version number or anything like that.
if b2 != '.' {
Expand Down Expand Up @@ -633,6 +669,11 @@ func (r *replacer) handleNumberStart(hasMinusPrefix bool) (endsWithBoundary bool
case n1 <= maxYear && l1 <= 4 && (b1 == '-' || b1 == '/'):
return r.handleSaneTimestamp(hasMinusPrefix, n1, b1)

// This might be a byte size with a space, e.g. "2 b", "3 GiB"
case b1 == ' ' && r.advanceSpacedBytesize(true):
r.emit(hasMinusPrefix, placeholderBytesize)
return true

// Weird RFC822 dates like "02 Jan 06 15:04 MST"
case n1 <= 31 && l1 <= 2 && b1 == ' ':
if r.advanceMonthName() && r.advanceChar(' ') && r.advanceYear() && r.advanceChar(' ') && r.advanceTime(true) && r.advanceStringOrNumericTimeZone(false) {
Expand Down
14 changes: 12 additions & 2 deletions pkg/pattern/tokenization/tokenization_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,11 @@ var tokenizationCornerTestCases = []tokenizationTestCase{
[]string{"<NUM>.<DURATION>", "3h121m3.<DURATION>", "1h0.<DURATION>", "100usa", "0.12msa"},
},
{
"2Mib 0.12KB-5GB 3.12kb 123Gbps 124mbit:512Tbit",
[]string{"<BYTESIZE>", "<BYTESIZE>-<BYTESIZE>", "<BYTESIZE>", "<BYTESIZE>", "<BYTESIZE>:<BYTESIZE>"},
// We only consider integers to be valid bytesizes in bytes (0.2B doesn't make sense)
"2Mib 0.12KB-5GB 3.12kb 123Gbps 124mbit:512Tbit 5 B;124.1 KB/3b - 2b or 2 BeNot 13.37 b 3 b",
[]string{
"<BYTESIZE>", "<BYTESIZE>-<BYTESIZE>", "<BYTESIZE>", "<BYTESIZE>", "<BYTESIZE>:<BYTESIZE>",
"<BYTESIZE>;<BYTESIZE>/<BYTESIZE>", "-", "<BYTESIZE>", "or", "<NUM>", "BeNot", "<NUM>", "b", "<BYTESIZE>"},
},
{
`status=123 status_code:500 status 200 status="-1" status_code:"404" httpStatus=200`,
Expand Down Expand Up @@ -175,6 +178,13 @@ var tokenizationRealisticTestCases = []tokenizationTestCase{
"level=debug", "ts=<TIMESTAMP>", "caller=shard_resolver.go:<NUM>", "bytes=<BYTESIZE>", "chunks=<NUM>", "streams=<NUM>", "entries=<NUM>", `msg="queried index"`, "type=single", `matchers="{stream=\"stdout\", pod=\"loki-canary-v75j4\"}"`, "duration=<DURATION>", "from=<TIMESTAMP>", "through=<TIMESTAMP>", "length=<DURATION>",
},
},
// tricky loki distributor message:
{
`level=debug ts=2024-07-12T12:25:06.175464934Z caller=push.go:146 org_id=29 traceID=7af4f918eab1c80f msg="push request parsed" path=/loki/api/v1/push contentType=application/x-protobuf contentEncoding= bodySize="8.8 kB" streams=11 entries=43 streamLabelsSize="3.4 kB" entriesSize="19 kB" structuredMetadataSize="71 B" totalSize="22 kB" mostRecentLagMs=167 adaptiveLogsDroppedLines=10 adaptiveLogsDroppedSize=4965 adaptiveLogsMatchedLines=37`,
[]string{
"level=debug", "ts=<TIMESTAMP>", "caller=push.go:<NUM>", "org_id=<NUM>", "traceID=<HEX>", `msg="push request parsed"`, "path=/loki/api/v1/push", "contentType=application/x-protobuf", "contentEncoding=", `bodySize="<BYTESIZE>"`, "streams=<NUM>", "entries=<NUM>", `streamLabelsSize="<BYTESIZE>"`, `entriesSize="<BYTESIZE>"`, `structuredMetadataSize="<BYTESIZE>"`, `totalSize="<BYTESIZE>"`, "mostRecentLagMs=<NUM>", "adaptiveLogsDroppedLines=<NUM>", "adaptiveLogsDroppedSize=<NUM>", "adaptiveLogsMatchedLines=<NUM>",
},
},
// random JSON logs
{
`{"timestamp": "2022-12-23T12:34:56Z", "level": "debug", "message": "Server starting", "server_id": "abcdefghij", "start_time": "2022-12-23T12:30:00Z"}`,
Expand Down
Loading