From 6e57ca2199df52a494e974dc21c2898e80581849 Mon Sep 17 00:00:00 2001 From: Leszek Swirski Date: Mon, 15 Jul 2024 10:22:59 +0200 Subject: [PATCH] Use llvm-symbolizer's JSON output for symbolizing In some edge cases (e.g. injected JIT symbols), function names can have new lines. This breaks the llvm-symbolizer output parsing, and makes pprof hang. Conveniently, as of LLVM 13, llvm-symbolizer has a JSON output mode, which is robust against all kinds of weirdness like new lines. We can use this instead of the line-based parsing, and as a bonus we get much simpler handling of multiple frames in a stack, as the JSON output already returns these as an array. This also requires splitting the CODE and DATA processing into separate functions, since their JSON output is incompatible. For now, we keep the DATA output as before, a slightly hacky but functional concatenation of start + size, but this could be improved. --- internal/binutils/addr2liner_llvm.go | 118 +++++++++--------- .../binutils/testdata/fake-llvm-symbolizer | 16 +-- 2 files changed, 63 insertions(+), 71 deletions(-) diff --git a/internal/binutils/addr2liner_llvm.go b/internal/binutils/addr2liner_llvm.go index 3049545b6..5e51644f6 100644 --- a/internal/binutils/addr2liner_llvm.go +++ b/internal/binutils/addr2liner_llvm.go @@ -16,6 +16,7 @@ package binutils import ( "bufio" + "encoding/json" "fmt" "io" "os/exec" @@ -37,6 +38,7 @@ type llvmSymbolizer struct { filename string rw lineReaderWriter base uint64 + isData bool } type llvmSymbolizerJob struct { @@ -76,7 +78,7 @@ func newLLVMSymbolizer(cmd, file string, base uint64, isData bool) (*llvmSymboli } j := &llvmSymbolizerJob{ - cmd: exec.Command(cmd, "--inlining", "-demangle=false"), + cmd: exec.Command(cmd, "--inlining", "-demangle=false", "--output-style=JSON"), symType: "CODE", } if isData { @@ -102,63 +104,67 @@ func newLLVMSymbolizer(cmd, file string, base uint64, isData bool) (*llvmSymboli filename: file, rw: j, base: base, + isData: isData, } return a, 
nil } -// readFrame parses the llvm-symbolizer output for a single address. It -// returns a populated plugin.Frame and whether it has reached the end of the -// data. -func (d *llvmSymbolizer) readFrame() (plugin.Frame, bool) { - funcname, err := d.rw.readLine() +// readDataFrames parses the llvm-symbolizer DATA output for a single address. It +// returns a populated plugin.Frame array with a single entry. +func (d *llvmSymbolizer) readDataFrames() ([]plugin.Frame, error) { + line, err := d.rw.readLine() if err != nil { - return plugin.Frame{}, true + return nil, err } - - switch funcname { - case "": - return plugin.Frame{}, true - case "??": - funcname = "" + var frame struct { + Address string `json:"Address"` + ModuleName string `json:"ModuleName"` + Data struct { + Start string `json:"Start"` + Size string `json:"Size"` + Name string `json:"Name"` + } `json:"Data"` + } + if err := json.Unmarshal([]byte(line), &frame); err != nil { + return nil, err + } + // Match non-JSON output behaviour of stuffing the start/size into the filename of a single frame, + // with the size being a decimal value. + size, err := strconv.ParseInt(frame.Data.Size, 0, 0) + if err != nil { + return nil, err } + var stack []plugin.Frame + stack = append(stack, plugin.Frame{Func: frame.Data.Name, File: fmt.Sprintf("%s %d", frame.Data.Start, size)}) + return stack, nil +} - fileline, err := d.rw.readLine() +// readCodeFrames parses the llvm-symbolizer CODE output for a single address. It +// returns a populated plugin.Frame array. +func (d *llvmSymbolizer) readCodeFrames() ([]plugin.Frame, error) { + line, err := d.rw.readLine() if err != nil { - return plugin.Frame{Func: funcname}, true - } - - linenumber := 0 - columnnumber := 0 - // The llvm-symbolizer outputs the <file_name>:<line_number>:<column_number>. - // When it cannot identify the source code location, it outputs "??:0:0". - // Older versions output just the filename and line number, so we check for - // both conditions here. 
- if fileline == "??:0" || fileline == "??:0:0" { - fileline = "" - } else { - switch split := strings.Split(fileline, ":"); len(split) { - case 3: - // filename:line:column - if col, err := strconv.Atoi(split[2]); err == nil { - columnnumber = col - } - fallthrough - case 2: - // filename:line - if line, err := strconv.Atoi(split[1]); err == nil { - linenumber = line - } - fallthrough - case 1: - // filename - fileline = split[0] - default: - // Unrecognized, ignore - } - } - - return plugin.Frame{Func: funcname, File: fileline, Line: linenumber, Column: columnnumber}, false + return nil, err + } + var frame struct { + Address string `json:"Address"` + ModuleName string `json:"ModuleName"` + Symbol []struct { + Line int `json:"Line"` + Column int `json:"Column"` + FunctionName string `json:"FunctionName"` + FileName string `json:"FileName"` + } `json:"Symbol"` + } + if err := json.Unmarshal([]byte(line), &frame); err != nil { + return nil, err + } + var stack []plugin.Frame + for _, s := range frame.Symbol { + stack = append(stack, plugin.Frame{Func: s.FunctionName, File: s.FileName, Line: s.Line, Column: s.Column}) + } + return stack, nil } // addrInfo returns the stack frame information for a specific program @@ -170,18 +176,8 @@ func (d *llvmSymbolizer) addrInfo(addr uint64) ([]plugin.Frame, error) { if err := d.rw.write(fmt.Sprintf("%s 0x%x", d.filename, addr-d.base)); err != nil { return nil, err } - - var stack []plugin.Frame - for { - frame, end := d.readFrame() - if end { - break - } - - if frame != (plugin.Frame{}) { - stack = append(stack, frame) - } + if d.isData { + return d.readDataFrames() } - - return stack, nil + return d.readCodeFrames() } diff --git a/internal/binutils/testdata/fake-llvm-symbolizer b/internal/binutils/testdata/fake-llvm-symbolizer index a3b4546d9..507761c9e 100755 --- a/internal/binutils/testdata/fake-llvm-symbolizer +++ b/internal/binutils/testdata/fake-llvm-symbolizer @@ -22,22 +22,18 @@ IFS=" " while read line; do # line has 
form: # filename 0xaddr - # Emit dummy output that matches llvm-symbolizer output format. + # Emit dummy output that matches llvm-symbolizer JSON output format. set -- ${line} kind=$1 fname=$2 addr=$3 case ${kind} in CODE) - echo "Inlined_${addr}" - echo "${fname}.h" - echo "Func_${addr}" - echo "${fname}.c:2:1" - echo;; + echo "{\"Address\":\"${addr}\",\"ModuleName\":\"${fname}\",\"Symbol\":[{\"Column\":0,\"FileName\":\"${fname}.h\",\"FunctionName\":\"Inlined_${addr}\",\"Line\":0},{\"Column\":1,\"FileName\":\"${fname}.c\",\"FunctionName\":\"Func_${addr}\",\"Line\":2}]}" + ;; DATA) - echo "${fname}_${addr}" - echo "${addr} 8" - echo;; - *) echo ${kind} ${fname} ${addr};; + echo "{\"Address\":\"${addr}\",\"ModuleName\":\"${fname}\",\"Data\":{\"Name\":\"${fname}_${addr}\",\"Size\":\"0x8\",\"Start\":\"${addr}\"}}" + ;; + *) exit 1;; esac done