New processor extract_array (elastic#11761)

This adds a new processor, extract_array, that allows accessing values inside arrays and copying them to target fields.
leweafan · May 9, 2019 · 9a83039 · 9a83039
1 parent a4676ff
commit 9a83039
Show file tree

Hide file tree

Showing 6 changed files with 512 additions and 5 deletions.
diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc
@@ -143,6 +143,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d
 - Add `add_observer_metadata` processor. {pull}11394[11394]
 - Add `decode_csv_fields` processor. {pull}11753[11753]
 - Add `convert` processor for converting data types of fields. {issue}8124[8124] {pull}11686[11686]
+- New `extract_array` processor. {pull}11761[11761]
 
 *Auditbeat*
 

diff --git a/libbeat/cmd/instance/imports.go b/libbeat/cmd/instance/imports.go
@@ -35,5 +35,6 @@ import (
 	_ "github.com/elastic/beats/libbeat/processors/convert"
 	_ "github.com/elastic/beats/libbeat/processors/dissect"
 	_ "github.com/elastic/beats/libbeat/processors/dns"
+	_ "github.com/elastic/beats/libbeat/processors/extract_array"
 	_ "github.com/elastic/beats/libbeat/publisher/includes" // Register publisher pipeline modules
 )
diff --git a/libbeat/docs/processors-using.asciidoc b/libbeat/docs/processors-using.asciidoc
@@ -211,10 +211,11 @@ The supported processors are:
  * <<community-id,`community_id`>>
  * <<convert,`convert`>>
 ifdef::has_decode_csv_fields_processor[]
-* <<decode-csv-fields,`decode_csv_fields`>>
+ * <<decode-csv-fields,`decode_csv_fields`>>
 endif::[]
  * <<decode-json-fields,`decode_json_fields`>>
  * <<dissect, `dissect`>>
+ * <<extract-array,`extract_array`>>
  * <<processor-dns, `dns`>>
  * <<drop-event,`drop_event`>>
  * <<drop-fields,`drop_fields`>>
@@ -815,16 +816,16 @@ The `decode_csv_fields` has the following settings:
               The default is the comma character. For using a TAB character you
               must set it to "\t".
 `ignore_missing`:: (Optional) Whether to ignore events which lack the source
-                   field. The default is false, which will fail processing of an
-                   event if a field is missing.
+                   field. The default is `false`, which will fail processing of
+                   an event if a field is missing.
 `overwrite_keys`:: Whether the target field is overwritten if it
                    already exists. The default is false, which will fail
                    processing of an event when `target` already exists.
 `trim_leading_space`:: Whether extra space after the separator is trimmed from
                        values. This works even if the separator is also a space.
-                       The default is false.
+                       The default is `false`.
 `fail_on_error`:: (Optional) If set to true, in case of an error the changes to
-the event are reverted and the original event is returned. If set to false,
+the event are reverted, and the original event is returned. If set to `false`,
 processing continues also if an error happens. Default is `true`.
 
 endif::[]
@@ -1746,3 +1747,46 @@ thrown.
 *Example*: `event.AppendTo("error.message", "invalid file hash");`
 |===
 endif::[]
+
+[[extract-array]]
+=== Extract array
+
+experimental[]
+
+The `extract_array` processor populates fields with values read from an array
+field. The following example will populate `source.ip` with the first element of
+the `my_array` field, `destination.ip` with the second element, and
+`network.transport` with the third.
+
+[source,yaml]
+-----------------------------------------------------
+processors:
+ - extract_array:
+     field: my_array
+     mappings:
+        source.ip: 0
+        destination.ip: 1
+        network.transport: 2
+-----------------------------------------------------
+
+The following settings are supported:
+
+`field`:: The array field whose elements are to be extracted.
+`mappings`:: Maps each field name to an array index. Use 0 for the first element in
+             the array. Multiple fields can be mapped to the same array element.
+`ignore_missing`:: (Optional) Whether to ignore events where the array field is
+                   missing. The default is `false`, which will fail processing
+                   of an event if the specified field does not exist. Set it to
+                   `true` to ignore this condition.
+`overwrite_keys`:: Whether the target fields specified in the mapping are
+                   overwritten if they already exist. The default is `false`,
+                   which will fail processing if a target field already exists.
+`fail_on_error`:: (Optional) If set to `true` and an error happens, changes to
+                  the event are reverted, and the original event is returned. If
+                  set to `false`, processing continues despite errors.
+                  Default is `true`.
+`omit_empty`:: (Optional) Whether empty values are extracted from the array. If
+                  set to `true`, instead of the target field being set to an
+                  empty value, it is left unset. The empty string (`""`), an
+                  empty array (`[]`) or an empty object (`{}`) are considered
+                  empty values. Default is `false`.
diff --git a/libbeat/processors/extract_array/extract_array.go b/libbeat/processors/extract_array/extract_array.go
@@ -0,0 +1,189 @@
+// Licensed to Elasticsearch B.V. under one or more contributor
+// license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright
+// ownership. Elasticsearch B.V. licenses this file to you under
+// the Apache License, Version 2.0 (the "License"); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package extract_array
+
+import (
+	"fmt"
+	"reflect"
+	"sort"
+
+	"github.com/pkg/errors"
+
+	"github.com/elastic/beats/libbeat/beat"
+	"github.com/elastic/beats/libbeat/common"
+	"github.com/elastic/beats/libbeat/processors"
+	"github.com/elastic/beats/libbeat/processors/checks"
+)
+
+type config struct {
+	Field         string        `config:"field"`
+	Mappings      common.MapStr `config:"mappings"`
+	IgnoreMissing bool          `config:"ignore_missing"`
+	OmitEmpty     bool          `config:"omit_empty"`
+	OverwriteKeys bool          `config:"overwrite_keys"`
+	FailOnError   bool          `config:"fail_on_error"`
+}
+
+type fieldMapping struct {
+	from int
+	to   string
+}
+
+type extractArrayProcessor struct {
+	config
+	mappings []fieldMapping
+}
+
+var (
+	defaultConfig = config{
+		FailOnError: true,
+	}
+	errNoMappings = errors.New("no mappings defined in extract_array processor")
+)
+
+func init() {
+	processors.RegisterPlugin("extract_array",
+		checks.ConfigChecked(New,
+			checks.RequireFields("field", "mappings"),
+			checks.AllowedFields("field", "mappings", "ignore_missing", "overwrite_keys", "fail_on_error", "when", "omit_empty")))
+}
+
+// Unpack unpacks the processor's configuration.
+func (f *extractArrayProcessor) Unpack(from *common.Config) error {
+	tmp := defaultConfig
+	err := from.Unpack(&tmp)
+	if err != nil {
+		return fmt.Errorf("failed to unpack the extract_array configuration: %s", err)
+	}
+	f.config = tmp
+	for field, column := range f.Mappings.Flatten() {
+		colIdx, ok := common.TryToInt(column)
+		if !ok || colIdx < 0 {
+			return fmt.Errorf("bad extract_array mapping for field %s: %+v is not a positive integer", field, column)
+		}
+		f.mappings = append(f.mappings, fieldMapping{from: colIdx, to: field})
+	}
+	sort.Slice(f.mappings, func(i, j int) bool {
+		return f.mappings[i].from < f.mappings[j].from
+	})
+	return nil
+}
+
+// New builds a new extract_array processor.
+func New(c *common.Config) (processors.Processor, error) {
+	p := &extractArrayProcessor{}
+	err := c.Unpack(p)
+	if err != nil {
+		return nil, err
+	}
+	if len(p.mappings) == 0 {
+		return nil, errNoMappings
+	}
+	return p, nil
+}
+
+func isEmpty(v reflect.Value) bool {
+	switch v.Kind() {
+	case reflect.String:
+		return v.Len() == 0
+	case reflect.Slice, reflect.Map:
+		return v.IsNil() || v.Len() == 0
+	case reflect.Interface:
+		return v.IsNil() || isEmpty(v.Elem())
+	}
+	return false
+}
+
+func (f *extractArrayProcessor) Run(event *beat.Event) (*beat.Event, error) {
+	iValue, err := event.GetValue(f.config.Field)
+	if err != nil {
+		if f.config.IgnoreMissing && errors.Cause(err) == common.ErrKeyNotFound {
+			return event, nil
+		}
+		return event, errors.Wrapf(err, "could not fetch value for field %s", f.config.Field)
+	}
+
+	array := reflect.ValueOf(iValue)
+	if t := array.Type(); t.Kind() != reflect.Slice {
+		if !f.config.FailOnError {
+			return event, nil
+		}
+		return event, errors.Wrapf(err, "unsupported type for field %s: got: %s needed: array", f.config.Field, t.String())
+	}
+
+	saved := *event
+	if f.config.FailOnError {
+		saved.Fields = event.Fields.Clone()
+		saved.Meta = event.Meta.Clone()
+	}
+
+	n := array.Len()
+	for _, mapping := range f.mappings {
+		if mapping.from >= n {
+			if !f.config.FailOnError {
+				continue
+			}
+			return &saved, errors.Errorf("index %d exceeds length of %d when processing mapping for field %s", mapping.from, n, mapping.to)
+		}
+		cell := array.Index(mapping.from)
+		// checking for CanInterface() here is done to prevent .Interface() from
+		// panicking, but it can only happen when value points to a private
+		// field inside a struct.
+		if !cell.IsValid() || !cell.CanInterface() || (f.config.OmitEmpty && isEmpty(cell)) {
+			continue
+		}
+		if !f.config.OverwriteKeys {
+			if _, err = event.GetValue(mapping.to); err == nil {
+				if !f.config.FailOnError {
+					continue
+				}
+				return &saved, errors.Errorf("target field %s already has a value. Set the overwrite_keys flag or drop/rename the field first", mapping.to)
+			}
+		}
+		if _, err = event.PutValue(mapping.to, clone(cell.Interface())); err != nil {
+			if !f.config.FailOnError {
+				continue
+			}
+			return &saved, errors.Wrapf(err, "failed setting field %s", mapping.to)
+		}
+	}
+	return event, nil
+}
+
+func (f *extractArrayProcessor) String() (r string) {
+	return fmt.Sprintf("extract_array={field=%s, mappings=%v}", f.config.Field, f.mappings)
+}
+
+func clone(value interface{}) interface{} {
+	// TODO: This is dangerous but done by most processors.
+	//       Otherwise need to reflect value and deep copy lists / map types.
+	switch v := value.(type) {
+	case common.MapStr:
+		return v.Clone()
+	case map[string]interface{}:
+		return common.MapStr(v).Clone()
+	case []interface{}:
+		len := len(v)
+		newArr := make([]interface{}, len)
+		for idx, val := range v {
+			newArr[idx] = clone(val)
+		}
+		return newArr
+	}
+	return value
+}