Skip to content

Commit

Permalink
enhancement(regex_parser transform): Add RegexSet support to regex (f…
Browse files Browse the repository at this point in the history
…ixes #2469)

This allows multiple regular expressions to be specified; they are
matched against the input using regex::RegexSet.

Signed-off-by: Matthias Endler <[email protected]>
  • Loading branch information
mre committed May 4, 2020
1 parent 338c83d commit 0fbb770
Show file tree
Hide file tree
Showing 14 changed files with 147 additions and 81 deletions.
8 changes: 4 additions & 4 deletions .meta/transforms/regex_parser.toml.erb
Original file line number Diff line number Diff line change
Expand Up @@ -38,17 +38,17 @@ If `target_field` is set and the log contains a field of the same name \
as the target, it will only be overwritten if this is set to `true`.\
"""

[transforms.regex_parser.options.regex]
[transforms.regex_parser.options.regexes]
type = "string"
common = true
examples = [
"""\
^(?P<timestamp>[\\w\\-:\\+]+) (?P<level>\\w+) (?P<message>.*)$\
['^(?P<timestamp>[\\w\\-:\\+]+) (?P<level>\\w+) (?P<message>.*)$']\
"""
]
required = true
description = """\
The Regular Expression to apply. Do not include the leading or trailing `/`.\
The Regular Expressions to apply. Do not include the leading or trailing `/` in any of the expressions.\
"""

[transforms.regex_parser.options.target_field]
Expand Down Expand Up @@ -85,7 +85,7 @@ And the following configuration:
[transforms.<transform-id>]
type = "regex_parser"
field = "message"
regex = '^(?P<host>[\w\.]+) - (?P<user>[\w]+) (?P<bytes_in>[\d]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$'
regexes = ['^(?P<host>[\w\.]+) - (?P<user>[\w]+) (?P<bytes_in>[\d]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$']

[transforms.<transform-id>.types]
bytes_in = "int"
Expand Down
6 changes: 3 additions & 3 deletions benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ fn benchmark_transforms(c: &mut Criterion) {
"parser",
&["in"],
transforms::regex_parser::RegexParserConfig {
regex: r"status=(?P<status>\d+)".to_string(),
regexes: vec![r"status=(?P<status>\d+)".to_string()],
field: None,
..Default::default()
},
Expand Down Expand Up @@ -410,7 +410,7 @@ fn benchmark_regex(c: &mut Criterion) {
let rt = vector::runtime::Runtime::single_threaded().unwrap();
let parser =transforms::regex_parser::RegexParserConfig {
// Many captures to stress the regex parser
regex: r#"^(?P<addr>\d+\.\d+\.\d+\.\d+) (?P<user>\S+) (?P<auth>\S+) \[(?P<date>\d+/[A-Za-z]+/\d+:\d+:\d+:\d+ [+-]\d{4})\] "(?P<method>[A-Z]+) (?P<uri>[^"]+) HTTP/\d\.\d" (?P<code>\d+) (?P<size>\d+) "(?P<referrer>[^"]+)" "(?P<browser>[^"]+)""#.into(),
regexes: vec![r#"^(?P<addr>\d+\.\d+\.\d+\.\d+) (?P<user>\S+) (?P<auth>\S+) \[(?P<date>\d+/[A-Za-z]+/\d+:\d+:\d+:\d+ [+-]\d{4})\] "(?P<method>[A-Z]+) (?P<uri>[^"]+) HTTP/\d\.\d" (?P<code>\d+) (?P<size>\d+) "(?P<referrer>[^"]+)" "(?P<browser>[^"]+)""#.into()],
field: None,
drop_failed: true,
..Default::default()
Expand Down Expand Up @@ -465,7 +465,7 @@ fn benchmark_complex(c: &mut Criterion) {
"parser",
&["in1", "in2"],
transforms::regex_parser::RegexParserConfig {
regex: r"status=(?P<status>\d+)".to_string(),
regexes: vec![r"status=(?P<status>\d+)".to_string()],
field: None,
..Default::default()
},
Expand Down
2 changes: 1 addition & 1 deletion config/examples/docs_example.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ data_dir = "/var/lib/vector"
[transforms.apache_parser]
inputs = ["apache_logs"]
type = "regex_parser" # fast/powerful regex
regex = '^(?P<host>[w.]+) - (?P<user>[w]+) (?P<bytes_in>[d]+) [(?P<timestamp>.*)] "(?P<method>[w]+) (?P<path>.*)" (?P<status>[d]+) (?P<bytes_out>[d]+)$'
regexes = ['^(?P<host>[\w\.]+) - (?P<user>[\w]+) (?P<bytes_in>[\d]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$']

# Sample the data to save on cost
[transforms.apache_sampler]
Expand Down
2 changes: 1 addition & 1 deletion config/examples/file_to_cloudwatch_metrics.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ start_at_beginning = true
[transforms.regex_parser]
inputs = ["file"]
type = "regex_parser"
regex = '^(?P<host>[\w\.]+) - (?P<user>[\w-]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$'
regexes = ['^(?P<host>[\w\.]+) - (?P<user>[\w-]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$']

# Transform into metrics
[transforms.log_to_metric]
Expand Down
2 changes: 1 addition & 1 deletion config/examples/file_to_prometheus.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ start_at_beginning = true
[transforms.regex_parser]
inputs = ["file"]
type = "regex_parser"
regex = '^(?P<host>[\w\.]+) - (?P<user>[\w-]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$'
regexes = ['^(?P<host>[\w\.]+) - (?P<user>[\w-]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$']

# Transform into metrics
[transforms.log_to_metric]
Expand Down
5 changes: 3 additions & 2 deletions config/vector.spec.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2182,11 +2182,12 @@ require('custom_module')
overwrite_target = true
overwrite_target = false

# The Regular Expression to apply. Do not include the leading or trailing `/`.
# The Regular Expressions to apply. Do not include the leading or trailing `/`
# in any of the expressions.
#
# * required
# * type: string
regex = "^(?P<timestamp>[\\w\\-:\\+]+) (?P<level>\\w+) (?P<message>.*)$"
regexes = "['^(?P<timestamp>[\\w\\-:\\+]+) (?P<level>\\w+) (?P<message>.*)$']"

# If this setting is present, the parsed fields will be inserted into the log
# as a sub-object with this name. If a field with the same name already exists,
Expand Down
6 changes: 4 additions & 2 deletions src/sources/kubernetes/message_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,11 @@ impl Transform for DockerMessageTransformer {
fn transform_cri_message() -> crate::Result<Box<dyn Transform>> {
let mut rp_config = RegexParserConfig::default();
// message field
rp_config.regex =
rp_config.regexes = vec![
r"^(?P<timestamp>.*) (?P<stream>(stdout|stderr)) (?P<multiline_tag>(P|F)) (?P<message>.*)$"
.to_owned();
.to_owned(),
];

// drop field
rp_config.types.insert(
event::log_schema().timestamp_key().clone(),
Expand Down
18 changes: 10 additions & 8 deletions src/sources/kubernetes/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ lazy_static! {

#[derive(Debug, Snafu)]
enum BuildError {
#[snafu(display("To large UID: {:?}", uid))]
#[snafu(display("Too large UID: {:?}", uid))]
UidToLarge { uid: String },
#[snafu(display("UID contains illegal characters: {:?}", uid))]
IllegalCharacterInUid { uid: String },
Expand All @@ -65,7 +65,7 @@ impl SourceConfig for KubernetesConfig {
// Kubernetes source uses 'file source' and various transforms to implement
// gathering of logs over Kubernetes CRI supported container runtimes.

// Side goal is to make kubernetes source behave as simillarly to docker source
// Side goal is to make kubernetes source behave as similarly to Docker source
// as possible to set a default behavior for all container related sources.
// This will help with interchangeability.

Expand Down Expand Up @@ -127,7 +127,7 @@ impl TimeFilter {
if let Some(Value::Timestamp(ts)) = event.as_log().get(&event::log_schema().timestamp_key())
{
if ts < &self.start {
trace!(message = "Recieved older log.", from = %ts.to_rfc3339());
trace!(message = "Received older log.", from = %ts.to_rfc3339());
return None;
}
}
Expand All @@ -152,9 +152,11 @@ fn transform_file() -> crate::Result<Box<dyn Transform>> {

config.field = Some("file".into());

config.regex = r"^".to_owned()
+ LOG_DIRECTORY
+ r"(?P<pod_uid>[^/]*)/(?P<container_name>[^/]*)/[0-9]*[.]log$";
config.regexes = vec![
r"^".to_owned()
+ LOG_DIRECTORY
+ r"(?P<pod_uid>[^/]*)/(?P<container_name>[^/]*)/[0-9]*[.]log$",
];

// this field is implementation dependent so remove it
config.drop_field = true;
Expand All @@ -172,7 +174,7 @@ fn transform_file() -> crate::Result<Box<dyn Transform>> {

/// Contains several regexes that can parse common forms of pod_uid.
/// On the first message, regexes are tried out one after the other until
/// first succesfull one has been found. After that that regex will be
/// the first successful one has been found. After that, that regex will
/// always be used.
///
/// If nothing succeeds the message is still passed.
Expand Down Expand Up @@ -208,7 +210,7 @@ fn transform_pod_uid() -> crate::Result<ApplicableTransform> {
let mut config = RegexParserConfig::default();

config.field = Some("pod_uid".into());
config.regex = regex;
config.regexes = vec![regex];
// Remove pod_uid as it isn't usable anywhere else.
config.drop_field = true;
config.drop_failed = true;
Expand Down
Loading

0 comments on commit 0fbb770

Please sign in to comment.