enhancement(regex_parser transform): Add RegexSet support to regex (fixes #2469)

This allows multiple regular expressions to be specified; they are matched against the input using regex::RegexSet. Signed-off-by: Matthias Endler <[email protected]>
vectordotdev · May 4, 2020 · 0fbb770 · 0fbb770
1 parent 338c83d
commit 0fbb770
Show file tree

Hide file tree

Showing 14 changed files with 147 additions and 81 deletions.
diff --git a/.meta/transforms/regex_parser.toml.erb b/.meta/transforms/regex_parser.toml.erb
@@ -38,17 +38,17 @@ If `target_field` is set and the log contains a field of the same name \
 as the target, it will only be overwritten if this is set to `true`.\
 """
 
-[transforms.regex_parser.options.regex]
+[transforms.regex_parser.options.regexes]
 type = "string"
 common = true
 examples = [
 """\
-^(?P<timestamp>[\\w\\-:\\+]+) (?P<level>\\w+) (?P<message>.*)$\
+['^(?P<timestamp>[\\w\\-:\\+]+) (?P<level>\\w+) (?P<message>.*)$']\
 """
 ]
 required = true
 description = """\
-The Regular Expression to apply. Do not include the leading or trailing `/`.\
+The Regular Expressions to apply. Do not include the leading or trailing `/` in any of the expressions.\
 """
 
 [transforms.regex_parser.options.target_field]
@@ -85,7 +85,7 @@ And the following configuration:
 [transforms.<transform-id>]
   type = "regex_parser"
   field = "message"
-  regex = '^(?P<host>[\w\.]+) - (?P<user>[\w]+) (?P<bytes_in>[\d]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$'
+  regexes = ['^(?P<host>[\w\.]+) - (?P<user>[\w]+) (?P<bytes_in>[\d]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$']
 
 [transforms.<transform-id>.types]
   bytes_in = "int"

diff --git a/benches/bench.rs b/benches/bench.rs
@@ -346,7 +346,7 @@ fn benchmark_transforms(c: &mut Criterion) {
                         "parser",
                         &["in"],
                         transforms::regex_parser::RegexParserConfig {
-                            regex: r"status=(?P<status>\d+)".to_string(),
+                            regexes: vec![r"status=(?P<status>\d+)".to_string()],
                             field: None,
                             ..Default::default()
                         },
@@ -410,7 +410,7 @@ fn benchmark_regex(c: &mut Criterion) {
                     let rt = vector::runtime::Runtime::single_threaded().unwrap();
                     let parser =transforms::regex_parser::RegexParserConfig {
                         // Many captures to stress the regex parser
-                        regex: r#"^(?P<addr>\d+\.\d+\.\d+\.\d+) (?P<user>\S+) (?P<auth>\S+) \[(?P<date>\d+/[A-Za-z]+/\d+:\d+:\d+:\d+ [+-]\d{4})\] "(?P<method>[A-Z]+) (?P<uri>[^"]+) HTTP/\d\.\d" (?P<code>\d+) (?P<size>\d+) "(?P<referrer>[^"]+)" "(?P<browser>[^"]+)""#.into(),
+                        regexes: vec![r#"^(?P<addr>\d+\.\d+\.\d+\.\d+) (?P<user>\S+) (?P<auth>\S+) \[(?P<date>\d+/[A-Za-z]+/\d+:\d+:\d+:\d+ [+-]\d{4})\] "(?P<method>[A-Z]+) (?P<uri>[^"]+) HTTP/\d\.\d" (?P<code>\d+) (?P<size>\d+) "(?P<referrer>[^"]+)" "(?P<browser>[^"]+)""#.into()],
                         field: None,
                         drop_failed: true,
                         ..Default::default()
@@ -465,7 +465,7 @@ fn benchmark_complex(c: &mut Criterion) {
                         "parser",
                         &["in1", "in2"],
                         transforms::regex_parser::RegexParserConfig {
-                            regex: r"status=(?P<status>\d+)".to_string(),
+                            regexes: vec![r"status=(?P<status>\d+)".to_string()],
                             field: None,
                             ..Default::default()
                         },

diff --git a/config/examples/docs_example.toml b/config/examples/docs_example.toml
@@ -11,7 +11,7 @@ data_dir = "/var/lib/vector"
 [transforms.apache_parser]
   inputs       = ["apache_logs"]
   type         = "regex_parser"                # fast/powerful regex
-  regex        = '^(?P<host>[w.]+) - (?P<user>[w]+) (?P<bytes_in>[d]+) [(?P<timestamp>.*)] "(?P<method>[w]+) (?P<path>.*)" (?P<status>[d]+) (?P<bytes_out>[d]+)$'
+  regexes      = ['^(?P<host>[\w\.]+) - (?P<user>[\w]+) (?P<bytes_in>[\d]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$']
 
 # Sample the data to save on cost
 [transforms.apache_sampler]

diff --git a/config/examples/file_to_cloudwatch_metrics.toml b/config/examples/file_to_cloudwatch_metrics.toml
@@ -14,7 +14,7 @@ start_at_beginning = true
 [transforms.regex_parser]
 inputs = ["file"]
 type = "regex_parser"
-regex = '^(?P<host>[\w\.]+) - (?P<user>[\w-]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$'
+regexes = ['^(?P<host>[\w\.]+) - (?P<user>[\w-]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$']
 
 # Transform into metrics
 [transforms.log_to_metric]

diff --git a/config/examples/file_to_prometheus.toml b/config/examples/file_to_prometheus.toml
@@ -14,7 +14,7 @@ start_at_beginning = true
 [transforms.regex_parser]
 inputs = ["file"]
 type = "regex_parser"
-regex = '^(?P<host>[\w\.]+) - (?P<user>[\w-]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$'
+regexes = ['^(?P<host>[\w\.]+) - (?P<user>[\w-]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$']
 
 # Transform into metrics
 [transforms.log_to_metric]

diff --git a/config/vector.spec.toml b/config/vector.spec.toml
@@ -2182,11 +2182,12 @@ require('custom_module')
   overwrite_target = true
   overwrite_target = false
 
-  # The Regular Expression to apply. Do not include the leading or trailing `/`.
+  # The Regular Expressions to apply. Do not include the leading or trailing `/`
+  # in any of the expressions.
   #
   # * required
   # * type: string
-  regex = "^(?P<timestamp>[\\w\\-:\\+]+) (?P<level>\\w+) (?P<message>.*)$"
+  regexes = "['^(?P<timestamp>[\\w\\-:\\+]+) (?P<level>\\w+) (?P<message>.*)$']"
 
   # If this setting is present, the parsed fields will be inserted into the log
   # as a sub-object with this name. If a field with the same name already exists,

diff --git a/src/sources/kubernetes/message_parser.rs b/src/sources/kubernetes/message_parser.rs
@@ -89,9 +89,11 @@ impl Transform for DockerMessageTransformer {
 fn transform_cri_message() -> crate::Result<Box<dyn Transform>> {
     let mut rp_config = RegexParserConfig::default();
     // message field
-    rp_config.regex =
+    rp_config.regexes = vec![
         r"^(?P<timestamp>.*) (?P<stream>(stdout|stderr)) (?P<multiline_tag>(P|F)) (?P<message>.*)$"
-            .to_owned();
+            .to_owned(),
+    ];
+
     // drop field
     rp_config.types.insert(
         event::log_schema().timestamp_key().clone(),

diff --git a/src/sources/kubernetes/mod.rs b/src/sources/kubernetes/mod.rs
@@ -39,7 +39,7 @@ lazy_static! {
 
 #[derive(Debug, Snafu)]
 enum BuildError {
-    #[snafu(display("To large UID: {:?}", uid))]
+    #[snafu(display("Too large UID: {:?}", uid))]
     UidToLarge { uid: String },
     #[snafu(display("UID contains illegal characters: {:?}", uid))]
     IllegalCharacterInUid { uid: String },
@@ -65,7 +65,7 @@ impl SourceConfig for KubernetesConfig {
         // Kubernetes source uses 'file source' and various transforms to implement
         // gathering of logs over Kubernetes CRI supported container runtimes.
 
-        // Side goal is to make kubernetes source behave as simillarly to docker source
+        // Side goal is to make kubernetes source behave as similarly to Docker source
         // as possible to set a default behavior for all container related sources.
         // This will help with interchangeability.
 
@@ -127,7 +127,7 @@ impl TimeFilter {
         if let Some(Value::Timestamp(ts)) = event.as_log().get(&event::log_schema().timestamp_key())
         {
             if ts < &self.start {
-                trace!(message = "Recieved older log.", from = %ts.to_rfc3339());
+                trace!(message = "Received older log.", from = %ts.to_rfc3339());
                 return None;
             }
         }
@@ -152,9 +152,11 @@ fn transform_file() -> crate::Result<Box<dyn Transform>> {
 
     config.field = Some("file".into());
 
-    config.regex = r"^".to_owned()
-        + LOG_DIRECTORY
-        + r"(?P<pod_uid>[^/]*)/(?P<container_name>[^/]*)/[0-9]*[.]log$";
+    config.regexes = vec![
+        r"^".to_owned()
+            + LOG_DIRECTORY
+            + r"(?P<pod_uid>[^/]*)/(?P<container_name>[^/]*)/[0-9]*[.]log$",
+    ];
 
     // this field is implementation dependent so remove it
     config.drop_field = true;
@@ -172,7 +174,7 @@ fn transform_file() -> crate::Result<Box<dyn Transform>> {
 
 /// Contains several regexes that can parse common forms of pod_uid.
 /// On the first message, regexes are tried out one after the other until
-/// first succesfull one has been found. After that that regex will be
+/// the first successful one has been found. After that, that regex will be
 /// always used.
 ///
 /// If nothing succeeds the message is still passed.
@@ -208,7 +210,7 @@ fn transform_pod_uid() -> crate::Result<ApplicableTransform> {
         let mut config = RegexParserConfig::default();
 
         config.field = Some("pod_uid".into());
-        config.regex = regex;
+        config.regexes = vec![regex];
         // Remove pod_uid as it isn't usable anywhere else.
         config.drop_field = true;
         config.drop_failed = true;