Added the "big5" workload based on Elastic's comparison benchmark.

Signed-off-by: Govind Kamat <[email protected]>
sandeshkr419 · Jan 26, 2024 · 06cc8a6 · 06cc8a6
1 parent bdbd4bb
commit 06cc8a6
Show file tree

Hide file tree

Showing 47 changed files with 2,343 additions and 0 deletions.
diff --git a/big5/README.md b/big5/README.md
@@ -0,0 +1,91 @@
+## Big5 OSB Workload
+
+This repository contains the "Big5" workload for benchmarking OpenSearch using OpenSearch Benchmark. The "Big5" workload focuses on five essential areas in OpenSearch performance and querying: Text Querying, Sorting, Date Histogram, Range Queries, and Terms Aggregation.
+
+This workload is derived from the [Elasticsearch vs. OpenSearch comparison benchmark](https://github.com/elastic/elasticsearch-opensearch-benchmark) published by Elastic.  It has been slightly modified to be compliant with OpenSearch features.
+
+
+### Prerequisites
+
+Before using this repository, ensure you have the following installed:
+
+- [OpenSearch](https://opensearch.org) (v2.11 or later)
+- [OpenSearch Benchmark](https://opensearch.org/docs/latest/benchmark) (v1.2 or later)
+- An existing datastream with data generated using the open-source tool [Elastic Integration Corpus Generator Tool](https://github.com/elastic/elastic-integration-corpus-generator-tool). The data should follow the expected document structure mentioned below.
+
+
+### Data Document Structure
+
+The datastream must already exist and the data be generated using the open-source tool [Elastic Integration Corpus Generator Tool](https://github.com/elastic/elastic-integration-corpus-generator-tool). The expected structure of the documents is specified in the "Data Document Structure" section of this README.
+
+```json
+{
+    "message": "2023-04-30T21:48:56.160Z Apr 30 21:48:56 ip-66-221-134-40 journal: donkey glazer fly shark whip servant thornfalcon",
+    "process": {
+        "name": "journal"
+    },
+    "aws.cloudwatch": {
+        "ingestion_time": "2023-04-30T21:48:56.160Z",
+        "log_group": "/var/log/messages",
+        "log_stream": "luckcrafter"
+    },
+    "tags": [
+        "preserve_original_event"
+    ],
+    "meta": {
+        "file": "2023-01-02/1682891301-gotext.ndjson.gz"
+    },
+    "cloud": {
+        "region": "eu-central-1"
+    },
+    "@timestamp": "2023-01-02T22:02:34.000Z",
+    "input": {
+        "type": "aws-cloudwatch"
+    },
+    "metrics": {
+        "tmin": 849,
+        "size": 1981
+    },
+    "log.file.path": "/var/log/messages/luckcrafter",
+    "event": {
+        "id": "sunsetmark",
+        "dataset": "generic",
+        "ingested": "2023-07-20T03:36:30.223806Z"
+    },
+    "agent": {
+        "id": "c315dc22-3ea6-44dc-8d56-fd02f675367b",
+        "name": "fancydancer",
+        "ephemeral_id": "c315dc22-3ea6-44dc-8d56-fd02f675367b",
+        "type": "filebeat",
+        "version": "8.8.0"
+    }
+}
+
+```
+
+
+
+### The "Big 5" Areas
+
+1. Text Querying:
+
+   Free text search is vital for databases due to its flexibility, ease of use, and ability to quickly filter data. It allows users to input natural language queries, eliminating the need for complex query languages and making data retrieval more intuitive. With free text search, users can find information using familiar terms, such as names, email addresses, or user IDs, without requiring knowledge about the underlying schema. It is particularly useful for handling unstructured data, supporting partial matches, and facilitating data exploration.
+
+2. Sorting:
+
+   Sorting is a process of arranging data in a particular order, such as alphabetical, numerical, or chronological. The sort query in OpenSearch is useful for organizing search results based on specific criteria, ensuring that the most relevant results are presented to users. It is a vital feature that enhances the user experience and improves the overall effectiveness of the search process.
+
+   In the context of observability and security, in which signals from multiple systems are correlated, sorting is a crucial operation. By sorting results based on timestamp, metrics, or any other relevant field, analysts can more efficiently identify issues, security threats, or correlations within the data. With this information, it becomes easier to identify patterns, trends, and insights that can inform business decisions to protect your data and ensure uptime.
+
+3. Date Histogram:
+
+   The date histogram aggregation in OpenSearch is useful for aggregating and analyzing time-based data by dividing it into intervals, or buckets. This capability allows users to visualize and better understand trends, patterns, and anomalies over time.
+
+4. Range Queries:
+
+   The range query in OpenSearch is useful for filtering search results based on a specific range of values in a given field. This capability allows users to narrow down their search results and find more relevant information quickly.
+
+5. Terms Aggregation:
+
+   Terms allow users to dynamically build into buckets to source based on aggregation values. These can be 100s or 1000s of unique terms that get returned individually or in composite aggregations. Larger size values use more memory and compute to push the aggregations through.
+
diff --git a/big5/index.json b/big5/index.json
@@ -0,0 +1,162 @@
+{
+    "settings": {
+	  "index.codec": "best_compression",
+	  "index.number_of_shards": "1",
+	  "index.translog.sync_interval": "30s",
+	  "index.translog.durability": "async",
+	  "index.number_of_replicas": "1"
+      },
+      "mappings": {
+	  "properties": {
+	      "agent": {
+		  "type": "object",
+		  "properties": {
+		      "name": {
+			  "ignore_above": 1024,
+			  "type": "keyword"
+		      },
+		      "id": {
+			  "ignore_above": 1024,
+			  "type": "keyword"
+		      },
+		      "ephemeral_id": {
+			  "ignore_above": 1024,
+			  "type": "keyword"
+		      },
+		      "type": {
+			  "ignore_above": 1024,
+			  "type": "keyword"
+		      },
+		      "version": {
+			  "ignore_above": 1024,
+			  "type": "keyword"
+		      }
+		  }
+	      },
+	      "process": {
+		  "type": "object",
+		  "properties": {
+		      "name": {
+			  "ignore_above": 1024,
+			  "type": "keyword"
+		      }
+		  }
+	      },
+	      "log": {
+		  "type": "object",
+		  "properties": {
+		      "file": {
+			  "type": "object",
+			  "properties": {
+			      "path": {
+				  "ignore_above": 1024,
+				  "type": "keyword"
+			      }
+			  }
+		      }
+		  }
+	      },
+	      "message": {
+		  "type": "text"
+	      },
+	      "tags": {
+		  "ignore_above": 1024,
+		  "type": "keyword"
+	      },
+	      "cloud": {
+		  "type": "object",
+		  "properties": {
+		      "region": {
+			  "ignore_above": 1024,
+			  "type": "keyword"
+		      }
+		  }
+	      },
+	      "input": {
+		  "type": "object",
+		  "properties": {
+		      "type": {
+			  "ignore_above": 1024,
+			  "type": "keyword"
+		      }
+		  }
+	      },
+	      "@timestamp": {
+		  "type": "date"
+	      },
+	      "ecs": {
+		  "type": "object",
+		  "properties": {
+		      "version": {
+			  "ignore_above": 1024,
+			  "type": "keyword"
+		      }
+		  }
+	      },
+	      "meta": {
+		  "type": "object",
+		  "properties": {
+		      "file": {
+			  "ignore_above": 1024,
+			  "type": "keyword"
+		      }
+		  }
+	      },
+	      "host": {
+		  "type": "object"
+	      },
+	      "metrics": {
+		  "type": "object",
+		  "properties": {
+		      "size": {
+			  "type": "long"
+		      },
+		      "tmax": {
+			  "type": "long"
+		      },
+		      "tmin": {
+			  "type": "long"
+		      }
+		  }
+	      },
+	      "aws": {
+		  "type": "object",
+		  "properties": {
+		      "cloudwatch": {
+			  "type": "object",
+			  "properties": {
+			      "log_group": {
+				  "ignore_above": 1024,
+				  "type": "keyword"
+			      },
+			      "ingestion_time": {
+				  "ignore_above": 1024,
+				  "type": "keyword"
+			      },
+			      "log_stream": {
+				  "ignore_above": 1024,
+				  "type": "keyword"
+			      }
+			  }
+		      }
+		  }
+	      },
+	      "event": {
+		  "type": "object",
+		  "properties": {
+		      "ingested": {
+			  "type": "date"
+		      },
+		      "id": {
+			  "ignore_above": 1024,
+			  "type": "keyword"
+		      },
+		      "dataset": {
+			  "ignore_above": 1024,
+			  "type": "keyword"
+		      }
+		  }
+	      }
+	  }
+      }
+}