From 3764a97be1dd59946e23d086379eda6f8660f094 Mon Sep 17 00:00:00 2001 From: aditigupta17 Date: Wed, 22 Jul 2020 20:07:48 +0530 Subject: [PATCH 1/7] Change --runstats flag to string --- src/progly/query.cc | 2 +- src/progly/query.h | 2 +- src/progly/run-query.cc | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/progly/query.cc b/src/progly/query.cc index c4debcd229..017ea60412 100644 --- a/src/progly/query.cc +++ b/src/progly/query.cc @@ -87,7 +87,7 @@ std::string qop_file_name; std::string qop_tree_name; // other exec flags -bool runstats; +string runstats; std::string project_cols; // prints full record header and metadata diff --git a/src/progly/query.h b/src/progly/query.h index c4f340efd9..b6b5ccb0e1 100644 --- a/src/progly/query.h +++ b/src/progly/query.h @@ -118,7 +118,7 @@ extern std::string qop_file_name; extern std::string qop_tree_name; // other exec flags -extern bool runstats; +extern std::string runstats; extern std::string project_cols; // for debugging, prints full record header and metadata diff --git a/src/progly/run-query.cc b/src/progly/run-query.cc index 304e288cfa..6538e56690 100644 --- a/src/progly/run-query.cc +++ b/src/progly/run-query.cc @@ -157,7 +157,7 @@ int main(int argc, char **argv) ("index-delims", po::value(&text_index_delims)->default_value(""), "Use delim for text indexes (def=whitespace") ("index-ignore-stopwords", po::bool_switch(&text_index_ignore_stopwords)->default_value(false), "Ignore stopwords when building text index. (def=false)") ("index-plan-type", po::value(&index_plan_type)->default_value(Tables::SIP_IDX_STANDARD), "If 2 indexes, for intersection plan use '2', for union plan use '3' (def='1')") - ("runstats", po::bool_switch(&runstats)->default_value(false), "Run statistics on the specified table name") + ("runstats", po::value(&runstats)->default_value(""), "Run statistics on the specified table name") ("transform-format-type", po::value(&trans_format_str)->default_value("SFT_FLATBUF_FLEX_ROW"), "Destination format type ") ("verbose", po::bool_switch(&print_verbose)->default_value(false), "Print detailed record metadata.") ("header", po::bool_switch(&header)->default_value(false), "Print row header (i.e., row schema") @@ -357,7 +357,7 @@ int main(int argc, char **argv) assert (!index_cols.empty()); assert (use_cls); } - if (runstats) { + if (runstats != "") { assert (use_cls); } @@ -845,7 +845,7 @@ int main(int argc, char **argv) // for RUNSTATS job // launch run statistics on given table here. - if (query == "flatbuf" && runstats) { + if (query == "flatbuf" && runstats != "") { // create idx_op for workers stats_op op(qop_db_schema_name, qop_table_name, qop_data_schema); From 2f519f9b51d59ffa5996020194330fb40ea1d039 Mon Sep 17 00:00:00 2001 From: aditigupta17 Date: Wed, 22 Jul 2020 21:15:15 +0530 Subject: [PATCH 2/7] Change stats_op structure to accommodate parameters --- src/cls/tabular/cls_tabular.cc | 6 ++---- src/cls/tabular/cls_tabular.h | 16 ++++++---------- src/progly/query.cc | 3 ++- src/progly/query.h | 1 + src/progly/run-query.cc | 10 ++++++---- 5 files changed, 17 insertions(+), 19 deletions(-) diff --git a/src/cls/tabular/cls_tabular.cc b/src/cls/tabular/cls_tabular.cc index fe4d682f86..d457e87d5c 100644 --- a/src/cls/tabular/cls_tabular.cc +++ b/src/cls/tabular/cls_tabular.cc @@ -1933,13 +1933,11 @@ int exec_runstats_op(cls_method_context_t hctx, bufferlist *in, bufferlist *out) return -EINVAL; } - CLS_LOG(20, "exec_runstats_op: db_schema=%s", op.db_schema.c_str()); - CLS_LOG(20, "exec_runstats_op: table_name=%s", op.table_name.c_str()); + CLS_LOG(20, "exec_runstats_op: runstats_args=%s", op.runstats_args.c_str()); CLS_LOG(20, "exec_runstats_op: data_schema=%s", op.data_schema.c_str()); using namespace Tables; - std::string dbschema = op.db_schema; - std::string table_name = op.table_name; + std::string runstats_args = op.runstats_args; schema_vec data_schema = schemaFromString(op.data_schema); return 0; diff --git a/src/cls/tabular/cls_tabular.h b/src/cls/tabular/cls_tabular.h index ecfe3cd450..e06238278f 100644 --- a/src/cls/tabular/cls_tabular.h +++ b/src/cls/tabular/cls_tabular.h @@ -305,19 +305,17 @@ WRITE_CLASS_ENCODER(test_op) struct stats_op { - std::string db_schema; - std::string table_name; + std::string runstats_args; std::string data_schema; stats_op() {} - stats_op(std::string dbscma, std::string tname, std::string dtscma) : - db_schema(dbscma), table_name(tname), data_schema(dtscma) { } + stats_op(std::string runstats_args, std::string dtscma) : + runstats_args(runstats_args), data_schema(dtscma) { } // serialize the fields into bufferlist to be sent over the wire void encode(bufferlist& bl) const { ENCODE_START(1, 1, bl); - ::encode(db_schema, bl); - ::encode(table_name, bl); + ::encode(runstats_args, bl); ::encode(data_schema, bl); ENCODE_FINISH(bl); } @@ -325,8 +323,7 @@ struct stats_op { // deserialize the fields from the bufferlist into this struct void decode(bufferlist::iterator& bl) { DECODE_START(1, bl); - ::decode(db_schema, bl); - ::decode(table_name, bl); + ::decode(runstats_args, bl); ::decode(data_schema, bl); DECODE_FINISH(bl); } @@ -334,8 +331,7 @@ struct stats_op { std::string toString() { std::string s; s.append("stats_op:"); - s.append(" .db_schema=" + db_schema); - s.append(" .table_name=" + table_name); + s.append(" .runstats_args=" + runstats_args); s.append(" .data_schema=" + data_schema); return s; } diff --git a/src/progly/query.cc b/src/progly/query.cc index 017ea60412..4362f90b0d 100644 --- a/src/progly/query.cc +++ b/src/progly/query.cc @@ -65,6 +65,7 @@ std::string qop_groupby_cols; std::string qop_orderby_cols; std::string qop_index_preds; std::string qop_index2_preds; +std::string qop_runstats_args; // build index op params for flatbufs bool idx_op_idx_unique; @@ -404,7 +405,7 @@ void worker_exec_runstats_op(librados::IoCtx *ioctx, stats_op op) } std::string oid = target_objects.back(); target_objects.pop_back(); - std::cout << "computing stats...table: " << op.table_name << " oid: " + std::cout << "computing stats...table: " << op.runstats_args << " oid: " << oid << std::endl; work_lock.unlock(); diff --git a/src/progly/query.h b/src/progly/query.h index b6b5ccb0e1..68d68e359d 100644 --- a/src/progly/query.h +++ b/src/progly/query.h @@ -97,6 +97,7 @@ extern std::string qop_groupby_cols; extern std::string qop_orderby_cols; extern std::string qop_index_preds; extern std::string qop_index2_preds; +extern std::string qop_runstats_args; extern bool idx_op_idx_unique; extern bool idx_op_ignore_stopwords; diff --git a/src/progly/run-query.cc b/src/progly/run-query.cc index 6538e56690..b7763b1c54 100644 --- a/src/progly/run-query.cc +++ b/src/progly/run-query.cc @@ -52,6 +52,7 @@ int main(int argc, char **argv) std::string index2_preds; std::string index_cols; std::string index2_cols; + std::string runstats_args; bool lock_obj_free; bool lock_obj_init; bool lock_obj_get; @@ -157,7 +158,7 @@ int main(int argc, char **argv) ("index-delims", po::value(&text_index_delims)->default_value(""), "Use delim for text indexes (def=whitespace") ("index-ignore-stopwords", po::bool_switch(&text_index_ignore_stopwords)->default_value(false), "Ignore stopwords when building text index. (def=false)") ("index-plan-type", po::value(&index_plan_type)->default_value(Tables::SIP_IDX_STANDARD), "If 2 indexes, for intersection plan use '2', for union plan use '3' (def='1')") - ("runstats", po::value(&runstats)->default_value(""), "Run statistics on the specified table name") + ("runstats", po::value(&runstats_args)->default_value(""), "Run statistics on the specified table name") ("transform-format-type", po::value(&trans_format_str)->default_value("SFT_FLATBUF_FLEX_ROW"), "Destination format type ") ("verbose", po::bool_switch(&print_verbose)->default_value(false), "Print detailed record metadata.") ("header", po::bool_switch(&header)->default_value(false), "Print row header (i.e., row schema") @@ -357,7 +358,7 @@ int main(int argc, char **argv) assert (!index_cols.empty()); assert (use_cls); } - if (runstats != "") { + if (runstats_args != "") { assert (use_cls); } @@ -845,10 +846,11 @@ int main(int argc, char **argv) // for RUNSTATS job // launch run statistics on given table here. - if (query == "flatbuf" && runstats != "") { + if (query == "flatbuf" && runstats_args != "") { // create idx_op for workers - stats_op op(qop_db_schema_name, qop_table_name, qop_data_schema); + qop_runstats_args = runstats_args; + stats_op op(qop_runstats_args, qop_data_schema); if (debug) cout << "DEBUG: stats op=" << op.toString() << endl; From 9116c1d792008f151641bdf2b0fae74ae444879a Mon Sep 17 00:00:00 2001 From: aditigupta17 Date: Thu, 23 Jul 2020 22:12:29 +0530 Subject: [PATCH 3/7] Validate number of arguments for --runstats flag --- src/progly/query.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/progly/query.cc b/src/progly/query.cc index 4362f90b0d..c314ae9809 100644 --- a/src/progly/query.cc +++ b/src/progly/query.cc @@ -407,6 +407,15 @@ void worker_exec_runstats_op(librados::IoCtx *ioctx, stats_op op) target_objects.pop_back(); std::cout << "computing stats...table: " << op.runstats_args << " oid: " << oid << std::endl; + // Validating input + // Step-1 : Check if runstats_args has 5 arguments (col, min, max, bucket, sampling) + std::string arguments = op.runstats_args; + boost::trim(arguments); + vector args; + boost::split(args, arguments, boost::is_any_of(","), + boost::token_compress_on); + assert(args.size() == 5); + work_lock.unlock(); ceph::bufferlist inbl, outbl; From 26d2369e3d742585fb84bab87529e5386a832c67 Mon Sep 17 00:00:00 2001 From: aditigupta17 Date: Thu, 23 Jul 2020 22:59:55 +0530 Subject: [PATCH 4/7] Validate column in --runstats argument --- src/progly/query.cc | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/progly/query.cc b/src/progly/query.cc index c314ae9809..fc47b1aee2 100644 --- a/src/progly/query.cc +++ b/src/progly/query.cc @@ -407,6 +407,7 @@ void worker_exec_runstats_op(librados::IoCtx *ioctx, stats_op op) target_objects.pop_back(); std::cout << "computing stats...table: " << op.runstats_args << " oid: " << oid << std::endl; + // Validating input // Step-1 : Check if runstats_args has 5 arguments (col, min, max, bucket, sampling) std::string arguments = op.runstats_args; @@ -416,6 +417,25 @@ void worker_exec_runstats_op(librados::IoCtx *ioctx, stats_op op) boost::token_compress_on); assert(args.size() == 5); + // Step-2 : Check if the passed column is valid + std::string col = args[0]; + std::string data_schema = op.data_schema; + + boost::trim(col); + boost::trim(data_schema); + + boost::to_upper(col); + + assert (!data_schema.empty()); + + Tables::schema_vec table_schema = Tables::schemaFromString(data_schema); + Tables::schema_vec sv = schemaFromColNames(table_schema, col); + if (sv.empty()) { + cerr << "Error: colname=" << col << " not present in schema." + << std::endl; + assert (Tables::TablesErrCodes::RequestedColNotPresent == 0); + } + work_lock.unlock(); ceph::bufferlist inbl, outbl; From 2c4cc211c49d2cdd46862d1ffb875f6c7db1545b Mon Sep 17 00:00:00 2001 From: aditigupta17 Date: Fri, 24 Jul 2020 00:37:35 +0530 Subject: [PATCH 5/7] Validate sampling parameter in --runstats argument --- src/cls/tabular/cls_tabular_utils.h | 17 +++++++++++++++++ src/progly/query.cc | 12 ++++++++++++ 2 files changed, 29 insertions(+) diff --git a/src/cls/tabular/cls_tabular_utils.h b/src/cls/tabular/cls_tabular_utils.h index 6ce934b7e7..666fae7226 100644 --- a/src/cls/tabular/cls_tabular_utils.h +++ b/src/cls/tabular/cls_tabular_utils.h @@ -300,6 +300,23 @@ static inline int strtou64(const std::string value, uint64_t *out) return 0; } +/* + * Convert string into float value. + */ +static inline int strtofloat(const std::string value, float *out) +{ + float v; + + try { + v = boost::lexical_cast(value); + } catch (boost::bad_lexical_cast &) { + return -EIO; + } + + *out = v; + return 0; +} + // contains the value of a predicate to be applied template class PredicateValue diff --git a/src/progly/query.cc b/src/progly/query.cc index fc47b1aee2..5184907792 100644 --- a/src/progly/query.cc +++ b/src/progly/query.cc @@ -436,6 +436,18 @@ void worker_exec_runstats_op(librados::IoCtx *ioctx, stats_op op) assert (Tables::TablesErrCodes::RequestedColNotPresent == 0); } + // Step-3: Validate sampling argument: should be a float >= 0 && <= 1 + float sampling; + int err = Tables::strtofloat(args[4], &sampling); + if (err != 0) { + cerr << "Error: Invalid sampling data type." << std::endl; + exit(1); + } + if (!(sampling >= 0 && sampling <= 1)) { + cerr << "Error: Invalid sampling value = " << sampling << " Should lie in range [0, 1]." << std::endl; + exit(1); + } + work_lock.unlock(); ceph::bufferlist inbl, outbl; From 35c849274931b757914f2a4675d68b2faefde055 Mon Sep 17 00:00:00 2001 From: aditigupta17 Date: Fri, 24 Jul 2020 01:24:13 +0530 Subject: [PATCH 6/7] Check data type of number of buckets in --runstats argument --- src/cls/tabular/cls_tabular_utils.h | 1 - src/progly/query.cc | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/cls/tabular/cls_tabular_utils.h b/src/cls/tabular/cls_tabular_utils.h index 666fae7226..b20ddff7ed 100644 --- a/src/cls/tabular/cls_tabular_utils.h +++ b/src/cls/tabular/cls_tabular_utils.h @@ -292,7 +292,6 @@ static inline int strtou64(const std::string value, uint64_t *out) try { v = boost::lexical_cast(value); } catch (boost::bad_lexical_cast &) { - CLS_ERR("converting key into numeric value %s", value.c_str()); return -EIO; } diff --git a/src/progly/query.cc b/src/progly/query.cc index 5184907792..04ba79c589 100644 --- a/src/progly/query.cc +++ b/src/progly/query.cc @@ -448,6 +448,14 @@ void worker_exec_runstats_op(librados::IoCtx *ioctx, stats_op op) exit(1); } + // Step-4 : Check number of buckets: should be int + uint64_t number_of_buckets; + err = Tables::strtou64(args[3], &number_of_buckets); + if (err != 0) { + cerr << "Error: Invalid number_of_buckets data type." << std::endl; + exit(1); + } + work_lock.unlock(); ceph::bufferlist inbl, outbl; From 5afb833a12957ef3b5485e5c45dccab11cec7cd6 Mon Sep 17 00:00:00 2001 From: aditigupta17 Date: Fri, 24 Jul 2020 22:07:27 +0530 Subject: [PATCH 7/7] Pack --runstats arguments together --- src/cls/tabular/cls_tabular_utils.h | 50 +++++++++++++++ src/progly/query.cc | 96 ++++++++++++++++++++++++++++- 2 files changed, 143 insertions(+), 3 deletions(-) diff --git a/src/cls/tabular/cls_tabular_utils.h b/src/cls/tabular/cls_tabular_utils.h index b20ddff7ed..53b8392d59 100644 --- a/src/cls/tabular/cls_tabular_utils.h +++ b/src/cls/tabular/cls_tabular_utils.h @@ -506,6 +506,56 @@ class TypedPredicate : public PredicateBase } }; +template +class LimitValue +{ +public: + T val; + LimitValue(T v) : val(v) {}; + LimitValue(const LimitValue& rhs); + LimitValue& operator=(const LimitValue& rhs); +}; + +template +class StatsArgument +{ +private: + const std::string col_name; + LimitValue min; + LimitValue max; + const uint64_t buckets; + const float sampling; + +public: + StatsArgument(std::string col, const T& min_val, const T& max_val, uint64_t b, float s) : + col_name(col), min(min_val), max(max_val), buckets(b), sampling(s) {} + + ~StatsArgument() { } + StatsArgument& getThis() {return *this;} + const StatsArgument& getThis() const {return *this;} + T MinVal() {return min.val;} + T MaxVal() {return max.val;} + + std::string toString() { + std::string s("StatsArgument:\n"); + s.append(" col_name=" + col_name + "\n"); + s.append(" min="); + std::stringstream ss_min; + ss_min << this->MinVal(); + s.append(ss_min.str()); + s.append("\n"); + s.append(" max="); + std::stringstream ss_max; + ss_max << this->MaxVal(); + s.append(ss_max.str()); + s.append("\n"); + s.append(" buckets=" + std::to_string(buckets) + "\n"); + s.append(" sampling=" + std::to_string(sampling) + "\n"); + s.append("\n"); + return s; + } +}; + // col metadata used for the schema const int NUM_COL_INFO_FIELDS = 5; struct col_info { diff --git a/src/progly/query.cc b/src/progly/query.cc index 04ba79c589..f9fb30c653 100644 --- a/src/progly/query.cc +++ b/src/progly/query.cc @@ -436,15 +436,15 @@ void worker_exec_runstats_op(librados::IoCtx *ioctx, stats_op op) assert (Tables::TablesErrCodes::RequestedColNotPresent == 0); } - // Step-3: Validate sampling argument: should be a float >= 0 && <= 1 + // Step-3: Validate sampling argument: should be a float > 0 && <= 1 float sampling; int err = Tables::strtofloat(args[4], &sampling); if (err != 0) { cerr << "Error: Invalid sampling data type." << std::endl; exit(1); } - if (!(sampling >= 0 && sampling <= 1)) { - cerr << "Error: Invalid sampling value = " << sampling << " Should lie in range [0, 1]." << std::endl; + if (!(sampling > 0 && sampling <= 1)) { + cerr << "Error: Invalid sampling value = " << sampling << " Should lie in range (0, 1]." << std::endl; exit(1); } @@ -456,6 +456,96 @@ void worker_exec_runstats_op(librados::IoCtx *ioctx, stats_op op) exit(1); } + // Step-5: Get column data type, pack all arguments together and pass to function + Tables::col_info column = sv[0]; + std::string min = args[1]; + std::string max = args[2]; + switch (column.type) { + case Tables::SkyDataType::SDT_INT8: { + Tables::StatsArgument* s = \ + new Tables::StatsArgument \ + (col, static_cast(std::stol(min)), static_cast(std::stol(max)), + number_of_buckets, sampling); + std::cout << s->toString() << "\n"; + break; + } + case Tables::SkyDataType::SDT_INT16: { + Tables::StatsArgument* s = \ + new Tables::StatsArgument \ + (col, static_cast(std::stol(min)), static_cast(std::stol(max)), + number_of_buckets, sampling); + std::cout << s->toString() << "\n"; + break; + } + case Tables::SkyDataType::SDT_INT32: { + Tables::StatsArgument* s = \ + new Tables::StatsArgument \ + (col, static_cast(std::stol(min)), static_cast(std::stol(max)), + number_of_buckets, sampling); + std::cout << s->toString() << "\n"; + break; + } + case Tables::SkyDataType::SDT_INT64: { + Tables::StatsArgument* s = \ + new Tables::StatsArgument \ + (col, static_cast(std::stol(min)), static_cast(std::stol(max)), + number_of_buckets, sampling); + std::cout << s->toString() << "\n"; + break; + } + case Tables::SkyDataType::SDT_UINT8: { + Tables::StatsArgument* s = \ + new Tables::StatsArgument \ + (col, static_cast(std::stol(min)), static_cast(std::stol(max)), + number_of_buckets, sampling); + std::cout << s->toString() << "\n"; + break; + } + case Tables::SkyDataType::SDT_UINT16: { + Tables::StatsArgument* s = \ + new Tables::StatsArgument \ + (col, static_cast(std::stol(min)), static_cast(std::stol(max)), + number_of_buckets, sampling); + std::cout << s->toString() << "\n"; + break; + } + case Tables::SkyDataType::SDT_UINT32: { + Tables::StatsArgument* s = \ + new Tables::StatsArgument \ + (col, static_cast(std::stol(min)), static_cast(std::stol(max)), + number_of_buckets, sampling); + std::cout << s->toString() << "\n"; + break; + } + case Tables::SkyDataType::SDT_UINT64: { + Tables::StatsArgument* s = \ + new Tables::StatsArgument \ + (col, static_cast(std::stol(min)), static_cast(std::stol(max)), + number_of_buckets, sampling); + std::cout << s->toString() << "\n"; + break; + } + case Tables::SkyDataType::SDT_FLOAT: { + Tables::StatsArgument* s = \ + new Tables::StatsArgument \ + (col, static_cast(std::stol(min)), static_cast(std::stol(max)), + number_of_buckets, sampling); + std::cout << s->toString() << "\n"; + break; + } + case Tables::SkyDataType::SDT_DOUBLE: { + Tables::StatsArgument* s = \ + new Tables::StatsArgument \ + (col, static_cast(std::stol(min)), static_cast(std::stol(max)), + number_of_buckets, sampling); + std::cout << s->toString() << "\n"; + break; + } + default: assert (Tables::TablesErrCodes::UnknownSkyDataType == 0); + } + + // bucket_min | bucket_max | count | dead_count | star_representation + work_lock.unlock(); ceph::bufferlist inbl, outbl;