Skip to content

Commit

Permalink
Multi string search (draios#624)
Browse files Browse the repository at this point in the history
* Whitespace diffs.

Breaking whitespace diffs into standalone commit.

* Implement "in (...)" as hash table lookups.

Currently, a filtering expression with "x in (a, b c, ...)" is
transformed into "x=a or x=b or x=c or ...". This can be slow for very
long sets of a, b, c, especially when x has to be extracted from the
event over and over.

Replace this with a set membership test using unordered_set for
PT_CHARBUF types.

In filter.cpp, for CO_IN operators check the type of the operand. If
it's PT_CHARBUF, loop over the items in the (a, b, c) set and call
add_filter_value() for each. Otherwise, the current behavior is kept.

sinsp_filter_check::add_filter_value now saves pointers to the filter
values in the unordered_set m_val_storages_members, containing pairs
of (pointer, length). Note that these are only pointers--the actual
values are still held in m_val_storages.

In sinsp_filter_check::flt_compare, for CO_IN operators simply check the
unordered_set and if a match is found return true. This used to be dead
code given how all CO_INs were replaced with a sequence of x=a or x=b or
..., but is used again.

Custom functors g_hash_membuf/g_equal_to_membuf hash and compare the
pointer-to-datas. When compiling with gcc, this simply uses gnu's
built-in hash function for a pointer and length, which is quite
fast. Otherwise, a standalone function is used.

* Take advantage of string length.

Take advantage of string length when doing set membership tests:

- When comparing strings in the hash equality function, only bother
  doing the buffer comparison when the string lengths match.

- It can be inefficient to hash a very long string when all members of
  the set are short strings. To make this case faster, keep track of
  the minumum and maximum string length across the set members and
  only bother doing the set comparison of the lengths overlap.

To effectively use this, the length needs to be filled in when
sinsp_filter_check::flt_compare() is called, so do a pass over the
existing filterchecks and actually return the actual string length when
it's easily known. In flt_compare(), if the type is a string and the
provided length is 0, do a strlen() to find it. This should be very rare
now that the length is properly passed back.
  • Loading branch information
mstemm authored and Damian Myerscough committed Mar 3, 2017
1 parent e68147c commit 3d9ea6b
Show file tree
Hide file tree
Showing 3 changed files with 252 additions and 65 deletions.
184 changes: 121 additions & 63 deletions userspace/libsinsp/filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -397,11 +397,11 @@ bool flt_compare(cmpop op, ppm_param_type type, void* operand1, void* operand2,
}
}

bool flt_compare_avg(cmpop op,
ppm_param_type type,
void* operand1,
void* operand2,
uint32_t op1_len,
bool flt_compare_avg(cmpop op,
ppm_param_type type,
void* operand1,
void* operand2,
uint32_t op1_len,
uint32_t op2_len,
uint32_t cnt1,
uint32_t cnt2)
Expand Down Expand Up @@ -514,6 +514,8 @@ sinsp_filter_check::sinsp_filter_check()
m_aggregation = A_NONE;
m_merge_aggregation = A_NONE;
m_val_storages = vector<vector<uint8_t>> (1, vector<uint8_t>(256));
m_val_storages_min_size = numeric_limits<uint32_t>::max();
m_val_storages_max_size = numeric_limits<uint32_t>::min();
}

void sinsp_filter_check::set_inspector(sinsp* inspector)
Expand Down Expand Up @@ -996,6 +998,21 @@ void sinsp_filter_check::add_filter_value(const char* str, uint32_t len, uint16_
}

parse_filter_value(str, len, filter_value_p(i), filter_value(i).size());

// XXX/mstemm this doesn't work if someone called
// add_filter_value more than once for a given index.
filter_value_member_t item(filter_value_p(i), len);
m_val_storages_members.insert(item);

if(len < m_val_storages_min_size)
{
m_val_storages_min_size = len;
}

if(len > m_val_storages_max_size)
{
m_val_storages_max_size = len;
}
}


Expand Down Expand Up @@ -1028,19 +1045,18 @@ bool sinsp_filter_check::flt_compare(cmpop op, ppm_param_type type, void* operan
{
if (op == CO_IN)
{
if (op1_len)
// For raw strings, the length may not be set. So we do a strlen to find it.
if(type == PT_CHARBUF && op1_len == 0)
{
throw sinsp_exception("filter error: cannot use 'in' operator with param type "+ to_string(type));
op1_len = strlen((char *) operand1);
}
for (uint16_t i=0; i < m_val_storages.size(); i++)

filter_value_member_t item((uint8_t *) operand1, op1_len);
if(op1_len >= m_val_storages_min_size &&
op1_len <= m_val_storages_max_size &&
m_val_storages_members.find(item) != m_val_storages_members.end())
{
if (::flt_compare(CO_EQ,
type,
operand1,
filter_value_p(i)))
{
return true;
}
return true;
}
return false;
}
Expand Down Expand Up @@ -1599,19 +1615,8 @@ void sinsp_filter_compiler::parse_check()

chk->parse_field_name((char *)&operand1[0], true);

//
// In this case we need to create '(field=value1 or field=value2 ...)'
//
if(co == CO_IN)
{
//
// Separate the 'or's from the
// rest of the conditions
//
m_filter->push_expression(op);
m_last_boolop = BO_NONE;
m_nest_level++;

//
// Skip spaces
//
Expand All @@ -1630,56 +1635,109 @@ void sinsp_filter_compiler::parse_check()
//
m_scanpos++;

//
// The first boolean operand will be BO_NONE
// Then we will start putting BO_ORs
//
op = BO_NONE;

//
// Create the 'or' sequence
//
while(true)
if(chk->get_field_info()->m_type == PT_CHARBUF)
{
// 'in' clause aware
vector<char> operand2 = next_operand(false, true);

//
// Append every sinsp_filter_check creating the 'or' sequence
// For character buffers, we can check all
// values at once by putting them in a set and
// checking for set membership.
//
sinsp_filter_check* newchk = g_filterlist.new_filter_check_from_another(chk);
newchk->m_boolop = op;
newchk->m_cmpop = CO_EQ;
newchk->add_filter_value((char *)&operand2[0], (uint32_t)operand2.size() - 1);

m_filter->add_check(newchk);
//
// Create the 'or' sequence
//
uint64_t num_values = 0;
while(true)
{
// 'in' clause aware
vector<char> operand2 = next_operand(false, true);

next();
chk->add_filter_value((char *)&operand2[0], (uint32_t)operand2.size() - 1, num_values);
num_values++;
next();

if(m_fltstr[m_scanpos] == ')')
{
break;
}
else if(m_fltstr[m_scanpos] == ',')
{
m_scanpos++;
if(m_fltstr[m_scanpos] == ')')
{
break;
}
else if(m_fltstr[m_scanpos] == ',')
{
m_scanpos++;
}
else
{
throw sinsp_exception("expected either ')' or ',' after a value inside the 'in' clause");
}
}
else
m_filter->add_check(chk);
}
else
{
//
// In this case we need to create '(field=value1 or field=value2 ...)'
//

//
// Separate the 'or's from the
// rest of the conditions
//
m_filter->push_expression(op);
m_last_boolop = BO_NONE;
m_nest_level++;

//
// The first boolean operand will be BO_NONE
// Then we will start putting BO_ORs
//
op = BO_NONE;

//
// Create the 'or' sequence
//
uint64_t num_values = 0;
while(true)
{
throw sinsp_exception("expected either ')' or ',' after a value inside the 'in' clause");
// 'in' clause aware
vector<char> operand2 = next_operand(false, true);

//
// Append every sinsp_filter_check creating the 'or' sequence
//
sinsp_filter_check* newchk = g_filterlist.new_filter_check_from_another(chk);
newchk->m_boolop = op;
newchk->m_cmpop = CO_EQ;
newchk->add_filter_value((char *)&operand2[0], (uint32_t)operand2.size() - 1, num_values);
num_values++;

m_filter->add_check(newchk);

next();

if(m_fltstr[m_scanpos] == ')')
{
break;
}
else if(m_fltstr[m_scanpos] == ',')
{
m_scanpos++;
}
else
{
throw sinsp_exception("expected either ')' or ',' after a value inside the 'in' clause");
}

//
// From now on we 'or' every newchk
//
op = BO_OR;
}

//
// From now on we 'or' every newchk
// Come back to the rest of the filter
//
op = BO_OR;
m_filter->pop_expression();
m_nest_level--;
}

//
// Come back to the rest of the filter
//
m_filter->pop_expression();
m_nest_level--;
}
else
{
Expand Down
Loading

0 comments on commit 3d9ea6b

Please sign in to comment.