Skip to content

Commit

Permalink
utils: accelerate byte search in big file handles (#4279)
Browse files Browse the repository at this point in the history
The previous implementation was not performant, for multiple reasons:
- it searched for lines, which means it forced the internal
implementation to search for '\n' in the bytes.
- as mentioned in the old comment, it was actually not memory stable,
because the length of the line could be huge for big files, and thus
didn't really prevent OOM issues.
- it was not exactly correct, because it could never find patterns
containing '\n' (a match spanning two lines was split across iterations).

On huge files (such as Chrome binaries, which is why I started
investigating this), the new implementation improves the performance of
this function by a factor of 5, tremendously reducing the amount of time
spent here.

---------

Co-authored-by: jonathanmetzman <[email protected]>
  • Loading branch information
paulsemel and jonathanmetzman authored Sep 30, 2024
1 parent b284e12 commit 49a4437
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 7 deletions.
41 changes: 34 additions & 7 deletions src/clusterfuzz/_internal/base/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@
LOCAL_SOURCE_MANIFEST = os.path.join('src', 'appengine', 'resources',
'clusterfuzz-source.manifest')

# The length we're reading from the file-object in search_bytes_in_file.
DEFAULT_SEARCH_BUFFER_LENGTH = 10 * 1024 * 1024 # 10MB


def utcnow():
"""Return datetime.datetime.utcnow(). We need this method because we can't
Expand Down Expand Up @@ -673,16 +676,40 @@ def restart_machine():
os.system('sudo shutdown -r now')


def search_bytes_in_file(search_bytes, file_handle):
"""Helper to search for bytes in a large binary file without memory
issues.
def search_bytes_in_file(search_bytes, file_handle) -> bool:
"""Searches `search_bytes` in the file represented by the file-object
`file_handle`.
Args:
search_bytes: the bytes to search for in the file.
file_handle: the file-object handle.
Returns:
Whether the pattern was found in the file.
"""
# TODO(aarya): This is too brittle and will fail if we have a very large
# line.
for line in file_handle:
if search_bytes in line:
# Invariant: `buf[:offset]` contains valid bytes.
buf = bytearray(DEFAULT_SEARCH_BUFFER_LENGTH)
mv = memoryview(buf)
offset = 0
# We copy exactly one byte less than the actual length of the sequence
# we're trying to match. This is to prevent missing stuff like:
# sequence: 'abcd'
# read #1: 'abc' -> no match
# read #2: 'd' -> no match
# With the copy, we get:
# read #1: 'abc': -> no match
# copy: 'abc'
# read #2: 'abcd': -> match!
length = len(search_bytes) - 1
# We read right after the previously copied offset. This ensures we're
# perfectly matching the byte sequence above.
while read_len := file_handle.readinto(mv[offset:]):
if search_bytes in buf[:read_len + offset]:
return True

buf[:length] = buf[-length:]
offset = length

return False


Expand Down
30 changes: 30 additions & 0 deletions src/clusterfuzz/_internal/tests/core/base/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from unittest import mock

from google.cloud import ndb
import parameterized
from pyfakefs import fake_filesystem_unittest

from clusterfuzz._internal.base import utils
Expand Down Expand Up @@ -527,3 +528,32 @@ def test_not_exists(self):

with open(self.test_path, 'rb') as f:
self.assertFalse(utils.search_bytes_in_file(b'ABCD', f))


class SearchBytesInFileTestComplex(unittest.TestCase):
"""Tests search_bytes_in_file."""

def setUp(self):
self.temp_dir = tempfile.mkdtemp()
self.test_path = os.path.join(self.temp_dir, 'file')
with open(self.test_path, 'wb') as f:
f.write(b'A' * 16 + b'\n' + b'B' * 16)

def tearDown(self):
shutil.rmtree(self.temp_dir, ignore_errors=True)

@parameterized.parameterized.expand(
[10, 16, 17, 18, 20, utils.DEFAULT_SEARCH_BUFFER_LENGTH])
def test_exists(self, buffer_length):
"""Test exists."""
utils.DEFAULT_SEARCH_BUFFER_LENGTH = buffer_length
with open(self.test_path, 'rb') as f:
self.assertTrue(utils.search_bytes_in_file(b'A\nB', f))

@parameterized.parameterized.expand(
[10, 16, 17, 18, 20, utils.DEFAULT_SEARCH_BUFFER_LENGTH])
def test_not_exists(self, buffer_length):
"""Test not exists."""
utils.DEFAULT_SEARCH_BUFFER_LENGTH = buffer_length
with open(self.test_path, 'rb') as f:
self.assertFalse(utils.search_bytes_in_file(b'A\n\nB', f))

0 comments on commit 49a4437

Please sign in to comment.