Skip to content

Commit

Permalink
utils: accelerate byte search in big file handles (#4279)
Browse files Browse the repository at this point in the history
The previous implementation was not performant, for multiple reasons:
- it searched for lines, which means it forced the internal
implementation to search for '\n' in the bytes.
- as mentioned in the old comment, it was actually not memory stable,
because the length of the line could be huge for big files, and thus
didn't really prevent OOM issues.
- it was not exactly correct, because it could never find patterns
containing '\n' (a match spanning two lines was split across iterations).

On huge files (such as Chrome binaries, which is why I started
investigating this), the new implementation improves the performance of
this function by a factor of 5, tremendously reducing the amount of time
spent here.

---------

Co-authored-by: jonathanmetzman <[email protected]>
  • Loading branch information
paulsemel and jonathanmetzman authored Sep 30, 2024
1 parent b284e12 commit 49a4437
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 7 deletions.
41 changes: 34 additions & 7 deletions src/clusterfuzz/_internal/base/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@
LOCAL_SOURCE_MANIFEST = os.path.join('src', 'appengine', 'resources',
'clusterfuzz-source.manifest')

# The length we're reading from the file-object in search_bytes_in_file.
DEFAULT_SEARCH_BUFFER_LENGTH = 10 * 1024 * 1024 # 10MB


def utcnow():
"""Return datetime.datetime.utcnow(). We need this method because we can't
Expand Down Expand Up @@ -673,16 +676,40 @@ def restart_machine():
os.system('sudo shutdown -r now')


def search_bytes_in_file(search_bytes, file_handle):
"""Helper to search for bytes in a large binary file without memory
issues.
def search_bytes_in_file(search_bytes, file_handle) -> bool:
"""Searches `search_bytes` in the file represented by the file-object
`file_handle`.
Args:
search_bytes: the bytes to search for in the file.
file_handle: the file-object handle.
Returns:
Whether the pattern was found in the file.
"""
# TODO(aarya): This is too brittle and will fail if we have a very large
# line.
for line in file_handle:
if search_bytes in line:
# Invariant: `buf[:offset]` contains valid bytes.
buf = bytearray(DEFAULT_SEARCH_BUFFER_LENGTH)
mv = memoryview(buf)
offset = 0
# We copy exactly one byte less than the actual length of the sequence
# we're trying to match. This is to prevent missing stuff like:
# sequence: 'abcd'
# read #1: 'abc' -> no match
# read #2: 'd' -> no match
# With the copy, we get:
# read #1: 'abc': -> no match
# copy: 'abc'
# read #2: 'abcd': -> match!
length = len(search_bytes) - 1
# We read right after the previously copied offset. This ensures we're
# perfectly matching the byte sequence above.
while read_len := file_handle.readinto(mv[offset:]):
if search_bytes in buf[:read_len + offset]:
return True

buf[:length] = buf[-length:]
offset = length

return False


Expand Down
30 changes: 30 additions & 0 deletions src/clusterfuzz/_internal/tests/core/base/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from unittest import mock

from google.cloud import ndb
import parameterized
from pyfakefs import fake_filesystem_unittest

from clusterfuzz._internal.base import utils
Expand Down Expand Up @@ -527,3 +528,32 @@ def test_not_exists(self):

with open(self.test_path, 'rb') as f:
self.assertFalse(utils.search_bytes_in_file(b'ABCD', f))


class SearchBytesInFileTestComplex(unittest.TestCase):
"""Tests search_bytes_in_file."""

def setUp(self):
self.temp_dir = tempfile.mkdtemp()
self.test_path = os.path.join(self.temp_dir, 'file')
with open(self.test_path, 'wb') as f:
f.write(b'A' * 16 + b'\n' + b'B' * 16)

def tearDown(self):
shutil.rmtree(self.temp_dir, ignore_errors=True)

@parameterized.parameterized.expand(
[10, 16, 17, 18, 20, utils.DEFAULT_SEARCH_BUFFER_LENGTH])
def test_exists(self, buffer_length):
"""Test exists."""
utils.DEFAULT_SEARCH_BUFFER_LENGTH = buffer_length
with open(self.test_path, 'rb') as f:
self.assertTrue(utils.search_bytes_in_file(b'A\nB', f))

@parameterized.parameterized.expand(
[10, 16, 17, 18, 20, utils.DEFAULT_SEARCH_BUFFER_LENGTH])
def test_not_exists(self, buffer_length):
"""Test not exists."""
utils.DEFAULT_SEARCH_BUFFER_LENGTH = buffer_length
with open(self.test_path, 'rb') as f:
self.assertFalse(utils.search_bytes_in_file(b'A\n\nB', f))

0 comments on commit 49a4437

Please sign in to comment.