[build_manager] add support for remote zip

This adds support for remote ZIP. As of now, performance is quite good locally, and the read-ahead mechanism should keep performance reasonable. Also, given that the ClusterFuzz bots use HDDs, the numbers might be even better there, as we only write to disk when unpacking the build. The memory consumption of this new feature is constant: it uses at most (and most of the time) 50 MB of RAM.
google · Oct 1, 2024 · c0e84e6 · c0e84e6
1 parent 49a4437
commit c0e84e6
Show file tree

Hide file tree

Showing 8 changed files with 460 additions and 91 deletions.
diff --git a/src/clusterfuzz/_internal/bot/fuzzers/utils.py b/src/clusterfuzz/_internal/bot/fuzzers/utils.py
@@ -13,10 +13,13 @@
 # limitations under the License.
 """Fuzzer utils."""
 
+import functools
 import os
 import re
 import stat
 import tempfile
+from typing import Callable
+from typing import Optional
 
 from clusterfuzz._internal.base import utils
 from clusterfuzz._internal.metrics import logs
@@ -30,7 +33,7 @@
 EXTRA_BUILD_DIR = '__extra_build'
 
 
-def is_fuzz_target_local(file_path, file_handle=None):
+def is_fuzz_target(file_path, file_opener: Optional[Callable] = None):
   """Returns whether |file_path| is a fuzz target binary (local path)."""
   if '@' in file_path:
     # GFT targets often have periods in the name that get misinterpreted as an
@@ -53,7 +56,7 @@ def is_fuzz_target_local(file_path, file_handle=None):
     # Ignore files with disallowed extensions (to prevent opening e.g. .zips).
     return False
 
-  if not file_handle and not os.path.exists(file_path):
+  if not file_opener and not os.path.exists(file_path):
     # Ignore non-existent files for cases when we don't have a file handle.
     return False
 
@@ -72,24 +75,27 @@ def is_fuzz_target_local(file_path, file_handle=None):
     logs.warning('Tried to read from non-regular file: %s.' % file_path)
     return False
 
-  # Use already provided file handle or open the file.
-  local_file_handle = file_handle or open(file_path, 'rb')
-
-  result = False
-  for pattern in FUZZ_TARGET_SEARCH_BYTES:
-    # TODO(metzman): Bound this call so we don't read forever if something went
-    # wrong.
-    local_file_handle.seek(0)
-    result = utils.search_bytes_in_file(pattern, local_file_handle)
-    if result:
-      break
-
-  if not file_handle:
-    # If this local file handle is owned by our function, close it now.
-    # Otherwise, it is caller's responsibility.
-    local_file_handle.close()
-
-  return result
+  # Either use the file opener or open the file ourselves.
+  if not file_opener:
+    file_opener = functools.partial(open, mode='rb')
+  try:
+    with file_opener(file_path) as file_handle:
+      result = False
+      for pattern in FUZZ_TARGET_SEARCH_BYTES:
+        # TODO(metzman): Bound this call so we don't read forever if something
+        # went wrong.
+        file_handle.seek(0)
+        result = utils.search_bytes_in_file(pattern, file_handle)
+        if result:
+          break
+
+      file_handle.close()
+
+      return result
+  except Exception as e:
+    # In case we could not open the file, we consider it's not a fuzzer.
+    logs.warning(f'Could not open {file_path}: {e}')
+    return False
 
 
 def get_fuzz_targets_local(path):
@@ -103,7 +109,7 @@ def get_fuzz_targets_local(path):
         continue
 
       file_path = os.path.join(root, filename)
-      if is_fuzz_target_local(file_path):
+      if is_fuzz_target(file_path):
         fuzz_target_paths.append(file_path)
 
   return fuzz_target_paths

diff --git a/src/clusterfuzz/_internal/build_management/build_archive.py b/src/clusterfuzz/_internal/build_management/build_archive.py
@@ -190,14 +190,10 @@ def list_fuzz_targets(self) -> List[str]:
     from clusterfuzz._internal.bot.fuzzers import utils as fuzzer_utils
 
     for archive_file in self.list_members():
-      file_content = self.try_open(archive_file.name)
-      if fuzzer_utils.is_fuzz_target_local(archive_file.name, file_content):
+      if fuzzer_utils.is_fuzz_target(archive_file.name, self.open):
         fuzz_target = fuzzer_utils.normalize_target_name(archive_file.name)
         self._fuzz_targets[fuzz_target] = archive_file.name
 
-      if file_content:
-        file_content.close()
-
     return list(self._fuzz_targets.keys())
 
   def unpacked_size(self, fuzz_target: Optional[str] = None) -> int:
@@ -299,23 +295,19 @@ def get_target_dependencies(
     return res
 
 
-# pylint: disable=redefined-builtin
-def open(archive_path: str) -> BuildArchive:
-  """Opens the archive and gets the appropriate build archive based on the
-  `archive_path`. The resulting object is usable as a normal archive reader,
-  but provides additional feature related to build handling.
+def open_with_reader(reader: archive.ArchiveReader) -> BuildArchive:
+  """Open the archive and gets the appropriate build archive based on the
+  provided archive information.
 
   Args:
-      archive_path: the path to the archive.
+      reader: the archive reader.
 
   Raises:
-      If the file could not be opened or if the archive type cannot be handled.
+    If the archive reader cannot be handled.
 
   Returns:
-      the build archive.
+      The build archive.
   """
-  reader = archive.open(archive_path)
-
   # Unfortunately, there is no good heuristic for determining which build
   # archive implementation to use.
   # Hopefully, we can search in the archive whether some files are present and
@@ -328,3 +320,42 @@ def open(archive_path: str) -> BuildArchive:
   if reader.file_exists(args_gn_path):
     return ChromeBuildArchive(reader)
   return DefaultBuildArchive(reader)
+
+
+def open(archive_path: str) -> BuildArchive:  # pylint: disable=redefined-builtin
+  """Opens the archive and gets the appropriate build archive based on the
+  `archive_path`. The resulting object is usable as a normal archive reader,
+  but provides additional feature related to build handling.
+
+  Args:
+      archive_path: the path to the archive.
+
+  Raises:
+      If the file could not be opened or if the archive type cannot be handled.
+
+  Returns:
+      The build archive.
+  """
+  reader = archive.open(archive_path)
+  return open_with_reader(reader)
+
+
+def open_uri(uri: str) -> BuildArchive:
+  """Opens a build archive over HTTP. This is only compatible with chromium as
+  of now.
+
+  Args:
+      uri: the URI pointing to the zip file.
+
+  Returns:
+      The build archive.
+  """
+  reader = archive.ZipArchiveReader(archive.HttpZipFile(uri))
+  return open_with_reader(reader)
+
+
+def unzip_over_http_compatible(build_url: str) -> bool:
+  """Whether the build URL is compatible with unzipping over HTTP.
+  As of now, we're only checking for chromium compatible URLs.
+  """
+  return archive.HttpZipFile.is_uri_compatible(build_url)
diff --git a/src/clusterfuzz/_internal/build_management/build_manager.py b/src/clusterfuzz/_internal/build_management/build_manager.py
@@ -14,11 +14,13 @@
 """Build manager."""
 
 from collections import namedtuple
+import contextlib
 import os
 import re
 import shutil
 import subprocess
 import time
+from typing import Optional
 
 from clusterfuzz._internal.base import errors
 from clusterfuzz._internal.base import utils
@@ -402,23 +404,19 @@ def _post_setup_success(self, update_revision=True):
     if instrumented_library_paths:
       self._patch_rpaths(instrumented_library_paths)
 
-  def _unpack_build(self, base_build_dir, build_dir, build_url):
-    """Unpacks a build from a build url into the build directory."""
-    # Track time taken to unpack builds so that it doesn't silently regress.
-    start_time = time.time()
-
-    logs.info(f'Unpacking build from {build_url} into {build_dir}.')
+  @contextlib.contextmanager
+  def _download_and_open_build_archive(self, base_build_dir: str,
+                                       build_dir: str, build_url: str):
+    """Downloads the build archive at `build_url` and opens it.
 
-    # Free up memory.
-    utils.python_gc()
-
-    # Remove the current build.
-    logs.info(f'Removing build directory {build_dir}.')
-    if not shell.remove_directory(build_dir, recreate=True):
-      logs.error(f'Unable to clear build directory {build_dir}.')
-      _handle_unrecoverable_error_on_windows()
-      return False
+    Args:
+        base_build_dir: the base build directory
+        build_dir: the current build directory
+        build_url: the build URL
 
+    Yields:
+        the build archive
+    """
     # Download build archive locally.
     build_local_archive = os.path.join(build_dir, os.path.basename(build_url))
 
@@ -431,15 +429,83 @@ def _unpack_build(self, base_build_dir, build_dir, build_url):
           'Failed to make space for download. '
           'Cleared all data directories to free up space, exiting.')
 
-    logs.info(f'Downloading build from {build_url}.')
+    logs.info(f'Downloading build from {build_url} to {build_local_archive}.')
     try:
       storage.copy_file_from(build_url, build_local_archive)
     except Exception as e:
       logs.error(f'Unable to download build from {build_url}: {e}')
-      return False
+      raise
 
     try:
       with build_archive.open(build_local_archive) as build:
+        yield build
+    finally:
+      shell.remove_file(build_local_archive)
+
+  def _open_build_archive(self, base_build_dir: str, build_dir: str,
+                          build_url: str, http_build_url: Optional[str],
+                          unpack_everything: Optional[bool]):
+    """Gets a handle on a build archive for the current build. Depending on the
+    provided parameters, this function might download the build archive into
+    the build directory or directly use remote HTTP archive.
+
+    Args:
+        unpack_everything: whether we should unpack the whole archive or try
+        selective unpacking.
+        base_build_dir: the base build directory.
+        build_dir: the current build directory.
+        build_url: the build URL.
+        http_build_url: the HTTP build URL.
+
+    Raises:
+        if an error occurred while accessing the file over HTTP or while
+        downloading the file on disk.
+
+    Returns:
+        the build archive.
+    """
+    # We only want to use remote unzipping if we're not unpacking everything and
+    # if the HTTP URL is compatible with remote unzipping.
+    allow_unpack_over_http = environment.get_value(
+        'ALLOW_UNPACK_OVER_HTTP', default_value=False)
+    can_unzip_over_http = (
+        allow_unpack_over_http and not unpack_everything and http_build_url and
+        build_archive.unzip_over_http_compatible(http_build_url))
+
+    if not can_unzip_over_http:
+      return self._download_and_open_build_archive(base_build_dir, build_dir,
+                                                   build_url)
+    logs.info("Opening an archive over HTTP, skipping archive download.")
+    assert http_build_url
+    return build_archive.open_uri(http_build_url)
+
+  def _unpack_build(self,
+                    base_build_dir,
+                    build_dir,
+                    build_url,
+                    http_build_url=None):
+    """Unpacks a build from a build url into the build directory."""
+    # Track time taken to unpack builds so that it doesn't silently regress.
+    start_time = time.time()
+
+    unpack_everything = environment.get_value(
+        'UNPACK_ALL_FUZZ_TARGETS_AND_FILES')
+
+    logs.info(f'Unpacking build from {build_url} into {build_dir}.')
+
+    # Free up memory.
+    utils.python_gc()
+
+    # Remove the current build.
+    logs.info(f'Removing build directory {build_dir}.')
+    if not shell.remove_directory(build_dir, recreate=True):
+      logs.error(f'Unable to clear build directory {build_dir}.')
+      _handle_unrecoverable_error_on_windows()
+      return False
+
+    try:
+      with self._open_build_archive(base_build_dir, build_dir, build_url,
+                                    http_build_url, unpack_everything) as build:
         unpack_everything = environment.get_value(
             'UNPACK_ALL_FUZZ_TARGETS_AND_FILES')
 
@@ -463,8 +529,7 @@ def _unpack_build(self, base_build_dir, build_dir, build_url):
               'Cleared all data directories to free up space, exiting.')
 
         # Unpack the local build archive.
-        logs.info(
-            f'Unpacking build archive {build_local_archive} to {build_dir}.')
+        logs.info(f'Unpacking build archive {build_url} to {build_dir}.')
         trusted = not utils.is_oss_fuzz()
 
         build.unpack(
@@ -473,7 +538,7 @@ def _unpack_build(self, base_build_dir, build_dir, build_url):
             trusted=trusted)
 
     except Exception as e:
-      logs.error(f'Unable to unpack build archive {build_local_archive}: {e}')
+      logs.error(f'Unable to unpack build archive {build_url}: {e}')
       return False
 
     if unpack_everything:
@@ -484,9 +549,6 @@ def _unpack_build(self, base_build_dir, build_dir, build_url):
       partial_build_file_path = os.path.join(build_dir, PARTIAL_BUILD_FILE)
       utils.write_data_to_file('', partial_build_file_path)
 
-    # No point in keeping the archive around.
-    shell.remove_file(build_local_archive)
-
     elapsed_time = time.time() - start_time
     elapsed_mins = elapsed_time / 60.
     log_func = logs.warning if elapsed_time > UNPACK_TIME_LIMIT else logs.info
@@ -605,10 +667,20 @@ def __init__(self,
                revision,
                build_url,
                build_prefix='',
-               fuzz_target=None):
+               fuzz_target=None,
+               http_build_url=None):
+    """RegularBuild constructor. See Build constructor for other parameters.
+
+    Args:
+        http_build_url: the http build URL. E.g.
+        http://storage.com/foo/bar.zip. Defaults to None.
+        build_url: the bucket URL where the build can be found. E.g.
+        gs://foo/bar.zip.
+    """
     super().__init__(
         base_build_dir, revision, build_prefix, fuzz_target=fuzz_target)
     self.build_url = build_url
+    self.http_build_url = http_build_url
 
     if build_prefix:
       self.build_dir_name = build_prefix.lower()
@@ -630,7 +702,7 @@ def setup(self):
     build_update = not self.exists()
     if build_update:
       if not self._unpack_build(self.base_build_dir, self.build_dir,
-                                self.build_url):
+                                self.build_url, self.http_build_url):
         return False
 
       logs.info('Retrieved build r%d.' % self.revision)
@@ -1116,6 +1188,9 @@ def setup_regular_build(revision,
 
     return None
 
+  # build_url points to a GCP bucket, and we're only converting it to its HTTP
+  # endpoint so that we can use remote unzipping.
+  http_build_url = build_url.replace('gs://', 'https://storage.googleapis.com/')
   base_build_dir = _base_build_dir(bucket_path)
 
   build_class = RegularBuild
@@ -1133,7 +1208,8 @@ def setup_regular_build(revision,
       revision,
       build_url,
       build_prefix=build_prefix,
-      fuzz_target=fuzz_target)
+      fuzz_target=fuzz_target,
+      http_build_url=http_build_url)
   if build.setup():
     result = build
   else: