Skip to content

Commit

Permalink
Merge pull request #63 from lean-dojo/dev
Browse files Browse the repository at this point in the history
Faster repo loading
  • Loading branch information
Kaiyu Yang authored Sep 24, 2023
2 parents 99c5723 + a0dc340 commit 59937fa
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 8 deletions.
1 change: 1 addition & 0 deletions docs/source/user-guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ LeanDojo's behavior can be configured through the following environment variable
* :code:`TACTIC_CPU_LIMIT`: Number of CPUs for executing tactics (see the `--cpus` flag of `docker run <https://docs.docker.com/engine/reference/commandline/run/#memory>`_) when interacting with Lean (only applicable when :code:`CONTAINER=docker`). Default to 1.
* :code:`TACTIC_MEMORY_LIMIT`: Maximum memory when interacting with Lean (see the `--memory` flag of `docker run <https://docs.docker.com/engine/reference/commandline/run/#memory>`_) (only applicable when :code:`CONTAINER=docker`). Default to 16 GB.
* :code:`GITHUB_ACCESS_TOKEN`: GitHub `personal access token <https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-personal-access-token-classic>`_ for using the GitHub API. It is optional. If provided, it can increase the `API rate limit <https://docs.github.com/en/rest/overview/resources-in-the-rest-api#rate-limiting>`_.
* :code:`LOAD_USED_DEPS_ONLY`: Setting it to any value will cause LeanDojo to load only the dependency files that are actually used by the target repo. Otherwise, for Lean 4, it will load all files in the dependency repos. Not set by default.
* :code:`VERBOSE` or :code:`DEBUG`: Setting either of them to any value will cause LeanDojo to print debug information. Not set by default.

LeanDojo supports `python-dotenv <https://pypi.org/project/python-dotenv/>`_. You can use it to manage environment variables in a :file:`.env` file.
Expand Down
3 changes: 3 additions & 0 deletions src/lean_dojo/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@
LEAN4_DEPS_DIR = Path("lake-packages")
"""The directory where Lean 4 dependencies are stored."""

LOAD_USED_DEPS_ONLY = "LOAD_USED_DEPS_ONLY" in os.environ
"""Only load depdendency files that are actually used by the target repo."""

TACTIC_TIMEOUT = int(os.getenv("TACTIC_TIMEOUT", 5000))
"""Maximum time (in milliseconds) before interrupting a tactic when interacting with Lean (only for Lean 3).
"""
Expand Down
2 changes: 1 addition & 1 deletion src/lean_dojo/data_extraction/lean.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,7 @@ def clone_and_checkout(self) -> None:
def get_dependencies(
self, path: Union[str, Path, None] = None
) -> Dict[str, "LeanGitRepo"]:
"""Return the dependencies required by the current repo.
"""Return the dependencies required by the target repo.
Args:
path (Union[str, Path, None], optional): Root directory of the repo if it is on the disk.
Expand Down
36 changes: 29 additions & 7 deletions src/lean_dojo/data_extraction/traced_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
)
from .ast.lean3.node import *
from .ast.lean4.node import *
from ..constants import LOAD_USED_DEPS_ONLY
from .lean import LeanFile, LeanGitRepo, Theorem, Pos
from .ast.lean3.expr import Expr, ConstExpr, parse_exprs_forest
from ..constants import NUM_WORKERS, LEAN3_DEPS_DIR, LEAN4_DEPS_DIR
Expand Down Expand Up @@ -1229,21 +1230,36 @@ def _save_xml_to_disk(tf: TracedFile) -> None:
oup.write(tf.to_xml())


def _build_dependency_graph(
    seed_files: List[TracedFile], root_dir: Path, repo: LeanGitRepo
) -> nx.DiGraph:
    """Build the import graph over traced files reachable from ``seed_files``.

    Nodes are file paths (as strings) with the corresponding ``TracedFile``
    stored in the ``traced_file`` node attribute; an edge X -> Y means file X
    directly depends on file Y. Dependencies that are not among the seeds
    (e.g. when only the target repo's own files were loaded) are lazily loaded
    from their traced XML files on disk and explored in turn, so the returned
    graph is closed under direct dependencies.

    Args:
        seed_files: Traced files to start the traversal from.
        root_dir: Root directory of the traced repo on disk.
        repo: The traced Git repo, used to locate and parse dependency XMLs.

    Returns:
        A directed acyclic graph of all reachable traced files.
    """
    G = nx.DiGraph()

    for tf in seed_files:
        tf_path_str = str(tf.path)
        assert not G.has_node(tf_path_str)
        G.add_node(tf_path_str, traced_file=tf)

    # Worklist traversal: `traced_files` grows as unseen dependencies are
    # discovered; `i` indexes the next file whose dependencies to expand.
    traced_files = seed_files.copy()
    i = 0

    while i < len(traced_files):
        tf = traced_files[i]
        tf_path_str = str(tf.path)

        for dep_path in tf.get_direct_dependencies():
            dep_path_str = str(dep_path)
            if not G.has_node(dep_path_str):
                # Dependency not among the seeds; load its traced XML from disk.
                xml_path = to_xml_path(root_dir, dep_path, repo.uses_lean4)
                tf_dep = TracedFile.from_xml(root_dir, xml_path, repo)
                # Bug fix: the loaded file must match the dependency we asked
                # for. The original asserted `tf_dep.path == tf.path`, which
                # compares against the *importing* file and fails for any
                # dependency distinct from it — i.e. every lazily loaded file.
                assert tf_dep.path == dep_path
                G.add_node(dep_path_str, traced_file=tf_dep)
                traced_files.append(tf_dep)

            G.add_edge(tf_path_str, dep_path_str)

        i += 1

    # Lean imports cannot be circular, so the result must be a DAG.
    assert nx.is_directed_acyclic_graph(G)
    return G

Expand Down Expand Up @@ -1348,7 +1364,8 @@ def check_sanity(self) -> None:
p.relative_to(self.root_dir) for p in self.root_dir.glob("**/*.dep_paths")
}

assert len(json_files) == self.traced_files_graph.number_of_nodes()
if not LOAD_USED_DEPS_ONLY:
assert len(json_files) == self.traced_files_graph.number_of_nodes()

for path_str, tf_node in self.traced_files_graph.nodes.items():
tf = tf_node["traced_file"]
Expand Down Expand Up @@ -1407,7 +1424,7 @@ def from_traced_files(cls, root_dir: Union[str, Path]) -> None:
)

dependencies = repo.get_dependencies(root_dir)
traced_files_graph = _build_dependency_graph(traced_files)
traced_files_graph = _build_dependency_graph(traced_files, root_dir, repo)
traced_repo = cls(repo, dependencies, root_dir, traced_files_graph)
traced_repo._update_traced_files()
return traced_repo
Expand Down Expand Up @@ -1464,6 +1481,11 @@ def load_from_disk(cls, root_dir: Union[str, Path]) -> "TracedRepo":
f"Loading {len(xml_paths)} traced XML files from {root_dir} with {NUM_WORKERS} workers"
)

# Start from files in the target repo as seeds.
# Only load dependency files that are actually used.
if LOAD_USED_DEPS_ONLY:
xml_paths = [p for p in xml_paths if not "lake-packages" in p.parts]

if NUM_WORKERS <= 1:
traced_files = [
TracedFile.from_xml(root_dir, path, repo) for path in tqdm(xml_paths)
Expand All @@ -1480,7 +1502,7 @@ def load_from_disk(cls, root_dir: Union[str, Path]) -> "TracedRepo":
)

dependencies = repo.get_dependencies(root_dir)
traced_files_graph = _build_dependency_graph(traced_files)
traced_files_graph = _build_dependency_graph(traced_files, root_dir, repo)
traced_repo = cls(repo, dependencies, root_dir, traced_files_graph)
traced_repo._update_traced_files()
return traced_repo
Expand Down

0 comments on commit 59937fa

Please sign in to comment.