From 08918c7860e41d75fa397970187a64fa43c14222 Mon Sep 17 00:00:00 2001
From: Chris Iacovella <christopher.r.iacovella@vanderbilt.edu>
Date: Fri, 28 Apr 2023 12:46:03 -0700
Subject: [PATCH] speeding up add function for large compounds (#1107)

* updated add routine to compose bond graphs of a list together before composing with main compound

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* change flattening routine for list, as it was causing a failure in the add remove bond test

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* some how left a conflict statement in...not sure why it is causing a conflict to begin with

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* added in sped up routine for loading mdtraj trajectories

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* added in sped up routine for loading parmed trajectories

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* added new tests

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* added tests for helper functions

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* added condense function for Compounds

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* added tests to condense function

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* addressed comments from cal

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* minor changes in codes and arrangmenet

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix style

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix unit test

---------

Co-authored-by: Christopher Iacovella <cri@MB22.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Christopher Iacovella <cri@MB22.hsd1.or.comcast.net>
Co-authored-by: Co Quach <daico007@gmail.com>
---
 mbuild/compound.py            | 133 +++++++++++++++++++++++++++++++---
 mbuild/conversion.py          |  45 ++++++++++--
 mbuild/tests/test_compound.py | 113 +++++++++++++++++++++++++++++
 3 files changed, 274 insertions(+), 17 deletions(-)

diff --git a/mbuild/compound.py b/mbuild/compound.py
index e39c03ac8..b09569b3f 100644
--- a/mbuild/compound.py
+++ b/mbuild/compound.py
@@ -13,6 +13,7 @@
 import ele
 import networkx as nx
 import numpy as np
+from boltons.setutils import IndexedSet
 from ele.element import Element, element_from_name, element_from_symbol
 from ele.exceptions import ElementError
 from treelib import Tree
@@ -811,8 +812,8 @@ def add(
         ----------
         new_child : mb.Compound or list-like of mb.Compound
             The object(s) to be added to this Compound.
-        label : str, optional, default None
-            A descriptive string for the part.
+        label : str, or list-like of str, optional, default None
+            A descriptive string for the part; if a list, must be the same length/shape as new_child.
         containment : bool, optional, default=True
             Add the part to self.children.
         replace : bool, optional, default=True
@@ -829,11 +830,56 @@ def add(
             to add Compounds to an existing rigid body.
         """
         # Support batch add via lists, tuples and sets.
+        # If iterable, we will first compose all the bondgraphs of individual
+        # Compounds in the list for efficiency
         from mbuild.port import Port
 
         if isinstance(new_child, Iterable) and not isinstance(new_child, str):
-            for child in new_child:
-                self.add(child, reset_rigid_ids=reset_rigid_ids)
+            compound_list = [c for c in _flatten_list(new_child)]
+            if label is not None and isinstance(label, (list, tuple)):
+                label_list = [c for c in _flatten_list(label)]
+                if len(label_list) != len(compound_list):
+                    raise ValueError(
+                        "The list-like object for label must be the same length as"
+                        "the list-like object of child Compounds. "
+                        f"total length of labels: {len(label_list)}, new_child: {len(new_child)}."
+                    )
+            temp_bond_graphs = []
+            for child in compound_list:
+                # create a list of bond graphs of the children to add
+                if containment:
+                    if child.bond_graph and not isinstance(self, Port):
+                        temp_bond_graphs.append(child.bond_graph)
+
+            # compose children bond_graphs; make sure we actually have graphs to compose
+            children_bond_graph = None
+            if len(temp_bond_graphs) != 0:
+                children_bond_graph = nx.compose_all(temp_bond_graphs)
+
+            if (
+                temp_bond_graphs
+                and not isinstance(self, Port)
+                and children_bond_graph is not None
+            ):
+                # If anything is added at self level, it is no longer a particle
+                # search for self in self.root.bond_graph and remove self
+                if self.root.bond_graph.has_node(self):
+                    self.root.bond_graph.remove_node(self)
+                # compose the bond graph of all the children with the root
+                self.root.bond_graph = nx.compose(
+                    self.root.bond_graph, children_bond_graph
+                )
+            for i, child in enumerate(compound_list):
+                child.bond_graph = None
+                if label is not None:
+                    self.add(
+                        child,
+                        label=label_list[i],
+                        reset_rigid_ids=reset_rigid_ids,
+                    )
+                else:
+                    self.add(child, reset_rigid_ids=reset_rigid_ids)
+
             return
 
         if not isinstance(new_child, Compound):
@@ -1859,9 +1905,9 @@ def _visualize_nglview(self, show_ports=False, color_scheme={}):
         mdtraj = import_("mdtraj")
         from mdtraj.geometry.sasa import _ATOMIC_RADII
 
-        remove_digits = lambda x: "".join(
-            i for i in x if not i.isdigit() or i == "_"
-        )
+        def remove_digits(x):
+            return "".join(i for i in x if not i.isdigit() or i == "_")
+
         for particle in self.particles():
             particle.name = remove_digits(particle.name).upper()
             if not particle.name:
@@ -1899,6 +1945,58 @@ def _visualize_nglview(self, show_ports=False, color_scheme={}):
         overwrite_nglview_default(widget)
         return widget
 
+    def condense(self, inplace=True):
+        """Condense the hierarchical structure of the Compound to the level of molecules.
+
+        Modify the mBuild Compound to become a Compound with 3 distinct levels in the hierarchy.
+        The top level container (self), contains molecules (i.e., connected Compounds) and the
+        third level represents Particles (i.e., Compounds with no children).
+        If the system contains a Particle(s) without any connections to other Compounds, it will
+        appear in the 2nd level (with the top level self as a parent).
+
+        Parameter
+        ---------
+        inplace : bool, optional, default=True
+            Option to perform the condense operation inplace or return a copy
+
+        Return
+        ------
+        self : mb.Compound or None
+            return a condensed Compound if inplace is False.
+        """
+        # temporary list of components
+        comp_list = []
+        connected_subgraph = self.root.bond_graph.connected_components()
+
+        for molecule in connected_subgraph:
+            if len(molecule) == 1:
+                ancestors = [molecule[0]]
+            else:
+                ancestors = IndexedSet(molecule[0].ancestors())
+                for particle in molecule[1:]:
+                    # This works because the way in which particle.ancestors is
+                    # traversed, the lower level will be in the front.
+                    # The intersection will be left at the end,
+                    # ancestor of the first particle is used as reference.
+                    # Hence, this called will return the lowest-level Compound
+                    # that is a molecule
+                    ancestors = ancestors.intersection(
+                        IndexedSet(particle.ancestors())
+                    )
+
+            """Parse molecule information"""
+            molecule_tag = ancestors[0]
+            comp_list.append(clone(molecule_tag))
+        if inplace:
+            for child in [self.children]:
+                # Need to handle the case when child is a port
+                self.remove(child)
+            self.add(comp_list)
+        else:
+            new_compound = Compound(name=self.name)
+            new_compound.add(comp_list)
+            return new_compound
+
     def flatten(self, inplace=True):
         """Flatten the hierarchical structure of the Compound.
 
@@ -1913,7 +2011,7 @@ def flatten(self, inplace=True):
         Return
         ------
         self : mb.Compound or None
-            return a flatten Compound if inplace is False.
+            return a flattened Compound if inplace is False.
         """
         ports_list = list(self.all_ports())
         children_list = list(self.children)
@@ -2554,8 +2652,10 @@ def _energy_minimize_openbabel(
                         f"Cannot create a constraint between a Particle and itself: {p1} {p2} ."
                     )
 
-                pid_1 = particle_idx[id(p1)] + 1  # openbabel indices start at 1
-                pid_2 = particle_idx[id(p2)] + 1  # openbabel indices start at 1
+                # openbabel indices start at 1
+                pid_1 = particle_idx[id(p1)] + 1
+                # openbabel indices start at 1
+                pid_2 = particle_idx[id(p2)] + 1
                 dist = (
                     con_temp[1] * 10.0
                 )  # obenbabel uses angstroms, not nm, convert to angstroms
@@ -3421,3 +3521,16 @@ def _clone_bonds(self, clone_of=None):
 
 
 Particle = Compound
+
+
+def _flatten_list(c_list):
+    """Flatten a list.
+
+    Helper function to flatten a list that may be nested, e.g. [comp1, [comp2, comp3]].
+    """
+    if isinstance(c_list, Iterable) and not isinstance(c_list, str):
+        for c in c_list:
+            if isinstance(c, Iterable) and not isinstance(c, str):
+                yield from _flatten_list(c)
+            else:
+                yield c
diff --git a/mbuild/conversion.py b/mbuild/conversion.py
index e2341b349..50e0f2bb8 100644
--- a/mbuild/conversion.py
+++ b/mbuild/conversion.py
@@ -554,19 +554,23 @@ def from_parmed(
         chains[residue.chain].append(residue)
 
     # Build up compound
+    chain_list = []
     for chain, residues in chains.items():
         if len(chain) > 1:
             chain_compound = mb.Compound()
-            compound.add(chain_compound, chain_id)
+            chain_list.append(chain_compound)
         else:
             chain_compound = compound
+        res_list = []
         for residue in residues:
             if infer_hierarchy:
                 residue_compound = mb.Compound(name=residue.name)
-                chain_compound.add(residue_compound)
                 parent_compound = residue_compound
+                res_list.append(residue_compound)
             else:
                 parent_compound = chain_compound
+            atom_list = []
+            atom_label_list = []
             for atom in residue.atoms:
                 # Angstrom to nm
                 pos = np.array([atom.xx, atom.xy, atom.xz]) / 10
@@ -577,8 +581,14 @@ def from_parmed(
                 new_atom = mb.Particle(
                     name=str(atom.name), pos=pos, element=element
                 )
-                parent_compound.add(new_atom, label="{0}[$]".format(atom.name))
+                atom_list.append(new_atom)
+                atom_label_list.append("{0}[$]".format(atom.name))
                 atom_mapping[atom] = new_atom
+            parent_compound.add(atom_list, label=atom_label_list)
+        if infer_hierarchy:
+            chain_compound.add(res_list)
+    if len(chain) > 1:
+        compound.add(chain_list)
 
     # Infer bonds information
     for bond in structure.bonds:
@@ -650,19 +660,29 @@ def from_trajectory(
         compound = mb.Compound()
 
     atom_mapping = dict()
+    # temporary lists to speed up add to the compound
+    chains_list = []
+    chains_list_label = []
+
     for chain in traj.topology.chains:
         if traj.topology.n_chains > 1:
             chain_compound = mb.Compound()
-            compound.add(chain_compound, "chain[$]")
+            chains_list.append(chain_compound)
+            chains_list_label.append("chain[$]")
         else:
             chain_compound = compound
+
+        res_list = []
         for res in chain.residues:
             if infer_hierarchy:
                 res_compound = mb.Compound(name=res.name)
-                chain_compound.add(res_compound)
                 parent_cmpd = res_compound
+                res_list.append(res_compound)
             else:
                 parent_cmpd = chain_compound
+
+            atom_list = []
+            atom_label_list = []
             for atom in res.atoms:
                 try:
                     element = element_from_atomic_number(
@@ -675,9 +695,17 @@ def from_trajectory(
                     pos=traj.xyz[frame, atom.index],
                     element=element,
                 )
-                parent_cmpd.add(new_atom, label="{0}[$]".format(atom.name))
+                atom_list.append(new_atom)
+                atom_label_list.append("{0}[$]".format(atom.name))
                 atom_mapping[atom] = new_atom
 
+            parent_cmpd.add(atom_list, label=atom_label_list)
+
+        if infer_hierarchy:
+            chain_compound.add(res_list)
+    if traj.topology.n_chains > 1:
+        compound.add(chains_list, label=chains_list_label)
+
     for mdtraj_atom1, mdtraj_atom2 in traj.topology.bonds:
         atom1 = atom_mapping[mdtraj_atom1]
         atom2 = atom_mapping[mdtraj_atom2]
@@ -846,13 +874,16 @@ def from_rdkit(rdkit_mol, compound=None, coords_only=False, smiles_seed=0):
     else:
         comp = compound
 
+    part_list = []
     for i, atom in enumerate(mymol.GetAtoms()):
         part = mb.Particle(
             name=atom.GetSymbol(),
             element=element_from_atomic_number(atom.GetAtomicNum()),
             pos=xyz[i],
         )
-        comp.add(part)
+        part_list.append(part)
+
+    comp.add(part_list)
 
     for bond in mymol.GetBonds():
         comp.add_bond(
diff --git a/mbuild/tests/test_compound.py b/mbuild/tests/test_compound.py
index d82a88714..99c19e1a5 100644
--- a/mbuild/tests/test_compound.py
+++ b/mbuild/tests/test_compound.py
@@ -621,6 +621,51 @@ def test_add_label_exists(self, ethane, h2o):
         with pytest.raises(MBuildError):
             ethane.add(mb.clone(h2o), label="water")
 
+    def test_list_flatten(self, h2o):
+        from mbuild.compound import _flatten_list
+
+        out = [a for a in _flatten_list(["one", "two", ["three", "four"]])]
+        assert out == ["one", "two", "three", "four"]
+
+        one = mb.clone(h2o)
+        one.name = "one"
+        two = mb.clone(h2o)
+        two.name = "two"
+        three = mb.clone(h2o)
+        three.name = "three"
+        four = mb.clone(h2o)
+        four.name = "four"
+        out = [a.name for a in _flatten_list([one, two, [three, four]])]
+
+        assert out == ["one", "two", "three", "four"]
+
+    def test_add_by_list(self, h2o):
+        temp_comp = mb.Compound()
+        comp_list = []
+        label_list = []
+        for j in range(0, 5):
+            comp_list.append(mb.clone(h2o))
+            label_list.append("water[$]")
+        temp_comp.add(comp_list, label=label_list)
+        a = [k for k, v in temp_comp.labels.items()]
+        assert a == [
+            "water",
+            "water[0]",
+            "water[1]",
+            "water[2]",
+            "water[3]",
+            "water[4]",
+        ]
+
+        temp_comp = mb.Compound()
+        comp_list = []
+        label_list = ["water"]
+        for j in range(0, 5):
+            comp_list.append(mb.clone(h2o))
+
+        with pytest.raises(ValueError):
+            temp_comp.add(comp_list, label=label_list)
+
     def test_set_pos(self, ethane):
         with pytest.raises(MBuildError):
             ethane.pos = [0, 0, 0]
@@ -877,6 +922,74 @@ def test_particle_in_particle(self):
         assert len(list(parent.ancestors())) == 0
         assert next(parent.particles_by_name("A")) == part
 
+    def test_condense(self, ethane):
+        ethanes = mb.Compound()
+        ethanes.add(mb.clone(ethane))
+        ethanes.add(mb.clone(ethane))
+        system = mb.Compound()
+        system.add(ethanes)
+        system_hierarchy = system.print_hierarchy(show_tree=False)
+
+        # Before we condese
+        assert len(system.children) == 1
+        assert len(ethanes.children) == 2
+        assert system.n_bonds == 14
+        assert system.n_particles == 16
+        assert system_hierarchy.depth() == 4
+
+        # Condense the Compound, returning a copy
+        condensed = system.condense(inplace=False)
+        condensed_hierarchy = condensed.print_hierarchy(show_tree=False)
+
+        assert len(condensed.children) == 2
+        assert condensed.n_bonds == 14
+        assert condensed.n_particles == 16
+
+        assert condensed_hierarchy.depth() == 3
+        assert (
+            condensed_hierarchy.to_json(with_data=False)
+            == '{"Compound, 16 particles, 14 bonds, 2 children": {"children": [{"[Ethane x 2], 8 particles, 7 bonds, 2 children": {"children": [{"[CH3 x 2], 4 particles, 3 bonds, 4 children": {"children": ["[C x 1], 1 particles, 4 bonds, 0 children", "[H x 3], 1 particles, 1 bonds, 0 children"]}}]}}]}}'
+        )
+
+        # Condense the Compound in place
+        system_copy = mb.clone(system)
+        system_copy.condense(inplace=True)
+        condensed_hierarchy2 = system_copy.print_hierarchy(show_tree=False)
+
+        assert len(system_copy.children) == 2
+        assert system_copy.n_bonds == 14
+        assert system_copy.n_particles == 16
+        assert condensed_hierarchy2.depth() == 3
+        assert (
+            condensed_hierarchy2.to_json(with_data=False)
+            == '{"Compound, 16 particles, 14 bonds, 2 children": {"children": [{"[Ethane x 2], 8 particles, 7 bonds, 2 children": {"children": [{"[CH3 x 2], 4 particles, 3 bonds, 4 children": {"children": ["[C x 1], 1 particles, 4 bonds, 0 children", "[H x 3], 1 particles, 1 bonds, 0 children"]}}]}}]}}'
+        )
+
+        # add two particles that aren't bonded
+        system.add(mb.Compound(name="C"))
+        system.add(mb.Compound(name="C"))
+        system_hierarchy = system.print_hierarchy(show_tree=False)
+
+        assert len(system.children) == 3
+        assert len(ethanes.children) == 2
+        assert system.n_bonds == 14
+        assert system.n_particles == 18
+        assert system_hierarchy.depth() == 4
+
+        # condense the system
+        condensed = system.condense(inplace=False)
+        condensed_hierarchy = condensed.print_hierarchy(show_tree=False)
+
+        assert len(condensed.children) == 4
+        assert condensed.n_bonds == 14
+        assert condensed.n_particles == 18
+
+        assert condensed_hierarchy.depth() == 3
+        assert (
+            condensed_hierarchy.to_json(with_data=False)
+            == '{"Compound, 18 particles, 14 bonds, 4 children": {"children": ["[C x 2], 1 particles, 0 bonds, 0 children", {"[Ethane x 2], 8 particles, 7 bonds, 2 children": {"children": [{"[CH3 x 2], 4 particles, 3 bonds, 4 children": {"children": ["[C x 1], 1 particles, 4 bonds, 0 children", "[H x 3], 1 particles, 1 bonds, 0 children"]}}]}}]}}'
+        )
+
     def test_flatten_eth(self, ethane):
         # Before flattening
         assert len(ethane.children) == 2