[Add Randomized SVD in PCA] (secretflow#300)

# Pull Request ## What problem does this PR solve? 使用 SPU 优化 PCA 算法 Issue Number: Fixed secretflow#259 ## Possible side effects? - Performance: 1. 收敛速度更快(体现在能支持更大的特征维度) 2. 不需要显示的计算原数据集的协方差矩阵 - Backward compatibility:
magic-hya · Aug 15, 2023 · f726019 · f726019
1 parent 9857c82
commit f726019
Show file tree

Hide file tree

Showing 9 changed files with 388 additions and 57 deletions.
diff --git a/sml/decomposition/BUILD.bazel b/sml/decomposition/BUILD.bazel
@@ -19,4 +19,5 @@ package(default_visibility = ["//visibility:public"])
 py_library(
     name = "pca",
     srcs = ["pca.py"],
+    deps = ["//sml/utils:extmath"],
 )
diff --git a/sml/decomposition/emulations/3pc.json b/sml/decomposition/emulations/3pc.json
@@ -0,0 +1,52 @@
+{
+    "id": "outsourcing.3pc",
+    "nodes": {
+        "node:0": "127.0.0.1:9920",
+        "node:1": "127.0.0.1:9921",
+        "node:2": "127.0.0.1:9922",
+        "node:3": "127.0.0.1:9923",
+        "node:4": "127.0.0.1:9924"
+    },
+    "devices": {
+        "SPU": {
+            "kind": "SPU",
+            "config": {
+                "node_ids": [
+                    "node:0",
+                    "node:1",
+                    "node:2"
+                ],
+                "spu_internal_addrs": [
+                    "127.0.0.1:9930",
+                    "127.0.0.1:9931",
+                    "127.0.0.1:9932"
+                ],
+                "experimental_data_folder": [
+                    "/tmp/spu_data_0/",
+                    "/tmp/spu_data_1/",
+                    "/tmp/spu_data_2/"
+                ],
+                "runtime_config": {
+                    "protocol": "ABY3",
+                    "field": "FM128",
+                    "fxp_fraction_bits": 30,
+                    "enable_pphlo_profile": true,
+                    "enable_hal_profile": true,
+                    "enable_pphlo_trace": false
+                }
+            }
+        },
+        "P1": {
+            "kind": "PYU",
+            "config": {
+                "node_id": "node:3"
+            }
+        },
+        "P2": {
+            "kind": "PYU",
+            "config": {
+                "node_id": "node:4"
+            }
+        }
+    }
+}
diff --git a/sml/decomposition/emulations/BUILD.bazel b/sml/decomposition/emulations/BUILD.bazel
@@ -24,3 +24,20 @@ py_binary(
         "//sml/utils:emulation",
     ],
 )
+
+py_binary(
+    name = "rsvd_pca_emul",
+    srcs = ["rsvd_pca_emul.py"],
+    data = [":conf"],
+    deps = [
+        "//sml/decomposition:pca",
+        "//sml/utils:emulation",
+    ],
+)
+
+filegroup(
+    name = "conf",
+    srcs = [
+        "3pc.json",
+    ],
+)
diff --git a/sml/decomposition/emulations/pca_emul.py b/sml/decomposition/emulations/pca_emul.py
@@ -21,13 +21,13 @@
 from sklearn.decomposition import PCA as SklearnPCA
 
 # Add the library directory to the path
-sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
+sys.path.append(os.path.join(os.path.dirname(__file__), '../../../'))
 
 import sml.utils.emulation as emulation
 from sml.decomposition.pca import PCA
 
 
-def emul_PCA(mode: emulation.Mode.MULTIPROCESS):
+def emul_powerPCA(mode: emulation.Mode.MULTIPROCESS):
     def proc(X):
         model = PCA(
             method='power_iteration',
@@ -94,4 +94,4 @@ def proc_reconstruct(X):
 
 
 if __name__ == "__main__":
-    emul_PCA(emulation.Mode.MULTIPROCESS)
+    emul_powerPCA(emulation.Mode.MULTIPROCESS)
diff --git a/sml/decomposition/emulations/rsvd_pca_emul.py b/sml/decomposition/emulations/rsvd_pca_emul.py
@@ -0,0 +1,107 @@
+# Copyright 2023 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+
+import jax.numpy as jnp
+import jax.random as random
+import numpy as np
+from sklearn.decomposition import PCA as SklearnPCA
+
+# Add the library directory to the path
+sys.path.append(os.path.join(os.path.dirname(__file__), '../../../'))
+
+import sml.utils.emulation as emulation
+from sml.decomposition.pca import PCA
+
+
+def emul_rsvdPCA(mode: emulation.Mode.MULTIPROCESS):
+    print("emul rsvdPCA.")
+
+    def proc(X, random_matrix, n_components, n_oversamples, max_power_iter, scale):
+        model = PCA(
+            method='rsvd',
+            n_components=n_components,
+            n_oversamples=n_oversamples,
+            random_matrix=random_matrix,
+            max_power_iter=max_power_iter,
+            scale=scale,
+        )
+
+        model.fit(X)
+        X_transformed = model.transform(X)
+        X_variances = model._variances
+        X_reconstructed = model.inverse_transform(X_transformed)
+
+        return X_transformed, X_variances, X_reconstructed
+
+    try:
+        # bandwidth and latency only work for docker mode
+        conf_path = "sml/decomposition/emulations/3pc.json"
+        emulator = emulation.Emulator(conf_path, mode, bandwidth=300, latency=20)
+        emulator.up()
+
+        # Create a simple dataset
+        X = random.normal(random.PRNGKey(0), (1000, 20))
+        X_spu = emulator.seal(X)
+        n_components = 5
+        n_oversamples = 10
+        max_power_iter = 300
+        scale = (10000000, 10000)
+
+        # Create random_matrix
+        random_state = np.random.RandomState(0)
+        random_matrix = random_state.normal(
+            size=(X.shape[1], n_components + n_oversamples)
+        )
+        random_matrix_spu = emulator.seal(random_matrix)
+
+        result = emulator.run(proc, static_argnums=(2, 3, 4, 5))(
+            X_spu, random_matrix_spu, n_components, n_oversamples, max_power_iter, scale
+        )
+        print("X_transformed_spu: ", result[0][:5, :])
+        print("X_variance_spu: ", result[1])
+        print("X_reconstructed_spu:", result[2][:5, :])
+
+        # The transformed data should have 2 dimensions
+        assert result[0].shape[1] == n_components
+
+        # The mean of the transformed data should be approximately 0
+        assert jnp.allclose(jnp.mean(result[0], axis=0), 0, atol=1e-3)
+
+        # Compare with sklearn
+        model = SklearnPCA(
+            n_components=n_components,
+            svd_solver="randomized",
+            power_iteration_normalizer="QR",
+            random_state=0,
+        )
+        model.fit(X)
+        X_transformed = model.transform(X)
+        X_variances = model.explained_variance_
+        X_reconstructed = model.inverse_transform(X_transformed)
+
+        print("X_transformed_sklearn: ", X_transformed[:5, :])
+        print("X_variances_sklearn: ", X_variances)
+        print("X_reconstructed_sklearn: ", X_reconstructed[:5, :])
+
+        assert np.allclose(X_reconstructed, result[2], atol=1e-1)
+
+    finally:
+        emulator.down()
+
+
+if __name__ == "__main__":
+    emul_rsvdPCA(emulation.Mode.MULTIPROCESS)