CAST-genomics · aryarm · Dec 30, 2022 · Dec 30, 2022 · Dec 30, 2022
diff --git a/haptools/__main__.py b/haptools/__main__.py
@@ -258,6 +258,13 @@ def simgenotype(
     show_default="quantitative trait",
     help="Disease prevalence if simulating a case-control trait",
 )
+@click.option(
+    "--normalize/--no-normalize",
+    is_flag=True,
+    default=True,
+    show_default=True,
+    help="Whether to normalize the genotypes before using them for simulation",
+)
 @click.option(
     "--region",
     type=str,
@@ -344,6 +351,7 @@ def simphenotype(
     replications: int = 1,
     heritability: float = None,
     prevalence: float = None,
+    normalize: bool = True,
     region: str = None,
     samples: tuple[str] = tuple(),
     samples_file: Path = None,
@@ -396,6 +404,7 @@ def simphenotype(
         replications,
         heritability,
         prevalence,
+        normalize,
         region,
         samples,
         ids,
@@ -738,6 +747,7 @@ def ld(
     "--sort/--no-sort",
     is_flag=True,
     default=True,
+    show_default=True,
     help="Sorting of the file will not be performed",
 )
 @click.option(

diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py
@@ -97,6 +97,7 @@ def run(
         effects: list[Haplotype],
         heritability: float = None,
         prevalence: float = None,
+        normalize: bool = True,
     ) -> npt.NDArray:
         """
         Simulate phenotypes for an entry in the Genotypes object
@@ -117,6 +118,9 @@ def run(
 
             If this value is specified, case/control phenotypes will be generated
             instead of quantitative traits.
+        normalize: bool, optional
+            If True, normalize the genotypes before using them to simulate the
+            phenotypes. Otherwise, use the raw values.
 
         Returns
         -------
@@ -132,21 +136,22 @@ def run(
         gts = self.gens.subset(variants=ids).data[:, :, :2].sum(axis=2)
         self.log.info(f"Computing genetic component w/ {gts.shape[1]} causal effects")
         # standardize the genotypes
-        std = gts.std(axis=0)
-        gts = (gts - gts.mean(axis=0)) / std
-        # for genotypes where the stdev is 0, just set all values to 0 instead of nan
-        zero_elements = std == 0
-        num_zero_elements = np.sum(zero_elements)
-        if num_zero_elements:
-            # get the first five causal variables with variances == 0
-            zero_elements_ids = np.array(ids)[zero_elements]
-            if len(zero_elements_ids) > 5:
-                zero_elements_ids = zero_elements_ids[:5]
-            self.log.warning(
-                "Some of your causal variables have genotypes with variance 0. Here "
-                f"are the first few few: {zero_elements_ids}"
-            )
-        gts[:, zero_elements] = np.zeros((gts.shape[0], num_zero_elements))
+        if normalize:
+            std = gts.std(axis=0)
+            gts = (gts - gts.mean(axis=0)) / std
+            # when the stdev is 0, just set all values to 0 instead of nan
+            zero_elements = std == 0
+            num_zero_elements = np.sum(zero_elements)
+            if num_zero_elements:
+                # get the first five causal variables with variances == 0
+                zero_elements_ids = np.array(ids)[zero_elements]
+                if len(zero_elements_ids) > 5:
+                    zero_elements_ids = zero_elements_ids[:5]
+                self.log.warning(
+                    "Some of your causal variables have genotypes with variance 0. "
+                    f"Here are the first few few: {zero_elements_ids}"
+                )
+            gts[:, zero_elements] = np.zeros((gts.shape[0], num_zero_elements))
         # generate the genetic component
         pt = (betas * gts).sum(axis=1)
         # compute the heritability
@@ -209,6 +214,7 @@ def simulate_pt(
     num_replications: int = 1,
     heritability: float = None,
     prevalence: float = None,
+    normalize: bool = True,
     region: str = None,
     samples: list[str] = None,
     haplotype_ids: set[str] = None,
@@ -247,6 +253,9 @@ def simulate_pt(
         must be a float between 0 and 1
 
         If not provided, a quantitative trait will be simulated, instead
+    normalize: bool, optional
+        If True, normalize the genotypes before using them to simulate the phenotypes.
+        Otherwise, use the raw values.
     region : str, optional
         The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'
 
@@ -312,6 +321,6 @@ def simulate_pt(
     log.info("Simulating phenotypes")
     pt_sim = PhenoSimulator(gt, output=output, log=log)
     for i in range(num_replications):
-        pt_sim.run(hp.data.values(), heritability, prevalence)
+        pt_sim.run(hp.data.values(), heritability, prevalence, normalize)
     log.info("Writing phenotypes")
     pt_sim.write()
diff --git a/tests/test_simphenotype.py b/tests/test_simphenotype.py
@@ -253,6 +253,23 @@ def test_case_control(self):
         diff1 = (phens.data[:, 3] == phens.data[:, 0]).sum()
         assert diff1 > 0
 
+    def test_one_hap_zero_noise_no_normalize(self):
+        gts = self._get_fake_gens()
+        hps = self._get_fake_haps()
+        expected = self._get_expected_phens()
+
+        pt_sim = PhenoSimulator(gts, seed=42)
+        data = pt_sim.run([hps[0]], heritability=1, normalize=False)
+        data = data[:, np.newaxis]
+        phens = pt_sim.phens
+
+        # check the data and the generated phenotype object
+        assert phens.data.shape == (5, 1)
+        np.testing.assert_allclose(phens.data, data)
+        np.testing.assert_allclose(data, np.array([0.25, 0.25, 0, 0.5, 0])[:, None])
+        assert phens.samples == expected.samples
+        assert phens.names[0] == expected.names[0]
+
 
 class TestSimPhenotypeCLI:
     def _get_transform_stdin(self):
@@ -436,3 +453,18 @@ def test_complex(self, capfd):
 
         tmp_file.unlink()
         tmp_tsfm.unlink()
+
+    def test_no_normalize(self, capfd):
+        # first, create a temporary file containing the output of transform
+        tmp_transform = Path("temp-transform.vcf")
+        with open(tmp_transform, "w") as file:
+            file.write(self._get_transform_stdin())
+
+        cmd = f"simphenotype --no-normalize {tmp_transform} tests/data/simple.hap"
+        runner = CliRunner()
+        result = runner.invoke(main, cmd.split(" "), catch_exceptions=False)
+        captured = capfd.readouterr()
+        assert captured.out
+        assert result.exit_code == 0
+
+        tmp_transform.unlink()