Skip to content

Commit

Permalink
feat: a new --no-normalize parameter for simphenotype (#156)
Browse files Browse the repository at this point in the history
  • Loading branch information
aryarm authored Dec 30, 2022
1 parent 591a2da commit 24a0867
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 16 deletions.
10 changes: 10 additions & 0 deletions haptools/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,13 @@ def simgenotype(
show_default="quantitative trait",
help="Disease prevalence if simulating a case-control trait",
)
@click.option(
"--normalize/--no-normalize",
is_flag=True,
default=True,
show_default=True,
help="Whether to normalize the genotypes before using them for simulation",
)
@click.option(
"--region",
type=str,
Expand Down Expand Up @@ -344,6 +351,7 @@ def simphenotype(
replications: int = 1,
heritability: float = None,
prevalence: float = None,
normalize: bool = True,
region: str = None,
samples: tuple[str] = tuple(),
samples_file: Path = None,
Expand Down Expand Up @@ -396,6 +404,7 @@ def simphenotype(
replications,
heritability,
prevalence,
normalize,
region,
samples,
ids,
Expand Down Expand Up @@ -738,6 +747,7 @@ def ld(
"--sort/--no-sort",
is_flag=True,
default=True,
show_default=True,
help="Sorting of the file will not be performed",
)
@click.option(
Expand Down
41 changes: 25 additions & 16 deletions haptools/sim_phenotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def run(
effects: list[Haplotype],
heritability: float = None,
prevalence: float = None,
normalize: bool = True,
) -> npt.NDArray:
"""
Simulate phenotypes for an entry in the Genotypes object
Expand All @@ -117,6 +118,9 @@ def run(
If this value is specified, case/control phenotypes will be generated
instead of quantitative traits.
normalize: bool, optional
If True, normalize the genotypes before using them to simulate the
phenotypes. Otherwise, use the raw values.
Returns
-------
Expand All @@ -132,21 +136,22 @@ def run(
gts = self.gens.subset(variants=ids).data[:, :, :2].sum(axis=2)
self.log.info(f"Computing genetic component w/ {gts.shape[1]} causal effects")
# standardize the genotypes
std = gts.std(axis=0)
gts = (gts - gts.mean(axis=0)) / std
# for genotypes where the stdev is 0, just set all values to 0 instead of nan
zero_elements = std == 0
num_zero_elements = np.sum(zero_elements)
if num_zero_elements:
# get the first five causal variables with variances == 0
zero_elements_ids = np.array(ids)[zero_elements]
if len(zero_elements_ids) > 5:
zero_elements_ids = zero_elements_ids[:5]
self.log.warning(
"Some of your causal variables have genotypes with variance 0. Here "
f"are the first few few: {zero_elements_ids}"
)
gts[:, zero_elements] = np.zeros((gts.shape[0], num_zero_elements))
if normalize:
std = gts.std(axis=0)
gts = (gts - gts.mean(axis=0)) / std
# when the stdev is 0, just set all values to 0 instead of nan
zero_elements = std == 0
num_zero_elements = np.sum(zero_elements)
if num_zero_elements:
# get the first five causal variables with variances == 0
zero_elements_ids = np.array(ids)[zero_elements]
if len(zero_elements_ids) > 5:
zero_elements_ids = zero_elements_ids[:5]
self.log.warning(
"Some of your causal variables have genotypes with variance 0. "
f"Here are the first few few: {zero_elements_ids}"
)
gts[:, zero_elements] = np.zeros((gts.shape[0], num_zero_elements))
# generate the genetic component
pt = (betas * gts).sum(axis=1)
# compute the heritability
Expand Down Expand Up @@ -209,6 +214,7 @@ def simulate_pt(
num_replications: int = 1,
heritability: float = None,
prevalence: float = None,
normalize: bool = True,
region: str = None,
samples: list[str] = None,
haplotype_ids: set[str] = None,
Expand Down Expand Up @@ -247,6 +253,9 @@ def simulate_pt(
must be a float between 0 and 1
If not provided, a quantitative trait will be simulated, instead
normalize: bool, optional
If True, normalize the genotypes before using them to simulate the phenotypes.
Otherwise, use the raw values.
region : str, optional
The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'
Expand Down Expand Up @@ -312,6 +321,6 @@ def simulate_pt(
log.info("Simulating phenotypes")
pt_sim = PhenoSimulator(gt, output=output, log=log)
for i in range(num_replications):
pt_sim.run(hp.data.values(), heritability, prevalence)
pt_sim.run(hp.data.values(), heritability, prevalence, normalize)
log.info("Writing phenotypes")
pt_sim.write()
32 changes: 32 additions & 0 deletions tests/test_simphenotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,23 @@ def test_case_control(self):
diff1 = (phens.data[:, 3] == phens.data[:, 0]).sum()
assert diff1 > 0

def test_one_hap_zero_noise_no_normalize(self):
gts = self._get_fake_gens()
hps = self._get_fake_haps()
expected = self._get_expected_phens()

pt_sim = PhenoSimulator(gts, seed=42)
data = pt_sim.run([hps[0]], heritability=1, normalize=False)
data = data[:, np.newaxis]
phens = pt_sim.phens

# check the data and the generated phenotype object
assert phens.data.shape == (5, 1)
np.testing.assert_allclose(phens.data, data)
np.testing.assert_allclose(data, np.array([0.25, 0.25, 0, 0.5, 0])[:, None])
assert phens.samples == expected.samples
assert phens.names[0] == expected.names[0]


class TestSimPhenotypeCLI:
def _get_transform_stdin(self):
Expand Down Expand Up @@ -436,3 +453,18 @@ def test_complex(self, capfd):

tmp_file.unlink()
tmp_tsfm.unlink()

def test_no_normalize(self, capfd):
# first, create a temporary file containing the output of transform
tmp_transform = Path("temp-transform.vcf")
with open(tmp_transform, "w") as file:
file.write(self._get_transform_stdin())

cmd = f"simphenotype --no-normalize {tmp_transform} tests/data/simple.hap"
runner = CliRunner()
result = runner.invoke(main, cmd.split(" "), catch_exceptions=False)
captured = capfd.readouterr()
assert captured.out
assert result.exit_code == 0

tmp_transform.unlink()

0 comments on commit 24a0867

Please sign in to comment.