Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: a new --no-normalize parameter for simphenotype #156

Merged
merged 2 commits into from
Dec 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions haptools/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,13 @@ def simgenotype(
show_default="quantitative trait",
help="Disease prevalence if simulating a case-control trait",
)
@click.option(
"--normalize/--no-normalize",
is_flag=True,
default=True,
show_default=True,
help="Whether to normalize the genotypes before using them for simulation",
)
@click.option(
"--region",
type=str,
Expand Down Expand Up @@ -344,6 +351,7 @@ def simphenotype(
replications: int = 1,
heritability: float = None,
prevalence: float = None,
normalize: bool = True,
region: str = None,
samples: tuple[str] = tuple(),
samples_file: Path = None,
Expand Down Expand Up @@ -396,6 +404,7 @@ def simphenotype(
replications,
heritability,
prevalence,
normalize,
region,
samples,
ids,
Expand Down Expand Up @@ -738,6 +747,7 @@ def ld(
"--sort/--no-sort",
is_flag=True,
default=True,
show_default=True,
help="Sorting of the file will not be performed",
)
@click.option(
Expand Down
41 changes: 25 additions & 16 deletions haptools/sim_phenotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def run(
effects: list[Haplotype],
heritability: float = None,
prevalence: float = None,
normalize: bool = True,
) -> npt.NDArray:
"""
Simulate phenotypes for an entry in the Genotypes object
Expand All @@ -117,6 +118,9 @@ def run(

If this value is specified, case/control phenotypes will be generated
instead of quantitative traits.
normalize: bool, optional
If True, normalize the genotypes before using them to simulate the
phenotypes. Otherwise, use the raw values.

Returns
-------
Expand All @@ -132,21 +136,22 @@ def run(
gts = self.gens.subset(variants=ids).data[:, :, :2].sum(axis=2)
self.log.info(f"Computing genetic component w/ {gts.shape[1]} causal effects")
# standardize the genotypes
std = gts.std(axis=0)
gts = (gts - gts.mean(axis=0)) / std
# for genotypes where the stdev is 0, just set all values to 0 instead of nan
zero_elements = std == 0
num_zero_elements = np.sum(zero_elements)
if num_zero_elements:
# get the first five causal variables with variances == 0
zero_elements_ids = np.array(ids)[zero_elements]
if len(zero_elements_ids) > 5:
zero_elements_ids = zero_elements_ids[:5]
self.log.warning(
"Some of your causal variables have genotypes with variance 0. Here "
f"are the first few few: {zero_elements_ids}"
)
gts[:, zero_elements] = np.zeros((gts.shape[0], num_zero_elements))
if normalize:
std = gts.std(axis=0)
gts = (gts - gts.mean(axis=0)) / std
# when the stdev is 0, just set all values to 0 instead of nan
zero_elements = std == 0
num_zero_elements = np.sum(zero_elements)
if num_zero_elements:
# get the first five causal variables with variances == 0
zero_elements_ids = np.array(ids)[zero_elements]
if len(zero_elements_ids) > 5:
zero_elements_ids = zero_elements_ids[:5]
self.log.warning(
"Some of your causal variables have genotypes with variance 0. "
f"Here are the first few few: {zero_elements_ids}"
)
gts[:, zero_elements] = np.zeros((gts.shape[0], num_zero_elements))
# generate the genetic component
pt = (betas * gts).sum(axis=1)
# compute the heritability
Expand Down Expand Up @@ -209,6 +214,7 @@ def simulate_pt(
num_replications: int = 1,
heritability: float = None,
prevalence: float = None,
normalize: bool = True,
region: str = None,
samples: list[str] = None,
haplotype_ids: set[str] = None,
Expand Down Expand Up @@ -247,6 +253,9 @@ def simulate_pt(
must be a float between 0 and 1

If not provided, a quantitative trait will be simulated, instead
normalize: bool, optional
If True, normalize the genotypes before using them to simulate the phenotypes.
Otherwise, use the raw values.
region : str, optional
The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'

Expand Down Expand Up @@ -312,6 +321,6 @@ def simulate_pt(
log.info("Simulating phenotypes")
pt_sim = PhenoSimulator(gt, output=output, log=log)
for i in range(num_replications):
pt_sim.run(hp.data.values(), heritability, prevalence)
pt_sim.run(hp.data.values(), heritability, prevalence, normalize)
log.info("Writing phenotypes")
pt_sim.write()
32 changes: 32 additions & 0 deletions tests/test_simphenotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,23 @@ def test_case_control(self):
diff1 = (phens.data[:, 3] == phens.data[:, 0]).sum()
assert diff1 > 0

def test_one_hap_zero_noise_no_normalize(self):
gts = self._get_fake_gens()
hps = self._get_fake_haps()
expected = self._get_expected_phens()

pt_sim = PhenoSimulator(gts, seed=42)
data = pt_sim.run([hps[0]], heritability=1, normalize=False)
data = data[:, np.newaxis]
phens = pt_sim.phens

# check the data and the generated phenotype object
assert phens.data.shape == (5, 1)
np.testing.assert_allclose(phens.data, data)
np.testing.assert_allclose(data, np.array([0.25, 0.25, 0, 0.5, 0])[:, None])
assert phens.samples == expected.samples
assert phens.names[0] == expected.names[0]


class TestSimPhenotypeCLI:
def _get_transform_stdin(self):
Expand Down Expand Up @@ -436,3 +453,18 @@ def test_complex(self, capfd):

tmp_file.unlink()
tmp_tsfm.unlink()

def test_no_normalize(self, capfd):
# first, create a temporary file containing the output of transform
tmp_transform = Path("temp-transform.vcf")
with open(tmp_transform, "w") as file:
file.write(self._get_transform_stdin())

cmd = f"simphenotype --no-normalize {tmp_transform} tests/data/simple.hap"
runner = CliRunner()
result = runner.invoke(main, cmd.split(" "), catch_exceptions=False)
captured = capfd.readouterr()
assert captured.out
assert result.exit_code == 0

tmp_transform.unlink()