Commit

Merge branch 'paper' of https://github.com/BoevaLab/survhive into paper
dnwissel committed Jul 11, 2023
2 parents 7c66bd2 + 3cd16ab commit 70120e2
Showing 6 changed files with 713 additions and 26 deletions.
302 changes: 302 additions & 0 deletions paper/experiments/experiments_concordance.ipynb
@@ -0,0 +1,302 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "069941be",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"from survhive.cox import CoxPHElasticNet\n",
"from survhive.cv_models import CoxPHElasticNetCV, CoxPHPrecondCV\n",
"from survhive.utils import transform_survival, transform_preconditioning"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "04fc657a",
"metadata": {},
"outputs": [],
"source": [
"with open(f\"../config.json\") as f:\n",
" config = json.load(f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "343b2daf",
"metadata": {},
"outputs": [],
"source": [
"results_efron_lasso = {}\n",
"failures_efron_lasso = {}\n",
"sparsity_efron_lasso = {}\n",
"\n",
"results_efron_elastic_net = {}\n",
"failures_efron_elastic_net = {}\n",
"sparsity_efron_elastic_net = {}\n",
"\n",
"results_efron_precond = {}\n",
"failures_efron_precond = {}\n",
"sparsity_efron_precond = {}\n",
"tau_precond = {}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "77df313d",
"metadata": {},
"outputs": [],
"source": [
"pipe = make_pipeline(\n",
" StandardScaler(),\n",
" CoxPHElasticNetCV(tie_correction=\"efron\",\n",
" eps=0.1,\n",
" n_alphas=100,\n",
" l1_ratios=[1.0],\n",
" cv=5,\n",
" n_jobs=1,\n",
" random_state=config[\"random_state\"],\n",
" n_irls_iter=5,\n",
" tol=0.0001\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "615d5848",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"for cancer in config[\"datasets\"]:\n",
" print(f\"Starting: {cancer}\")\n",
" train_splits = pd.read_csv(f\"../data/splits/TCGA/{cancer}_train_splits.csv\")\n",
" test_splits = pd.read_csv(f\"../data/splits/TCGA/{cancer}_test_splits.csv\")\n",
" data = pd.read_csv(f\"../data/processed/TCGA/{cancer}_data_preprocessed.csv\").iloc[:, 1:]\n",
" X_ = data.iloc[:, 3:]\n",
" y_ = transform_survival(time=data[\"OS_days\"].values, event=data[\"OS\"].values)\n",
" for split in range(25):\n",
" print(f\"Starting split: {split+1} / 25\")\n",
" train_ix = train_splits.iloc[split, :].dropna().to_numpy().astype(int)\n",
" test_ix = test_splits.iloc[split, :].dropna().to_numpy().astype(int)\n",
" X_train = X_.iloc[train_ix, :].copy().reset_index(drop=True)\n",
" y_train = y_[train_ix].copy()\n",
" X_test = X_.iloc[test_ix, :].copy().reset_index(drop=True)\n",
" if split == 0:\n",
" results_efron_lasso[cancer] = {}\n",
" sparsity_efron_lasso[cancer] = {}\n",
" failures_efron_lasso[cancer] = 0\n",
" try:\n",
" pipe.fit(X_train, y_train)\n",
" sparsity_efron_lasso[cancer][split] = np.sum(pipe[1].coef_ != 0)\n",
" results_efron_lasso[cancer][split] = pipe.predict(X_test)\n",
" except ValueError as e:\n",
" failures_efron_lasso[cancer] += 1\n",
" results_efron_lasso[cancer][split] = np.zeros(test_ix.shape[0])\n",
" sparsity_efron_lasso[cancer][split] = 0\n",
" \n",
" pd.concat([pd.DataFrame(results_efron_lasso[cancer][i]) for i in range(25)], axis=1).to_csv(\n",
" f\"../results/efron_lasso_{cancer}.csv\", index=False\n",
" )\n",
" \n",
"pd.DataFrame(sparsity_efron_lasso).to_csv(\n",
" f\"../results/efron_lasso_sparsity.csv\", index=False\n",
")\n",
"pd.DataFrame(failures_efron_lasso).to_csv(\n",
" f\"../results/efron_lasso_failures.csv\", index=False\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9fc9eb25",
"metadata": {},
"outputs": [],
"source": [
"pipe = make_pipeline(\n",
" StandardScaler(),\n",
" CoxPHElasticNetCV(tie_correction=\"efron\",\n",
" eps=0.1,\n",
" n_alphas=100,\n",
" l1_ratios=[.1, .5, .7, .9, .95, .99, 1],\n",
" cv=5,\n",
" n_jobs=-1,\n",
" random_state=config[\"random_state\"],\n",
" n_irls_iter=5,\n",
" tol=0.0001\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "96188f35",
"metadata": {},
"outputs": [],
"source": [
"for cancer in config[\"datasets\"]:\n",
" print(f\"Starting: {cancer}\")\n",
" train_splits = pd.read_csv(f\"../data/splits/TCGA/{cancer}_train_splits.csv\")\n",
" test_splits = pd.read_csv(f\"../data/splits/TCGA/{cancer}_test_splits.csv\")\n",
" data = pd.read_csv(f\"../data/processed/TCGA/{cancer}_data_preprocessed.csv\").iloc[:, 1:]\n",
" X_ = data.iloc[:, 3:]\n",
" y_ = transform_survival(time=data[\"OS_days\"].values, event=data[\"OS\"].values)\n",
" for split in range(25):\n",
" print(f\"Starting split: {split+1} / 25\")\n",
" train_ix = train_splits.iloc[split, :].dropna().to_numpy().astype(int)\n",
" test_ix = test_splits.iloc[split, :].dropna().to_numpy().astype(int)\n",
" X_train = X_.iloc[train_ix, :].copy().reset_index(drop=True)\n",
" y_train = y_[train_ix].copy()\n",
" X_test = X_.iloc[test_ix, :].copy().reset_index(drop=True)\n",
" if split == 0:\n",
" results_efron_elastic_net[cancer] = {}\n",
" sparsity_efron_elastic_net[cancer] = {}\n",
" failures_efron_elastic_net[cancer] = 0\n",
" try:\n",
" pipe.fit(X_train, y_train)\n",
" sparsity_efron_elastic_net[cancer][split] = np.sum(pipe[1].coef_ != 0)\n",
" results_efron_elastic_net[cancer][split] = pipe.predict(X_test)\n",
" except ValueError as e:\n",
" failures_efron_elastic_net[cancer] += 1\n",
" results_efron_elastic_net[cancer][split] = np.zeros(test_ix.shape[0])\n",
" sparsity_efron_elastic_net[cancer][split] = 0\n",
" \n",
" pd.concat([pd.DataFrame(results_efron_elastic_net[cancer][i]) for i in range(25)], axis=1).to_csv(\n",
" f\"../results/efron_elastic_net_{cancer}.csv\", index=False\n",
" )\n",
" \n",
"pd.DataFrame(sparsity_efron_elastic_net).to_csv(\n",
" f\"../results/efron_elastic_net_sparsity.csv\", index=False\n",
")\n",
"pd.DataFrame(failures_efron_elastic_net).to_csv(\n",
" f\"../results/efron_elastic_net_failures.csv\", index=False\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f9f745e4",
"metadata": {},
"outputs": [],
"source": [
"pipe = make_pipeline(\n",
" StandardScaler(),\n",
" CoxPHPrecondCV(tie_correction=\"efron\",\n",
" eps=0.1,\n",
" n_alphas=100,\n",
" taus=[0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],\n",
" cv=5,\n",
" n_jobs=-1,\n",
" random_state=config[\"random_state\"],\n",
" maxiter=1000,\n",
" rtol=1e-6,\n",
" verbose=0,\n",
" default_step_size=1.0\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c236ee2",
"metadata": {},
"outputs": [],
"source": [
"for cancer in config[\"datasets\"]:\n",
" print(f\"Starting: {cancer}\")\n",
" train_splits = pd.read_csv(f\"../data/splits/TCGA/{cancer}_train_splits.csv\")\n",
" train_predictions = pd.read_csv(f\"../results/teacher/efron_{cancer}.csv\")\n",
" test_splits = pd.read_csv(f\"../data/splits/TCGA/{cancer}_test_splits.csv\")\n",
" data = pd.read_csv(f\"../data/processed/TCGA/{cancer}_data_preprocessed.csv\").iloc[:, 1:]\n",
" X_ = data.iloc[:, 3:]\n",
" for split in range(25):\n",
" print(f\"Starting split: {split+1} / 25\")\n",
" train_ix = train_splits.iloc[split, :].dropna().to_numpy().astype(int)\n",
" test_ix = test_splits.iloc[split, :].dropna().to_numpy().astype(int)\n",
" X_train = X_.iloc[train_ix, :].copy().reset_index(drop=True)\n",
" y_train = transform_preconditioning(\n",
" time=data[\"OS_days\"].values[train_ix],\n",
" event=data[\"OS\"].values[train_ix],\n",
" y_teacher=train_predictions.iloc[:, split].dropna().values\n",
" )\n",
" X_test = X_.iloc[test_ix, :].copy().reset_index(drop=True)\n",
" if split == 0:\n",
" results_efron_precond[cancer] = {}\n",
" sparsity_efron_precond[cancer] = {}\n",
" failures_efron_precond[cancer] = 0\n",
" tau_precond[cancer] = {}\n",
" try:\n",
" pipe.fit(X_train, y_train)\n",
" sparsity_efron_precond[cancer][split] = np.sum(pipe[1].coef_ != 0)\n",
" results_efron_precond[cancer][split] = pipe.predict(X_test)\n",
" tau_precond[cancer][split] = pipe[1].tau\n",
" except ValueError as e:\n",
" failures_efron_precond[cancer] += 1\n",
" results_efron_precond[cancer][split] = np.zeros(test_ix.shape[0])\n",
" sparsity_efron_precond[cancer][split] = 0\n",
" \n",
" pd.concat([pd.DataFrame(results_efron_precond[cancer][i]) for i in range(25)], axis=1).to_csv(\n",
" f\"../results/efron_precond_{cancer}.csv\", index=False\n",
" )\n",
" \n",
"pd.DataFrame(sparsity_efron_precond).to_csv(\n",
" f\"../results/efron_precond_sparsity.csv\", index=False\n",
")\n",
"pd.DataFrame(failures_efron_precond).to_csv(\n",
" f\"../results/efron_precond_failures.csv\", index=False\n",
")\n",
"\n",
"pd.DataFrame(tau_precond).to_csv(\n",
" f\"../results/efron_precond_taus.csv\", index=False\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5ebd1ba1",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}