From 135392944fe1e893fffc96feb0b1ed78747b7530 Mon Sep 17 00:00:00 2001 From: Alejandro Velez-Arce Date: Sat, 19 Oct 2024 21:16:10 -0400 Subject: [PATCH 1/8] init commit --- environment.yml | 4 +- tdc/__init__.py | 2 +- tdc/metadata.py | 6 ++ tdc/model_server/__init__.py | 0 tdc/{ => model_server}/tdc_hf.py | 0 tdc/model_server/tokenizers/__init__.py | 0 tdc/model_server/tokenizers/geneformer.py | 109 ++++++++++++++++++++++ tdc/test/test_model_server.py | 57 +++++++++++ 8 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 tdc/model_server/__init__.py rename tdc/{ => model_server}/tdc_hf.py (100%) create mode 100644 tdc/model_server/tokenizers/__init__.py create mode 100644 tdc/model_server/tokenizers/geneformer.py create mode 100644 tdc/test/test_model_server.py diff --git a/environment.yml b/environment.yml index 653bf06d..6241c518 100644 --- a/environment.yml +++ b/environment.yml @@ -12,7 +12,7 @@ dependencies: - mygene=3.2.2 - numpy=1.26.4 - openpyxl=3.0.10 - - python=3.9.13 + - python=3.10 - pip=23.3.1 - pandas=2.1.4 - requests=2.31.0 @@ -43,3 +43,5 @@ dependencies: variables: KMP_DUPLICATE_LIB_OK: "TRUE" + +# install geneformer via script https://github.com/jkobject/geneformer/tree/main diff --git a/tdc/__init__.py b/tdc/__init__.py index 3fc6b385..cf9968e6 100644 --- a/tdc/__init__.py +++ b/tdc/__init__.py @@ -1,5 +1,5 @@ from .evaluator import Evaluator from .oracles import Oracle from .benchmark_deprecated import BenchmarkGroup -from .tdc_hf import tdc_hf_interface +from .model_server.tdc_hf import tdc_hf_interface from tdc.utils.knowledge_graph import KnowledgeGraph \ No newline at end of file diff --git a/tdc/metadata.py b/tdc/metadata.py index 24019db2..c0d7a234 100644 --- a/tdc/metadata.py +++ b/tdc/metadata.py @@ -937,6 +937,9 @@ def get_task2category(): "pinnacle_output8": "zip", "pinnacle_output9": "zip", "pinnacle_output10": "zip", + "geneformer_gene_median_dictionary": "pkl", + "geneformer_gene_name_id_dict": "pkl", + "geneformer_token_dictionary": "pkl", } name2id = { @@ -1124,6 +1127,9 @@ def get_task2category(): "pinnacle_output8": 10431074, "pinnacle_output9": 10431075, "pinnacle_output10": 10431081, + "geneformer_gene_median_dictionary": 10626278, + "geneformer_gene_name_id_dict": 10626276, + "geneformer_token_dictionary": 10626277, } oracle2type = { diff --git a/tdc/model_server/__init__.py b/tdc/model_server/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tdc/tdc_hf.py b/tdc/model_server/tdc_hf.py similarity index 100% rename from tdc/tdc_hf.py rename to tdc/model_server/tdc_hf.py diff --git a/tdc/model_server/tokenizers/__init__.py b/tdc/model_server/tokenizers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tdc/model_server/tokenizers/geneformer.py b/tdc/model_server/tokenizers/geneformer.py new file mode 100644 index 00000000..6ffbd265 --- /dev/null +++ b/tdc/model_server/tokenizers/geneformer.py @@ -0,0 +1,109 @@ +import numpy as np +import scipy.sparse as sp + +from geneformer import TranscriptomeTokenizer +from ...utils.load import pd_load, download_wrapper + +class GeneformerTokenizer(TranscriptomeTokenizer): + """ + Uses Geneformer Utils to parse zero-shot model server requests for tokenizing single-cell gene expression data. + + Geneformer tokenizer source code: https://github.com/jkobject/geneformer/blob/main/geneformer/tokenizer.py + """ + + def __init__(self, path=None, custom_attr_name_dict=None, nproc=1,): + path = path or "./data" + download_wrapper("geneformer_gene_median_dictionary", path, ["geneformer_gene_median_dictionary"]) + download_wrapper("geneformer_gene_name_id_dict", path, ["geneformer_gene_name_id_dict"]) + download_wrapper("geneformer_token_dictionary", path, ["geneformer_token_dictionary"]) + self.gene_median_dict = pd_load("geneformer_gene_median_dictionary", path=path) + self.gene_name_id_dict = pd_load("geneformer_gene_name_id_dict", path=path) + self.gene_token_dict = pd_load("geneformer_token_dictionary", path=path) + self.custom_attr_name_dict = custom_attr_name_dict + self.nproc = nproc + + # gene keys for full vocabulary + self.gene_keys = list(self.gene_median_dict.keys()) + + # protein-coding and miRNA gene list dictionary for selecting .loom rows for tokenization + self.genelist_dict = dict(zip(self.gene_keys, [True] * len(self.gene_keys))) + + @classmethod + def rank_genes(gene_vector, gene_tokens): + """ + Rank gene expression vector. + """ + # sort by median-scaled gene values + sorted_indices = np.argsort(-gene_vector) + return gene_tokens[sorted_indices] + + def tokenize_cell_vectors(self, cell_vector_adata, target_sum=10_000, chunk_size=512, ensembl_id="ensembl_id"): + """ + Tokenizing single-cell gene expression vectors formatted as anndata types. + + """ + adata = cell_vector_adata + if self.custom_attr_name_dict is not None: + file_cell_metadata = { + attr_key: [] for attr_key in self.custom_attr_name_dict.keys() + } + + coding_miRNA_loc = np.where( + [self.genelist_dict.get(i, False) for i in adata.var[ensembl_id]] + )[0] + norm_factor_vector = np.array( + [ + self.gene_median_dict[i] + for i in adata.var[ensembl_id][coding_miRNA_loc] + ] + ) + coding_miRNA_ids = adata.var[ensembl_id][coding_miRNA_loc] + coding_miRNA_tokens = np.array( + [self.gene_token_dict[i] for i in coding_miRNA_ids] + ) + + try: + _ = adata.obs["filter_pass"] + except KeyError: + var_exists = False + else: + var_exists = True + + if var_exists: + filter_pass_loc = np.where( + [i == 1 for i in adata.obs["filter_pass"]] + )[0] + elif not var_exists: + print( + f"The anndata object has no column attribute 'filter_pass'; tokenizing all cells." + ) + filter_pass_loc = np.array([i for i in range(adata.shape[0])]) + + tokenized_cells = [] + + for i in range(0, len(filter_pass_loc), chunk_size): + idx = filter_pass_loc[i:i+chunk_size] + + print(adata[idx].obs.columns) + + n_counts = adata[idx].obs['ncounts'].values[:, None] + X_view = adata[idx, coding_miRNA_loc].X + X_norm = (X_view / n_counts * target_sum / norm_factor_vector) + # print(type(adata[idx].X)) + # X_norm = adata[idx].X["normalized"] + # X_norm = sp.csr_matrix(X_norm) + + tokenized_cells += [ + self.rank_genes(X_norm[i].data, coding_miRNA_tokens[X_norm[i].indices]) + for i in range(X_norm.shape[0]) + ] + + # add custom attributes for subview to dict + if self.custom_attr_name_dict is not None: + for k in file_cell_metadata.keys(): + file_cell_metadata[k] += adata[idx].obs[k].tolist() + else: + file_cell_metadata = None + + return tokenized_cells, file_cell_metadata + diff --git a/tdc/test/test_model_server.py b/tdc/test/test_model_server.py new file mode 100644 index 00000000..bff2a04e --- /dev/null +++ b/tdc/test/test_model_server.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- + +import os +import sys + +import unittest +import shutil +import pytest + +# temporary solution for relative imports in case TDC is not installed +# if TDC is installed, no need to use the following line +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) +# TODO: add verification for the generation other than simple integration + +from tdc.resource import cellxgene_census +from tdc.model_server.tokenizers.geneformer import GeneformerTokenizer + +class TestModelServer(unittest.TestCase): + + def setUp(self): + print(os.getcwd()) + self.resource = cellxgene_census.CensusResource() + + def testGeneformerTokenizer(self): + # genes = ['ENSG00000161798', 'ENSG00000188229'] + # cell_types = ['mucus secreting cell', 'neuroendocrine cell'] + # obs_cols = ["dataset_id", "assay", "suspension_type", "sex", "tissue_general", "tissue", "cell_type", "ncounts"] + # adata = self.resource.gget_czi_cellxgene( + # ensembl=True, + # gene=genes, + # cell_type=cell_types, + # column_names=obs_cols, + # ) + # TODO: scperturb is using chembl, NOT ensembl. geneformer assumes ensembl. can fix by going back to cellxgene and not normalizing + from tdc.multi_pred.perturboutcome import PerturbOutcome + test_loader = PerturbOutcome( + name="scperturb_drug_AissaBenevolenskaya2021") + adata = test_loader.adata + print(type(adata.var)) + print(adata.var.columns) + print(type(adata.obs)) + print(adata.obs.columns) + print("initializing tokenizer") + tokenizer = GeneformerTokenizer() + print("testing tokenizer") + x = tokenizer.tokenize_cell_vectors(adata) + assert x + + def tearDown(self): + try: + print(os.getcwd()) + shutil.rmtree(os.path.join(os.getcwd(), "data")) + except: + pass + + \ No newline at end of file From 3ba0f6ce2bcedd42a65f70bb2d30582cdfe813da Mon Sep 17 00:00:00 2001 From: Alejandro Velez-Arce Date: Sun, 20 Oct 2024 10:47:39 -0400 Subject: [PATCH 2/8] geneformer tokenizer passes unittest --- tdc/model_server/tokenizers/geneformer.py | 8 +-- tdc/test/test_model_server.py | 77 ++++++++++++++++++++--- 2 files changed, 69 insertions(+), 16 deletions(-) diff --git a/tdc/model_server/tokenizers/geneformer.py b/tdc/model_server/tokenizers/geneformer.py index 6ffbd265..640d334d 100644 --- a/tdc/model_server/tokenizers/geneformer.py +++ b/tdc/model_server/tokenizers/geneformer.py @@ -29,7 +29,7 @@ def __init__(self, path=None, custom_attr_name_dict=None, nproc=1,): self.genelist_dict = dict(zip(self.gene_keys, [True] * len(self.gene_keys))) @classmethod - def rank_genes(gene_vector, gene_tokens): + def rank_genes(cls, gene_vector, gene_tokens): """ Rank gene expression vector. """ @@ -84,14 +84,10 @@ def tokenize_cell_vectors(self, cell_vector_adata, target_sum=10_000, chunk_size for i in range(0, len(filter_pass_loc), chunk_size): idx = filter_pass_loc[i:i+chunk_size] - print(adata[idx].obs.columns) - n_counts = adata[idx].obs['ncounts'].values[:, None] X_view = adata[idx, coding_miRNA_loc].X X_norm = (X_view / n_counts * target_sum / norm_factor_vector) - # print(type(adata[idx].X)) - # X_norm = adata[idx].X["normalized"] - # X_norm = sp.csr_matrix(X_norm) + X_norm = sp.csr_matrix(X_norm) tokenized_cells += [ self.rank_genes(X_norm[i].data, coding_miRNA_tokens[X_norm[i].indices]) diff --git a/tdc/test/test_model_server.py b/tdc/test/test_model_server.py index bff2a04e..19aedf94 100644 --- a/tdc/test/test_model_server.py +++ b/tdc/test/test_model_server.py @@ -16,6 +16,62 @@ from tdc.resource import cellxgene_census from tdc.model_server.tokenizers.geneformer import GeneformerTokenizer + +import requests + +def get_target_from_chembl(chembl_id): + # Query ChEMBL API for target information + chembl_url = f"https://www.ebi.ac.uk/chembl/api/data/target/{chembl_id}.json" + response = requests.get(chembl_url) + + if response.status_code == 200: + data = response.json() + # Extract UniProt ID from the ChEMBL target info + for component in data.get('target_components', []): + for xref in component.get('target_component_xrefs', []): + if xref['xref_src_db'] == 'UniProt': + return xref['xref_id'] + else: + raise ValueError(f"ChEMBL ID {chembl_id} not found or invalid.") + return None + +def get_ensembl_from_uniprot(uniprot_id): + # Query UniProt API to get Ensembl ID from UniProt ID + uniprot_url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json" + response = requests.get(uniprot_url) + + if response.status_code == 200: + data = response.json() + # Extract Ensembl Gene ID from the cross-references + for xref in data.get('dbReferences', []): + if xref['type'] == 'Ensembl': + return xref['id'] + else: + raise ValueError(f"UniProt ID {uniprot_id} not found or invalid.") + return None + +def get_ensembl_id_from_chembl_id(chembl_id): + try: + # Step 1: Get UniProt ID from ChEMBL + uniprot_id = get_target_from_chembl(chembl_id) + if not uniprot_id: + return f"No UniProt ID found for ChEMBL ID {chembl_id}" + + # Step 2: Get Ensembl ID from UniProt + ensembl_id = get_ensembl_from_uniprot(uniprot_id) + if not ensembl_id: + return f"No Ensembl ID found for UniProt ID {uniprot_id}" + + return f"Ensembl ID for ChEMBL ID {chembl_id}: {ensembl_id}" + except Exception as e: + return str(e) + +# # Example usage +# chembl_id = "CHEMBL25" # Replace with the actual ChEMBL ID +# ensembl_id = get_ensembl_id_from_chembl_id(chembl_id) +# print(ensembl_id) + + class TestModelServer(unittest.TestCase): def setUp(self): @@ -23,20 +79,21 @@ def setUp(self): self.resource = cellxgene_census.CensusResource() def testGeneformerTokenizer(self): - # genes = ['ENSG00000161798', 'ENSG00000188229'] - # cell_types = ['mucus secreting cell', 'neuroendocrine cell'] - # obs_cols = ["dataset_id", "assay", "suspension_type", "sex", "tissue_general", "tissue", "cell_type", "ncounts"] - # adata = self.resource.gget_czi_cellxgene( - # ensembl=True, - # gene=genes, - # cell_type=cell_types, - # column_names=obs_cols, - # ) - # TODO: scperturb is using chembl, NOT ensembl. geneformer assumes ensembl. can fix by going back to cellxgene and not normalizing + import anndata from tdc.multi_pred.perturboutcome import PerturbOutcome test_loader = PerturbOutcome( name="scperturb_drug_AissaBenevolenskaya2021") adata = test_loader.adata + print("swapping obs and var because scperturb violated convention...") + adata_flipped = anndata.AnnData(adata.X.T) + adata_flipped.obs = adata.var + adata_flipped.var = adata.obs + adata = adata_flipped + print("swap complete") + print("adding ensembl ids...") + adata.var["ensembl_id"] = adata.var["chembl-ID"].apply(get_ensembl_id_from_chembl_id) + print("added ensembl_id column") + print(type(adata.var)) print(adata.var.columns) print(type(adata.obs)) From 36e4492fae6306d0913ef86bc013e3f193c019ab Mon Sep 17 00:00:00 2001 From: Alejandro Velez-Arce Date: Sun, 20 Oct 2024 11:48:39 -0400 Subject: [PATCH 3/8] ci/cd with geneformer --- .circleci/config.yml | 10 ++++++++-- .github/workflows/conda-tests.yml | 7 ++++++- environment.yml | 3 +-- requirements.txt | 3 +++ tdc/test/test_model_server.py | 6 ------ 5 files changed, 18 insertions(+), 11 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 6968d140..213c66e2 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7,11 +7,11 @@ workflows: version: 2 test: jobs: - - test-3.9 + - test-3.10 jobs: test-3.9: docker: - - image: circleci/python:3.9 + - image: circleci/python:3.10 working_directory: ~/repo @@ -26,6 +26,12 @@ jobs: # fallback to using the latest cache if no exact match is found - v1-py3-dependencies- + - run: + name: Install git-lfs + command: | + sudo apt-get install git-lfs + git lfs install + - run: name: install dependencies command: | diff --git a/.github/workflows/conda-tests.yml b/.github/workflows/conda-tests.yml index f6dc88b1..a26d9712 100644 --- a/.github/workflows/conda-tests.yml +++ b/.github/workflows/conda-tests.yml @@ -27,7 +27,12 @@ jobs: - name: Set up Python version uses: actions/setup-python@v1 with: - python-version: '3.9' + python-version: '3.10' + + - name: Install git-lfs + run: | + sudo apt-get install git-lfs + git lfs install - name: Setup Miniconda uses: conda-incubator/setup-miniconda@v2 diff --git a/environment.yml b/environment.yml index 6241c518..4295ed1b 100644 --- a/environment.yml +++ b/environment.yml @@ -40,8 +40,7 @@ dependencies: - torchvision==0.16.1 - transformers==4.43.4 - yapf==0.40.2 + - git+https://github.com/amva13/geneformer.git@main#egg=geneformer variables: KMP_DUPLICATE_LIB_OK: "TRUE" - -# install geneformer via script https://github.com/jkobject/geneformer/tree/main diff --git a/requirements.txt b/requirements.txt index 61adc449..3a01b1cb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,3 +21,6 @@ pydantic>=2.6.3,<3.0.0 rdkit>=2023.9.5,<2024.3.1 tiledbsoma>=1.7.2,<2.0.0 yapf>=0.40.2,<1.0.0 + +# github packages +git+https://github.com/amva13/geneformer.git@main#egg=geneformer diff --git a/tdc/test/test_model_server.py b/tdc/test/test_model_server.py index 19aedf94..f8453b8b 100644 --- a/tdc/test/test_model_server.py +++ b/tdc/test/test_model_server.py @@ -66,12 +66,6 @@ def get_ensembl_id_from_chembl_id(chembl_id): except Exception as e: return str(e) -# # Example usage -# chembl_id = "CHEMBL25" # Replace with the actual ChEMBL ID -# ensembl_id = get_ensembl_id_from_chembl_id(chembl_id) -# print(ensembl_id) - - class TestModelServer(unittest.TestCase): def setUp(self): From cd4b539f6b680a34607bbc8bef9de128dd14fc69 Mon Sep 17 00:00:00 2001 From: Alejandro Velez-Arce Date: Sun, 20 Oct 2024 11:56:50 -0400 Subject: [PATCH 4/8] yapf --- tdc/model_server/tokenizers/geneformer.py | 60 ++++++++++++++--------- tdc/test/test_model_server.py | 18 ++++--- 2 files changed, 46 insertions(+), 32 deletions(-) diff --git a/tdc/model_server/tokenizers/geneformer.py b/tdc/model_server/tokenizers/geneformer.py index 640d334d..7b5853a3 100644 --- a/tdc/model_server/tokenizers/geneformer.py +++ b/tdc/model_server/tokenizers/geneformer.py @@ -4,6 +4,7 @@ from geneformer import TranscriptomeTokenizer from ...utils.load import pd_load, download_wrapper + class GeneformerTokenizer(TranscriptomeTokenizer): """ Uses Geneformer Utils to parse zero-shot model server requests for tokenizing single-cell gene expression data. @@ -11,13 +12,23 @@ class GeneformerTokenizer(TranscriptomeTokenizer): Geneformer tokenizer source code: https://github.com/jkobject/geneformer/blob/main/geneformer/tokenizer.py """ - def __init__(self, path=None, custom_attr_name_dict=None, nproc=1,): + def __init__( + self, + path=None, + custom_attr_name_dict=None, + nproc=1, + ): path = path or "./data" - download_wrapper("geneformer_gene_median_dictionary", path, ["geneformer_gene_median_dictionary"]) - download_wrapper("geneformer_gene_name_id_dict", path, ["geneformer_gene_name_id_dict"]) - download_wrapper("geneformer_token_dictionary", path, ["geneformer_token_dictionary"]) - self.gene_median_dict = pd_load("geneformer_gene_median_dictionary", path=path) - self.gene_name_id_dict = pd_load("geneformer_gene_name_id_dict", path=path) + download_wrapper("geneformer_gene_median_dictionary", path, + ["geneformer_gene_median_dictionary"]) + download_wrapper("geneformer_gene_name_id_dict", path, + ["geneformer_gene_name_id_dict"]) + download_wrapper("geneformer_token_dictionary", path, + ["geneformer_token_dictionary"]) + self.gene_median_dict = pd_load("geneformer_gene_median_dictionary", + path=path) + self.gene_name_id_dict = pd_load("geneformer_gene_name_id_dict", + path=path) self.gene_token_dict = pd_load("geneformer_token_dictionary", path=path) self.custom_attr_name_dict = custom_attr_name_dict self.nproc = nproc @@ -26,7 +37,8 @@ def __init__(self, path=None, custom_attr_name_dict=None, nproc=1,): self.gene_keys = list(self.gene_median_dict.keys()) # protein-coding and miRNA gene list dictionary for selecting .loom rows for tokenization - self.genelist_dict = dict(zip(self.gene_keys, [True] * len(self.gene_keys))) + self.genelist_dict = dict( + zip(self.gene_keys, [True] * len(self.gene_keys))) @classmethod def rank_genes(cls, gene_vector, gene_tokens): @@ -37,7 +49,11 @@ def rank_genes(cls, gene_vector, gene_tokens): sorted_indices = np.argsort(-gene_vector) return gene_tokens[sorted_indices] - def tokenize_cell_vectors(self, cell_vector_adata, target_sum=10_000, chunk_size=512, ensembl_id="ensembl_id"): + def tokenize_cell_vectors(self, + cell_vector_adata, + target_sum=10_000, + chunk_size=512, + ensembl_id="ensembl_id"): """ Tokenizing single-cell gene expression vectors formatted as anndata types. @@ -48,19 +64,16 @@ def tokenize_cell_vectors(self, cell_vector_adata, target_sum=10_000, chunk_size attr_key: [] for attr_key in self.custom_attr_name_dict.keys() } - coding_miRNA_loc = np.where( - [self.genelist_dict.get(i, False) for i in adata.var[ensembl_id]] - )[0] - norm_factor_vector = np.array( - [ - self.gene_median_dict[i] - for i in adata.var[ensembl_id][coding_miRNA_loc] - ] - ) + coding_miRNA_loc = np.where([ + self.genelist_dict.get(i, False) for i in adata.var[ensembl_id] + ])[0] + norm_factor_vector = np.array([ + self.gene_median_dict[i] + for i in adata.var[ensembl_id][coding_miRNA_loc] + ]) coding_miRNA_ids = adata.var[ensembl_id][coding_miRNA_loc] coding_miRNA_tokens = np.array( - [self.gene_token_dict[i] for i in coding_miRNA_ids] - ) + [self.gene_token_dict[i] for i in coding_miRNA_ids]) try: _ = adata.obs["filter_pass"] @@ -71,8 +84,7 @@ def tokenize_cell_vectors(self, cell_vector_adata, target_sum=10_000, chunk_size if var_exists: filter_pass_loc = np.where( - [i == 1 for i in adata.obs["filter_pass"]] - )[0] + [i == 1 for i in adata.obs["filter_pass"]])[0] elif not var_exists: print( f"The anndata object has no column attribute 'filter_pass'; tokenizing all cells." @@ -82,7 +94,7 @@ def tokenize_cell_vectors(self, cell_vector_adata, target_sum=10_000, chunk_size tokenized_cells = [] for i in range(0, len(filter_pass_loc), chunk_size): - idx = filter_pass_loc[i:i+chunk_size] + idx = filter_pass_loc[i:i + chunk_size] n_counts = adata[idx].obs['ncounts'].values[:, None] X_view = adata[idx, coding_miRNA_loc].X @@ -90,7 +102,8 @@ def tokenize_cell_vectors(self, cell_vector_adata, target_sum=10_000, chunk_size X_norm = sp.csr_matrix(X_norm) tokenized_cells += [ - self.rank_genes(X_norm[i].data, coding_miRNA_tokens[X_norm[i].indices]) + self.rank_genes(X_norm[i].data, + coding_miRNA_tokens[X_norm[i].indices]) for i in range(X_norm.shape[0]) ] @@ -102,4 +115,3 @@ def tokenize_cell_vectors(self, cell_vector_adata, target_sum=10_000, chunk_size file_cell_metadata = None return tokenized_cells, file_cell_metadata - diff --git a/tdc/test/test_model_server.py b/tdc/test/test_model_server.py index f8453b8b..70c93459 100644 --- a/tdc/test/test_model_server.py +++ b/tdc/test/test_model_server.py @@ -14,16 +14,16 @@ # TODO: add verification for the generation other than simple integration from tdc.resource import cellxgene_census -from tdc.model_server.tokenizers.geneformer import GeneformerTokenizer - +from tdc.model_server.tokenizers.geneformer import GeneformerTokenizer import requests + def get_target_from_chembl(chembl_id): # Query ChEMBL API for target information chembl_url = f"https://www.ebi.ac.uk/chembl/api/data/target/{chembl_id}.json" response = requests.get(chembl_url) - + if response.status_code == 200: data = response.json() # Extract UniProt ID from the ChEMBL target info @@ -35,11 +35,12 @@ def get_target_from_chembl(chembl_id): raise ValueError(f"ChEMBL ID {chembl_id} not found or invalid.") return None + def get_ensembl_from_uniprot(uniprot_id): # Query UniProt API to get Ensembl ID from UniProt ID uniprot_url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json" response = requests.get(uniprot_url) - + if response.status_code == 200: data = response.json() # Extract Ensembl Gene ID from the cross-references @@ -50,6 +51,7 @@ def get_ensembl_from_uniprot(uniprot_id): raise ValueError(f"UniProt ID {uniprot_id} not found or invalid.") return None + def get_ensembl_id_from_chembl_id(chembl_id): try: # Step 1: Get UniProt ID from ChEMBL @@ -66,6 +68,7 @@ def get_ensembl_id_from_chembl_id(chembl_id): except Exception as e: return str(e) + class TestModelServer(unittest.TestCase): def setUp(self): @@ -85,7 +88,8 @@ def testGeneformerTokenizer(self): adata = adata_flipped print("swap complete") print("adding ensembl ids...") - adata.var["ensembl_id"] = adata.var["chembl-ID"].apply(get_ensembl_id_from_chembl_id) + adata.var["ensembl_id"] = adata.var["chembl-ID"].apply( + get_ensembl_id_from_chembl_id) print("added ensembl_id column") print(type(adata.var)) @@ -96,7 +100,7 @@ def testGeneformerTokenizer(self): tokenizer = GeneformerTokenizer() print("testing tokenizer") x = tokenizer.tokenize_cell_vectors(adata) - assert x + assert x def tearDown(self): try: @@ -104,5 +108,3 @@ def tearDown(self): shutil.rmtree(os.path.join(os.getcwd(), "data")) except: pass - - \ No newline at end of file From c27d209e107c2310331e0325d114eeb37aeacecb Mon Sep 17 00:00:00 2001 From: Alejandro Velez-Arce Date: Sun, 20 Oct 2024 11:59:44 -0400 Subject: [PATCH 5/8] mend --- .circleci/config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 213c66e2..7586272e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,14 +2,14 @@ # # Check https://circleci.com/docs/2.0/language-python/ for more details # Adapted from https://github.com/NeuralEnsemble/python-neo -version: 2 +version: 3 workflows: - version: 2 + version: 3 test: jobs: - test-3.10 jobs: - test-3.9: + test-3.10: docker: - image: circleci/python:3.10 From 61e693f756409b249032dfc10171f215a6313ecf Mon Sep 17 00:00:00 2001 From: Alejandro Velez-Arce Date: Sun, 20 Oct 2024 12:02:01 -0400 Subject: [PATCH 6/8] mend --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7586272e..6fde42b9 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7,9 +7,9 @@ workflows: version: 3 test: jobs: - - test-3.10 + - test-3.9 jobs: - test-3.10: + test-3.9: docker: - image: circleci/python:3.10 From 15a10d8792b38bbda3a08090fb77554260d91e62 Mon Sep 17 00:00:00 2001 From: Alejandro Velez-Arce Date: Sun, 20 Oct 2024 12:03:12 -0400 Subject: [PATCH 7/8] mend --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 6fde42b9..ed4d7174 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,9 +2,9 @@ # # Check https://circleci.com/docs/2.0/language-python/ for more details # Adapted from https://github.com/NeuralEnsemble/python-neo -version: 3 +version: 2 workflows: - version: 3 + version: 2 test: jobs: - test-3.9 From e03614be8c3eb60d1716ed643f98231bfacbd701 Mon Sep 17 00:00:00 2001 From: Alejandro Velez-Arce Date: Sun, 20 Oct 2024 12:27:13 -0400 Subject: [PATCH 8/8] mend --- .circleci/config.yml | 2 +- tdc/model_server/tokenizers/geneformer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ed4d7174..abb6a1de 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -55,7 +55,7 @@ jobs: no_output_timeout: 30m command: | . venv/bin/activate - pytest --ignore=tdc/test/dev_tests/ --ignore=tdc/test/test_resources.py --ignore=tdc/test/test_dataloaders.py + pytest --ignore=tdc/test/dev_tests/ --ignore=tdc/test/test_resources.py --ignore=tdc/test/test_dataloaders.py --ignore=tdc/test/test_model_server.py - store_artifacts: path: test-reports diff --git a/tdc/model_server/tokenizers/geneformer.py b/tdc/model_server/tokenizers/geneformer.py index 7b5853a3..a4ce6b20 100644 --- a/tdc/model_server/tokenizers/geneformer.py +++ b/tdc/model_server/tokenizers/geneformer.py @@ -9,7 +9,7 @@ class GeneformerTokenizer(TranscriptomeTokenizer): """ Uses Geneformer Utils to parse zero-shot model server requests for tokenizing single-cell gene expression data. - Geneformer tokenizer source code: https://github.com/jkobject/geneformer/blob/main/geneformer/tokenizer.py + Tokenizer source code: https://github.com/amva13/geneformer/blob/main/geneformer/tokenizer.py """ def __init__(