Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[data] Refactors skema to fetch artifacts locally instead of from artifacts.askem.lum.ai #881

Merged
merged 16 commits into from
Apr 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 0 additions & 91 deletions .drone.yml

This file was deleted.

30 changes: 0 additions & 30 deletions .github/workflows/deploy.yml

This file was deleted.

5 changes: 4 additions & 1 deletion .github/workflows/tests-and-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,10 @@ jobs:
working-directory: .
run: |
# retrieve latest model for img2mml component
curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
pip install huggingface_hub
python scripts/retrieve_model_ci.py

# Install askem
pip install ".[all]"

# Install tree-sitter parser (for Python component unit tests)
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile.skema-py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ ENV PATH="/root/.cargo/bin:${PATH}"
RUN pip install wheel
RUN pip install six
# Download ML model (~150MB)
RUN curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
RUN pip install huggingface_hub && python scripts/retrieve_model_ci.py
RUN tree /app
#RUN pip install ".[all]"
# exclude dependencies for docs
Expand Down
2 changes: 1 addition & 1 deletion docs/dev/env.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ conda activate skema
# Install tree-sitter parsers
python skema/program_analysis/tree_sitter_parsers/build_parsers.py --all
# download the checkpoint for the img2mml service
curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
python scripts/retrieve_model.py
# mathjax deps for img2mml
(cd skema/img2mml/data_generation && npm install)
```
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ isa = [
]

# shared ML dependencies
ml = ["torch==2.0.1", "torchvision==0.15.2", "beartype==0.15.0"]
ml = ["torch==2.0.1", "torchvision==0.15.2", "beartype==0.15.0", "huggingface_hub"]

# Im2MML dependencies. The img2mml service converts equation images to MathML.
# See the skema/img2mml directory.
Expand Down Expand Up @@ -100,6 +100,7 @@ all = ["skema[core]", "skema[dev]", "skema[doc]", "skema[demo]", "skema[annotati
"skema.rest" = "skema/rest"
"skema.skema_py" = "skema/skema_py"
"skema.utils" = "skema/utils"
"skema.data" = "skema/data"

# re-map skema/text_reading/python to skema.text_reading
#"skema.text_reading" = "skema/text_reading/python"
Expand All @@ -110,7 +111,7 @@ all = ["skema[core]", "skema[dev]", "skema[doc]", "skema[demo]", "skema[annotati

[tool.setuptools.package-data]
# needed to ensure models are included in package/discoverable
"*" = ["*.json", "*vocab.txt", "*.pt", "*.png", "*.html", "*.yml", "*.yaml"]
"*" = ["*.json", "*vocab.txt", "*.pt", "*.png", "*.html", "*.yml", "*.yaml", "*.zip"]

[tool.setuptools.dynamic]
readme = {file = ["README.md"], content-type = "text/markdown"}
Expand Down
31 changes: 31 additions & 0 deletions scripts/retrieve_model_ci.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import os
from pathlib import Path

from huggingface_hub import hf_hub_download

def retrieve_model(model_path=None) -> str:
    """
    Locate the img2mml model checkpoint, downloading it from HuggingFace if needed.

    Args:
        model_path (str, optional): Path to an existing img2mml model file.
            If it is None or does not exist, the default checkpoint location
            (``skema/img2mml/trained_models/`` under the repository root) is
            used instead. Defaults to None.

    Returns:
        str: Path to the model file.
    """
    REPO_NAME = "lum-ai/img2mml"
    MODEL_NAME = "cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt"

    # A caller-supplied checkpoint that already exists is used as-is.
    if model_path is not None and os.path.exists(model_path):
        return str(model_path)

    # This script lives in <repo>/scripts, but the checkpoint is expected under
    # <repo>/skema/img2mml/trained_models, so resolve the default location
    # relative to the repository root rather than to this file's directory.
    repo_root = Path(__file__).resolve().parents[1]
    model_path = repo_root / "skema" / "img2mml" / "trained_models" / MODEL_NAME

    # Only download when the default checkpoint is not already present.
    if not model_path.exists():
        print("Downloading the model checkpoint from HuggingFace...")
        hf_hub_download(
            repo_id=REPO_NAME,
            filename=MODEL_NAME,
            local_dir=model_path.parent,
            local_dir_use_symlinks=False,
        )

    return str(model_path)


# Run the download only when executed as a script, so that importing this
# module does not trigger network access.
if __name__ == "__main__":
    retrieve_model()
Binary file added skema/data/program_analysis/ABM-COVID-ABS.zip
Binary file not shown.
Binary file added skema/data/program_analysis/ABM-COmplexVID-19.zip
Binary file not shown.
Binary file added skema/data/program_analysis/ABM-Covasim.zip
Binary file not shown.
Binary file added skema/data/program_analysis/ABM-REINA.zip
Binary file not shown.
Binary file added skema/data/program_analysis/Bucky.zip
Binary file not shown.
Binary file added skema/data/program_analysis/CHIME-SIR-model.zip
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added skema/data/program_analysis/Climlab.zip
Binary file not shown.
Binary file not shown.
Binary file added skema/data/program_analysis/Generated-Halfar.zip
Binary file not shown.
Binary file added skema/data/program_analysis/MechBayes.zip
Binary file not shown.
Binary file added skema/data/program_analysis/SIDARTHE.zip
Binary file not shown.
Binary file not shown.
Binary file added skema/data/program_analysis/Simple-SIR.zip
Binary file not shown.
Binary file added skema/data/program_analysis/TIE-GCM.zip
Binary file not shown.
3 changes: 3 additions & 0 deletions skema/data/program_analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from pathlib import Path

# Directory containing this package's files; the program-analysis model zip
# archives are stored alongside this module.
MODEL_ZIP_ROOT_PATH = Path(__file__).parents[0]
Binary file added skema/data/program_analysis/cism_v3.zip
Binary file not shown.
Binary file added skema/data/program_analysis/climlab-v2.zip
Binary file not shown.
Binary file added skema/data/program_analysis/code_sir.zip
Binary file not shown.
Binary file added skema/data/program_analysis/examples_python.zip
Binary file not shown.
11 changes: 5 additions & 6 deletions skema/img2mml/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,23 @@ service was developed by Deepsana Shahi, Adarsh Pyarelal and Liang Zhang.

The model itself is not checked into the repository, but you can get it from
here:
https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
https://huggingface.co/lum-ai/img2mml/blob/main/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt

Place the model file in the `trained_models` directory.

The curl command below should do the trick.
The Python command below should do the trick.

```
curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
python ../../scripts/retrieve_model_ci.py
```

If you have the checkpoint in the `trained_models` directory already and hope to update it, please run the above curl command that will replace the previous one.
If you already have the checkpoint in the `trained_models` directory and want to update it, delete the existing file first and then run the Python command above; the script skips the download when the checkpoint file already exists.

To update the model name or path, please make the following modifications to support updating the img2mml service and the corresponding Docker operations:

1. Modify the ENV variable of `SKEMA_IMG2MML_MODEL_PATH`.
2. Update the path settings in the "retrieve latest model for img2mml component" section of `skema/.github/workflows/tests-and-docs.yml`.
3. Adjust the curl command in the test_equation_reading section of `skema/.drone.yml` to download the checkpoint.
4. Update the download checkpoint path in `skema/img2mml/README.md`.
3. Update the download checkpoint path in `skema/img2mml/README.md`.

These changes will ensure that the necessary files and paths are updated correctly.

Expand Down
10 changes: 5 additions & 5 deletions skema/img2mml/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from PIL import Image
from io import BytesIO

from huggingface_hub import hf_hub_download

def retrieve_model(model_path=None) -> str:
"""
Expand All @@ -25,7 +26,7 @@ def retrieve_model(model_path=None) -> str:
str: Path to the loaded model file.
"""
cwd = Path(__file__).parents[0]
MODEL_BASE_ADDRESS = "https://artifacts.askem.lum.ai/skema/img2mml/models"
REPO_NAME = "lum-ai/img2mml"
MODEL_NAME = "cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt"
# If the model path is none or doesn't exist, the default model will be downloaded from server.
if model_path is None or not os.path.exists(model_path):
Expand All @@ -34,10 +35,9 @@ def retrieve_model(model_path=None) -> str:
# Check if the model file already exists
if not os.path.exists(model_path):
# If the file doesn't exist, download it from the specified URL
url = f"{MODEL_BASE_ADDRESS}/{MODEL_NAME}"
print(f"Downloading the model checkpoint from {url}...")
urllib.request.urlretrieve(url, model_path)

print(f"Downloading the model checkpoint from HuggingFace...")
hf_hub_download(repo_id=REPO_NAME, filename=MODEL_NAME, local_dir=model_path.parent, local_dir_use_symlinks=False)

return str(model_path)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@
from skema.rest.utils import fn_preprocessor
from skema.rest.workflows import code_snippets_to_pn_amr
from skema.utils.fold import del_nulls, dictionary_to_gromet_json
from skema.data.program_analysis import MODEL_ZIP_ROOT_PATH
from skema.utils.change_dir_back import change_dir_back
from skema.skema_py.server import System


# Constants for file paths
THIS_PATH = Path(__file__).parent.resolve()
MODEL_YAML_PATH = THIS_PATH / "models.yaml"
Expand Down Expand Up @@ -149,11 +151,12 @@ def process_single_model(html: HTML_Instance, output_dir: str, model_name: str):
"""Generate an HTML report for a single model"""
html.add_model(model_name)

if model_name in MODEL_YAML:
model_url = MODEL_YAML[model_name]["zip_archive"]
response = requests.get(model_url)

zip = ZipFile(BytesIO(response.content))
if not model_name in MODEL_YAML:
return

model_path = MODEL_ZIP_ROOT_PATH.resolve() / MODEL_YAML[model_name]["zip_archive"]

zip = ZipFile(BytesIO(model_path.read_bytes()))
with TemporaryDirectory() as temp:
# We need to write all the files to the temporary directory before processing
# This is because some steps may require additional files, such as include directories in Fortran
Expand Down Expand Up @@ -297,7 +300,8 @@ def process_all_models(html: HTML_Instance, output_dir: str):
try:
supported, total = process_single_model(html, output_dir, model_name)
model_line_coverage[model_name] = (supported, total)
except:
except Exception as e:
print(e)
continue
return model_line_coverage

Expand Down
39 changes: 19 additions & 20 deletions skema/program_analysis/model_coverage_report/models.yaml
Original file line number Diff line number Diff line change
@@ -1,57 +1,56 @@
---
CHIME-penn-full:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/CHIME-penn-full-model.zip"
zip_archive: "CHIME-penn-full-model.zip"

CHIME-SIR:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/CHIME-SIR-model.zip"
zip_archive: "CHIME-SIR-model.zip"

CHIME-SVIIvR:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/CHIME-SVIIvR-model.zip"
zip_archive: "CHIME-SVIIvR-model.zip"

ABM-COmplexVID-19:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/ABM-COmplexVID-19.zip"
zip_archive: "ABM-COmplexVID-19.zip"

ABM-COVID-ABS:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/ABM-COVID-ABS.zip"
zip_archive: "ABM-COVID-ABS.zip"

CHIME-penn-full:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/CHIME-penn-full-model.zip"
zip_archive: "CHIME-penn-full-model.zip"

MechBayes:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/MechBayes.zip"
zip_archive: "MechBayes.zip"

SIDARTHE:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/SIDARTHE.zip"
zip_archive: "SIDARTHE.zip"

Simple-SIR:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/Simple-SIR.zip"
zip_archive: "Simple-SIR.zip"

Climlab-v1:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/Climlab.zip"
zip_archive: "Climlab.zip"

Climlab-v2:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/climlab-v2.zip"
zip_archive: "climlab-v2.zip"

Examples-Python:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/examples_python.zip"
zip_archive: "examples_python.zip"

Generated-Halfar:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/Generated-Halfar.zip"
zip_archive: "Generated-Halfar.zip"

SV2AIR3-Waterloo-MATLAB:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/SV2AIR3-Waterloo-MATLAB.zip"
zip_archive: "SV2AIR3-Waterloo-MATLAB.zip"

Bucky:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/Bucky.zip"
zip_archive: "Bucky.zip"

ABM-REINA:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/ABM-REINA.zip"
zip_archive: "ABM-REINA.zip"

Cornell-COVID19-sim-Frazier:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/Cornell-COVID19-sim-Frazier.zip"
zip_archive: "Cornell-COVID19-sim-Frazier.zip"

ABM-Covasim:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/ABM-Covasim.zip"
zip_archive: "ABM-Covasim.zip"

TIE-GCM:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/TIE-GCM.zip"
zip_archive: "TIE-GCM.zip"
Loading
Loading