-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update data directory structure #22
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# Whether to run data-processing tasks locally | ||
# or on the cloud with Coiled. | ||
local: true | ||
# Output location for data files. Can be a local directory | ||
# or a remote path like "s3://path/to/bucket". | ||
data-dir: ./data | ||
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,13 @@ | ||
import datetime | ||
import os | ||
|
||
import botocore.session | ||
import coiled | ||
import duckdb | ||
import psutil | ||
from dask.distributed import print | ||
from prefect import flow, task | ||
|
||
from .settings import ( | ||
LOCAL, | ||
REGION, | ||
STAGING_JSON_DIR, | ||
STAGING_PARQUET_DIR, | ||
fs, | ||
lock_generate, | ||
) | ||
from .settings import LOCAL, PROCESSED_DIR, REGION, STAGING_DIR, fs, lock_generate | ||
|
||
|
||
@task(log_prints=True) | ||
|
@@ -31,21 +23,6 @@ def generate(scale: float, path: os.PathLike) -> None: | |
with duckdb.connect() as con: | ||
con.install_extension("tpch") | ||
con.load_extension("tpch") | ||
|
||
if str(path).startswith("s3://"): | ||
session = botocore.session.Session() | ||
creds = session.get_credentials() | ||
con.install_extension("httpfs") | ||
con.load_extension("httpfs") | ||
con.sql( | ||
f""" | ||
SET s3_region='{REGION}'; | ||
SET s3_access_key_id='{creds.access_key}'; | ||
SET s3_secret_access_key='{creds.secret_key}'; | ||
SET s3_session_token='{creds.token}'; | ||
""" | ||
) | ||
|
||
Comment on lines
-34
to
-48
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is unrelated to the directory structure. We don't need to do this configuration because we're not reading / writing with duckdb. |
||
con.sql( | ||
f""" | ||
SET memory_limit='{psutil.virtual_memory().available // 2**30 }G'; | ||
|
@@ -67,8 +44,8 @@ def generate(scale: float, path: os.PathLike) -> None: | |
) | ||
for table in map(str, tables): | ||
if table in static_tables and ( | ||
list((STAGING_JSON_DIR / table).rglob("*.json")) | ||
or list((STAGING_PARQUET_DIR / table).rglob("*.parquet")) | ||
list((STAGING_DIR / table).rglob("*.json")) | ||
or list((PROCESSED_DIR / table).rglob("*.parquet")) | ||
): | ||
print(f"Static table {table} already exists") | ||
continue | ||
|
@@ -97,6 +74,6 @@ def generate_data(): | |
with lock_generate: | ||
generate( | ||
scale=0.01, | ||
path=STAGING_JSON_DIR, | ||
path=STAGING_DIR, | ||
) | ||
generate.fn.client.restart() | ||
generate.fn.client.restart(wait_for_workers=False) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is an unrelated bugfix for running locally (xref dask/distributed#8534) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,35 +1,33 @@ | ||
import boto3 | ||
import fsspec | ||
import yaml | ||
from filelock import FileLock | ||
from upath import UPath as Path | ||
|
||
LOCAL = True | ||
# LOCAL = False | ||
with open(Path(__file__).parent / "config.yml", "rb") as f: | ||
data = yaml.safe_load(f) | ||
|
||
LOCAL = data["local"] | ||
ROOT = Path(data["data-dir"]).resolve() | ||
fs = fsspec.filesystem(ROOT.protocol, use_listings_cache=False) | ||
|
||
if LOCAL: | ||
ROOT = Path(__file__).parent.parent.resolve() / "data" | ||
fs = fsspec.filesystem("local") | ||
REGION = None | ||
storage_options = {} | ||
else: | ||
# TODO: Make the cloud path nicer (e.g. s3://coiled-datasets-rp) | ||
ROOT = Path("s3://openscapes-scratch/jrbourbeau/etl-tpch/data") | ||
fs = fsspec.filesystem("s3", use_listings_cache=False) | ||
# Find cloud region being used | ||
bucket = str(ROOT).replace("s3://", "").split("/")[0] | ||
resp = boto3.client("s3").get_bucket_location(Bucket=bucket) | ||
REGION = resp["LocationConstraint"] or "us-east-1" | ||
storage_options = {"AWS_REGION": REGION, "AWS_S3_ALLOW_UNSAFE_RENAME": "true"} | ||
|
||
STAGING_JSON_DIR = ROOT / "staged" / "json" | ||
STAGING_PARQUET_DIR = ROOT / "staged" / "parquet" | ||
RAW_JSON_DIR = ROOT / "raw" / "json" | ||
RAW_PARQUET_DIR = ROOT / "raw" / "parquet" | ||
PROCESSED_DATA_DIR = ROOT / "processed" | ||
REDUCED_DATA_DIR = ROOT / "reduced" | ||
STAGING_DIR = ROOT / "staging" # Input JSON files | ||
PROCESSED_DIR = ROOT / "processed" # Processed Parquet files | ||
RESULTS_DIR = ROOT / "results" # Reduced/aggregated results | ||
ARCHIVE_DIR = ROOT / "archive" # Archived JSON files | ||
MODEL_FILE = ROOT.parent / "model.json" | ||
MODEL_SERVER_FILE = ROOT.parent / "serve_model.py" | ||
|
||
lock_generate = FileLock("generate.lock") | ||
lock_json_to_parquet = FileLock("json_to_parquet.lock") | ||
lock_compact = FileLock("compact.lock") | ||
lock_dir = Path(__file__).parent.parent / ".locks" | ||
lock_generate = FileLock(lock_dir / "generate.lock") | ||
lock_json_to_parquet = FileLock(lock_dir / "json.lock") | ||
lock_compact = FileLock(lock_dir / "compact.lock") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moving this configuration out into a standalone file (related to, but doesn't close, #2)