Skip to content

Commit

Permalink
Merge branch 'dev' into ferc714-optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
rousik authored Dec 5, 2023
2 parents 097e97f + 3452bae commit a555217
Show file tree
Hide file tree
Showing 40 changed files with 1,044 additions and 1,522 deletions.
5 changes: 4 additions & 1 deletion .codecov.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
---
coverage:
range: 70..100
round: down
round: nearest
precision: 1

ignore:
- "src/pudl/validate.py"

codecov:
token: 23a7ee04-6ac5-4d1b-9d36-86b0c50d40c5
require_ci_to_pass: true
Expand Down
22 changes: 0 additions & 22 deletions .coveragerc

This file was deleted.

23 changes: 9 additions & 14 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
covargs := --append --source=src/pudl
gcs_cache_path := --gcs-cache-path=gs://zenodo-cache.catalyst.coop
pytest_covargs := --cov-append --cov=src/pudl --cov-report=xml
coverage_report := coverage report --sort=cover
pytest_args := --durations 20 ${pytest_covargs} ${gcs_cache_path}
covargs := --append
pytest_args := --durations 20 ${gcs_cache_path}
etl_fast_yml := src/pudl/package_data/settings/etl_fast.yml
etl_full_yml := src/pudl/package_data/settings/etl_full.yml

Expand Down Expand Up @@ -96,10 +94,7 @@ ferc:
rm -f ${PUDL_OUTPUT}/ferc*.sqlite
rm -f ${PUDL_OUTPUT}/ferc*_xbrl_datapackage.json
rm -f ${PUDL_OUTPUT}/ferc*_xbrl_taxonomy_metadata.json
coverage run ${covargs} -- \
src/pudl/ferc_to_sqlite/cli.py \
${gcs_cache_path} \
${etl_full_yml}
coverage run ${covargs} -- src/pudl/ferc_to_sqlite/cli.py ${gcs_cache_path} ${etl_full_yml}

# Remove the existing PUDL DB if it exists.
# Create a new empty DB using alembic.
Expand All @@ -108,7 +103,7 @@ ferc:
pudl:
rm -f ${PUDL_OUTPUT}/pudl.sqlite
alembic upgrade head
coverage run ${covargs} -- src/pudl/cli/etl.py ${gcs_cache_path} ${etl_full_yml}
coverage run ${covargs} -- src/pudl/etl/cli.py ${gcs_cache_path} ${etl_full_yml}

########################################################################################
# Targets that are coordinated by pytest -- mostly they're actual tests.
Expand All @@ -125,13 +120,13 @@ pytest-integration:
coverage-erase:
coverage erase

.PHONY: pytest-coverage
pytest-coverage: coverage-erase docs-build pytest-ci
${coverage_report}

.PHONY: pytest-ci
pytest-ci: pytest-unit pytest-integration

.PHONY: pytest-coverage
pytest-coverage: coverage-erase docs-build pytest-ci
coverage report

.PHONY: pytest-integration-full
pytest-integration-full:
pytest ${pytest_args} -n auto --live-dbs --etl-settings ${etl_full_yml} test/integration
Expand All @@ -151,7 +146,7 @@ nuke: coverage-erase docs-build pytest-unit ferc pudl
pudl_check_fks
pytest ${pytest_args} -n auto --live-dbs --etl-settings ${etl_full_yml} test/integration
pytest ${pytest_args} -n auto --live-dbs test/validate
${coverage_report}
coverage report

# Check that designated Jupyter notebooks can be run against the current DB
.PHONY: pytest-jupyter
Expand Down
74 changes: 0 additions & 74 deletions devtools/data-release.sh

This file was deleted.

1 change: 1 addition & 0 deletions devtools/sqlite_to_duckdb.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#! /usr/bin/env python
"""A naive script for converting SQLite to DuckDB."""
import logging
from pathlib import Path
Expand Down
5 changes: 1 addition & 4 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,7 @@ ENV LD_LIBRARY_PATH=${CONDA_PREFIX}/lib
# We need information from .git to get version with setuptools_scm so we mount that
# directory without copying it into the image.
RUN --mount=type=bind,source=.git,target=${PUDL_REPO}/.git \
${CONDA_RUN} pip install --no-cache-dir --no-deps --editable . && \
# Run the PUDL setup script so we know where to read and write data
${CONDA_RUN} pudl_setup

${CONDA_RUN} pip install --no-cache-dir --no-deps --editable .

# Install awscli2
# Change back to root because the install script needs access to /usr/local/aws-cli
Expand Down
3 changes: 1 addition & 2 deletions docker/gcp_pudl_etl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ function run_pudl_etl() {
send_slack_msg ":large_yellow_circle: Deployment started for $ACTION_SHA-$GITHUB_REF :floppy_disk:"
authenticate_gcp && \
alembic upgrade head && \
pudl_setup && \
ferc_to_sqlite \
--loglevel DEBUG \
--gcs-cache-path gs://internal-zenodo-cache.catalyst.coop \
Expand All @@ -44,7 +43,7 @@ function run_pudl_etl() {
-n auto \
--gcs-cache-path gs://internal-zenodo-cache.catalyst.coop \
--etl-settings $PUDL_SETTINGS_YML \
--live-dbs test/validate
--live-dbs test/validate \
&& touch ${PUDL_OUTPUT}/success
}

Expand Down
20 changes: 0 additions & 20 deletions docs/Makefile

This file was deleted.

2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def data_dictionary_metadata_to_rst(app):
"""Export data dictionary metadata to RST for inclusion in the documentation."""
# Create an RST Data Dictionary for the PUDL DB:
print("Exporting PUDL DB data dictionary metadata to RST.")
skip_names = ["datasets", "accumulated_depreciation_ferc1"]
skip_names = ["datasets", "accumulated_depreciation_ferc1", "entity_types_eia"]
names = [name for name in RESOURCE_METADATA if name not in skip_names]
package = Package.from_resource_ids(resource_ids=tuple(sorted(names)))
# Sort fields within each resource by name:
Expand Down
6 changes: 1 addition & 5 deletions docs/dev/annual_updates.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,7 @@ fields such as ``source_format`` or ``path`` are still accurate.
``etl_fast.yml`` settings files stored under ``src/pudl/package_data/settings`` in the
PUDL repo.

**1.5)** Update the settings files in your PUDL workspace to reflect the new
years by running ``pudl_setup {path to your pudl_work directory} -c``. Don't worry, it
won't remove any custom settings files you've added under a different name.

**1.6)** Use the ``pudl_datastore`` script (see :doc:`datastore`) to download the new
**1.5)** Use the ``pudl_datastore`` script (see :doc:`datastore`) to download the new
raw data archives in bulk so that network hiccups don't cause issues during the ETL.

2. Map the Structure of the New Data
Expand Down
104 changes: 26 additions & 78 deletions docs/dev/dev_setup.rst
Original file line number Diff line number Diff line change
Expand Up @@ -239,29 +239,22 @@ Creating a Workspace
PUDL Workspace Setup
^^^^^^^^^^^^^^^^^^^^

.. note::

If you used ``pudl_setup`` to set up your pudl workspace already,
skip ahead to :ref:`Legacy PUDL Setup`. If you haven't set up
a PUDL workspace before, read the remainder of this section.

PUDL needs to know where to store its big piles of inputs and outputs.
The ``PUDL_OUTPUT`` and ``PUDL_INPUT`` environment variables let PUDL know where
all this stuff should go. We call this a "PUDL workspace".
PUDL needs to know where to store its big piles of inputs and outputs. The
``PUDL_OUTPUT`` and ``PUDL_INPUT`` environment variables let PUDL know where all this
stuff should go. We call this a "PUDL workspace".

First, create a directory to store local caches of raw PUDL data. You can put
this anywhere, but we put this in ``~/pudl_input`` in the documentation.
Then create an environment variable called ``PUDL_INPUT`` to store the path to
this new directory:
First, create a directory to store local caches of raw PUDL data. You can put this
anywhere, but we put this in ``~/pudl_input`` in the documentation. Then create an
environment variable called ``PUDL_INPUT`` to store the path to this new directory:

.. code-block:: console
$ echo "export PUDL_INPUT=/absolute/path/to/pudl_input" >> ~/.zshrc # if you are using zsh
$ echo "export PUDL_INPUT=/absolute/path/to/pudl_input" >> ~/.bashrc # if you are using bash
$ set -Ux PUDL_INPUT /absolute/path/to/pudl_input # if you are using fish shell
The directory stored in ``PUDL_INPUT`` contains versions of PUDL's
raw data archives on Zenodo for each datasource:
The directory stored in ``PUDL_INPUT`` contains versions of PUDL's raw data archives on
Zenodo for each datasource:

.. code-block::
Expand All @@ -281,81 +274,36 @@ raw data archives on Zenodo for each datasource:
.. warning::

The data stored at the ``PUDL_INPUT`` directory can grow to be dozens
of gigabytes in size. This is because when the raw data are updated,
a new version of the archive is downloaded to the ``PUDL_INPUT``
directory. To slim down the size you can always delete
out of date archives the code no longer depends on.
The data stored at the ``PUDL_INPUT`` directory can grow to be dozens of gigabytes
in size. This is because when the raw data are updated, a new version of the archive
is downloaded to the ``PUDL_INPUT`` directory. To slim down the size you can always
delete out of date archives the code no longer depends on.

Next, create a directory to store the outputs of the PUDL ETL. As above, you
can put this anywhere, but typically this is ``~/pudl_output``. Then, as
with ``PUDL_INPUT``, create an environment variable called ``PUDL_OUTPUT`` to
store the path to this new directory:
Next, create a directory to store the outputs of the PUDL ETL. As above, you can put
this anywhere, but typically this is ``~/pudl_output``. Then, as with ``PUDL_INPUT``,
create an environment variable called ``PUDL_OUTPUT`` to store the path to this new
directory:

.. code-block:: console
$ echo "export PUDL_OUTPUT=/absolute/path/to/pudl_output" >> ~/.zshrc # zsh
$ echo "export PUDL_OUTPUT=/absolute/path/to/pudl_output" >> ~/.bashrc # bash
$ set -Ux PUDL_OUTPUT /absolute/path/to/pudl_output # fish
The path stored in ``PUDL_OUTPUT`` contains all ETL outputs like
``pudl.sqlite`` and ``hourly_emissions_epacems.parquet``.

**Make sure you create separate directories for these environment variables!
It is recommended you create these directories outside of the pudl repository
directory so the inputs and outputs are not tracked in git.**

Also, activate profile changes above in the current session.

.. code-block:: console
$ export PUDL_OUTPUT=/absolute/path/to/pudl_output
$ export PUDL_INPUT=/absolute/path/to/pudl_input
.. _Legacy PUDL Setup:

PUDL Workspace Setup (legacy method)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In previous versions of PUDL, the ``pudl_setup`` script created workspace directories.
PUDL is moving towards using the ``PUDL_OUTPUT`` and ``PUDL_INPUT`` environment
variables instead of the ``pudl_setup`` script because the environment variables are
easier to reference in the codebase.

.. note::

If you set up your workspace using ``pudl_setup`` you don't need to change
anything about your setup. Just re-run ``pudl_setup`` and a new directory
called ``output/`` will be created in your <PUDL_DIR>. You will need to
point ``PUDL_OUTPUT`` at this new directory and ``PUDL_INPUT`` at the
``data/`` directory in <PUDL_DIR>.
The path stored in ``PUDL_OUTPUT`` contains all ETL outputs like ``pudl.sqlite`` and
``hourly_emissions_epacems.parquet``.

.. warning::

In a future release the ``pudl_setup`` command will be removed.
Make sure you set these environment variables to point at separate directories! It
is also **strongly recommended** that you create these directories outside of the
pudl repository directory so the inputs and outputs are not tracked in git.

The ``pudl_setup`` script lets PUDL know where to store inputs and outputs.
The script will not create a new directory based on your arguments, so make
sure whatever directory path you pass as <PUDL_DIR> already exists.
Remember that you'll need to either source your shell profile after adding the new
environment variable definitions above, or export them at the command line for them to
be active in the current shell:

.. code-block:: console
$ pudl_setup <PUDL_DIR>
<PUDL_DIR> is the path to the directory where you want PUDL to do its
business -- this is where the datastore will be located and where any outputs
that are generated end up. The script will also put a configuration file called
``.pudl.yml`` in your home directory that records the location of this
workspace and uses it by default in the future. If you run ``pudl_setup`` with
no arguments, it assumes you want to use the current working directory.

The workspace is laid out like this:

==================== ==========================================================
**Directory / File** **Contents**
-------------------- ----------------------------------------------------------
``data/`` Raw data, automatically organized by source, year, etc.
This is the path ``PUDL_INPUT`` should point to.
-------------------- ----------------------------------------------------------
``output/`` The directory into which all the durable products of the
PUDL data processing pipeline will be written.
==================== ==========================================================
$ export PUDL_OUTPUT=/absolute/path/to/pudl_output
$ export PUDL_INPUT=/absolute/path/to/pudl_input
Loading

0 comments on commit a555217

Please sign in to comment.