diff --git a/.github/workflows/cicd.yaml b/.github/workflows/cicd.yaml
index 8787452d..66154cfb 100644
--- a/.github/workflows/cicd.yaml
+++ b/.github/workflows/cicd.yaml
@@ -1,9 +1,18 @@
 name: CICD

 on:
+  # Run CICD for non-draft pull requests
  pull_request:
-    branches: [dev, main]
-
+    branches:
+      - dev
+      - main
+  # Also run when the pull request merges (which generates a push),
+  # so that we can tag the docker image appropriately.
+  push:
+    branches:
+      - dev
+      - prod
+      - main

 jobs:
   lidar_on_docker:
@@ -20,20 +29,22 @@ jobs:
       - name: Check code neatness (linter)
         run: docker run lidar_prod_im flake8

-      - name: Run tests - fast ones go first.
-        run: docker run lidar_prod_im python -m pytest -v -m "not slow" --ignore=actions-runner
+      - name: Run tests & get coverage - fast ones go first.
+        run: >
+          docker run
+          lidar_prod_im
+          python -m
+          pytest -rA -v -m "not slow" --ignore=actions-runner

-      - name: Run slow tests last.
+      - name: Run slow tests last (evaluation on large file)
         run: >
-          docker run
+          docker run -v /var/data/cicd/CICD_github_assets/M8.4/inputs/evaluation/:/lidar/tests/files/large/
           lidar_prod_im
           python -m
-          pytest -v -m slow
-          --ignore=actions-runner
-          --no-cov
+          pytest -rA -v -m "slow" --ignore=actions-runner --no-cov

-      - name: Full module run on LAS subset using CLI.
+      - name: Test run from CLI on a LAS subset.
         run: >
           docker run
           -v /var/data/cicd/CICD_github_assets/M8.4/inputs/:/inputs/
@@ -45,22 +56,26 @@ jobs:
           paths.src_las=/inputs/730000_6360000.subset.prototype_format202.las
           paths.output_dir=/outputs/

-      # This is somewhat redundant with unit test but may serve a a doc on usage ?
-      - name: Evaluate decisions using optimization task (debug mode means on a single, corrected LAS) using CLI.
-        run: >
-          docker run
-          -v /var/data/cicd/CICD_github_assets/M8.4/inputs/evaluation/:/inputs/
-          -v /var/data/cicd/CICD_github_assets/M8.4/outputs/evaluation/:/outputs/ lidar_prod_im
-          python lidar_prod/run.py
-          print_config=true
-          +task='optimize'
-          +building_validation.optimization.debug=true
-          building_validation.optimization.todo='prepare+evaluate+update'
-          building_validation.optimization.paths.input_las_dir=/inputs/
-          building_validation.optimization.paths.results_output_dir=/outputs/
-          building_validation.optimization.paths.building_validation_thresholds_pickle=/inputs/optimized_thresholds.pickle
-
-      - name: clean the server for further uses
+      # Everything ran, so we tag the valid docker image to keep it.
+      # This happens for push events, which are in particular
+      # triggered when a pull request is merged.
+      - name: Get the branch name into an environment variable.
+        if: github.event_name == 'push'
+        uses: nelonoel/branch-name@v1.0.1
+
+      - name: Print the branch name
+        if: github.event_name == 'push'
+        run: echo ${BRANCH_NAME}
+
+      - name: Tag the docker image with branch name
+        if: github.event_name == 'push'
+        run: docker tag lidar_prod_im:latest lidar_prod_im:${BRANCH_NAME}
+
+      - name: Dry run image so that it is not pruned
+        if: github.event_name == 'push'
+        run: docker run lidar_prod_im:${BRANCH_NAME} bash
+
+      - name: Clean dangling docker images
         if: always()  # always do it, even if something failed
         run: docker system prune # remove obsolete docker images (take a HUGE amount of space)
diff --git a/.github/workflows/gh-pages.yaml b/.github/workflows/gh-pages.yaml
new file mode 100644
index 00000000..595266cb
--- /dev/null
+++ b/.github/workflows/gh-pages.yaml
@@ -0,0 +1,68 @@
+# Workflow name
+name: "Documentation Build"
+
+# Events that trigger the workflow
+on:
+  push:
+    branches:
+      - main  # <- only on main branch
+      - add-documentation  # <- on this branch until documentation is merged once in main.
+
+jobs:
+
+  build-and-deploy:
+    runs-on: ubuntu-latest
+
+    # Use bash instead of sh for conda activation
+    defaults:
+      run:
+        shell: bash -l {0}
+
+    steps:
+      # Checkout the repository
+      - name: "Checkout"
+        uses: actions/checkout@v2
+
+      # See https://github.com/conda-incubator/setup-miniconda#caching-environments
+
+      # Setup empty conda environment
+      - name: Setup a conda-incubator with an empty conda env
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          python-version: 3.9.12
+          miniforge-variant: Mambaforge
+          miniforge-version: latest
+          use-mamba: true
+          # Environment to create and activate for next steps
+          activate-environment: lidar_prod
+
+      # Cache the env
+      # See https://github.com/conda-incubator/setup-miniconda#caching-environments
+      - name: Get Date
+        id: get-date
+        run: echo "::set-output name=today::$(/bin/date -u '+%Y%m%d')"
+        shell: bash
+
+      - name: Cache conda environment
+        uses: actions/cache@v2
+        with:
+          path: ${{ env.CONDA }}/envs
+          key: conda-${{ runner.os }}--${{ runner.arch }}--${{ steps.get-date.outputs.today }}-${{ hashFiles('bash/setup_environment/requirements.yml') }}-${{ env.CACHE_NUMBER }}
+        id: cache
+
+      - name: Update environment if there was no cached env.
+        run: mamba env update -n lidar_prod -f bash/setup_environment/requirements.yml
+        if: steps.cache.outputs.cache-hit != 'true'
+
+      # 2. Sphinx part: install tools and dependencies
+      - name: "Build Sphinx Doc"
+        working-directory: ./docs/
+        run: make html
+
+      # 3. Deploy to GitHub Pages
+      - name: "Deploy Github Pages"
+        uses: JamesIves/github-pages-deploy-action@3.7.1
+        with:
+          BRANCH: gh-pages  # <- Branch where generated doc files will be committed
+          FOLDER: ./docs/build/html/  # <- Dir where .nojekyll is created and from which to deploy github pages.
diff --git a/Dockerfile b/Dockerfile
index 6024aaf3..70087c17 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,49 +1,41 @@
-FROM nvidia/cuda:11.3.1-devel-ubuntu20.04
+FROM mambaorg/micromamba:latest

 # set the IGN proxy, otherwise apt-get and other applications don't work
+# from within our self-hosted action runner
 ENV http_proxy 'http://192.168.4.9:3128/'
 ENV https_proxy 'http://192.168.4.9:3128/'

-# set the timezone, otherwise it asks for it... and freezes
-ENV TZ=Europe/Paris
-RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
-
 # all the apt-get installs
+USER root
 RUN apt-get update && apt-get upgrade -y && apt-get install -y \
-        software-properties-common \
-        wget \
-        git \
-        postgis \
-        libgl1-mesa-glx libegl1-mesa libxrandr2 libxrandr2 libxss1 libxcursor1 libxcomposite1 libasound2 libxi6 libxtst6 # package needed for anaconda
-
-# install anaconda
-RUN wget --quiet https://repo.anaconda.com/archive/Anaconda3-2021.11-Linux-x86_64.sh -O ~/anaconda.sh
-RUN /bin/bash ~/anaconda.sh -b -p /opt/conda && \
-    rm ~/anaconda.sh
-ENV PATH /opt/conda/bin:$PATH
+    software-properties-common \
+    wget \
+    git \
+    postgis

-
-# /lidar becomes the working directory, where the repo content is copied.
+# /lidar becomes the working directory, where the repo content
+# (where this Dockerfile lives) is copied.
 WORKDIR /lidar
 COPY . .

-# install the python packages via anaconda
-RUN conda env create -f bash/setup_environment/requirements.yml
-
-# Make RUN commands use the new environment:
-SHELL ["conda", "run", "-n", "lidar_prod", "/bin/bash", "-c"]
+# Copy requirements so that pip installs can occur smoothly.
+COPY bash/setup_environment/requirements.yml /tmp/env.yaml
+COPY bash/setup_environment/requirements.txt /tmp/requirements.txt

+# install the python packages via anaconda
+RUN micromamba create --yes --file /tmp/env.yaml
+# Sets the environment name since it is not "base".
+# This ensures that it is activated when using "docker run ...".
+ENV ENV_NAME lidar_prod
+# Make RUN commands here use the environment
+ARG MAMBA_DOCKERFILE_ACTIVATE=1
+# List packages and their version
+RUN micromamba list

 # test if pdal is installed (a tricky library!)
 RUN echo "Make sure pdal is installed:"
 RUN python -c "import pdal"

-# the entrypoint garanties that all command will be runned in the conda environment
-ENTRYPOINT ["conda", \
-    "run", \
-    "-n", \
-    "lidar_prod"]
-
-# cmd for a normal run (non evaluate)
+# Example command to run the application from within the image
 CMD ["python", \
     "lidar_prod/run.py", \
     "print_config=true", \
diff --git a/README.md b/README.md
index 1c055cfb..daa536b5 100755
--- a/README.md
+++ b/README.md
@@ -1,172 +1,37 @@
-# Semantic Segmentation production - Fusion Module +# Lidar Prod - a tool for the production of Lidar semantic segmentation PyTorch Lightning Config: Hydra -[![](https://shields.io/badge/-Lightning--Hydra--Template-017F2F?style=flat&logo=github&labelColor=303030)](https://github.com/ashleve/lightning-hydra-template) +[![Documentation Build](https://github.com/IGNF/lidar-prod-quality-control/actions/workflows/gh-pages.yaml/badge.svg?event=push)](https://github.com/IGNF/lidar-prod-quality-control/actions/workflows/gh-pages.yaml) +[![CI/CD](https://github.com/IGNF/lidar-prod-quality-control/actions/workflows/cicd.yaml/badge.svg?event=push)](https://github.com/IGNF/lidar-prod-quality-control/actions/workflows/cicd.yaml) +


-## Description -### Context +## Context The Lidar HD project ambitions to map France in 3D using 10 pulse/m² aerial Lidar. The data will be openly available, including a semantic segmentation with a minimal number of classes: ground, vegetation, buildings, vehicles, bridges, others. -A simple geometric rule-based semantic segmentation algorithm was applied on 160km² of Lidar data in three areas, to identify its buildings. An audit of the resulting classification showed a large number of false positive. A thorough inspection and labelling was performed to evaluate the quality of this classification, with an identification of its false positive and false negative. At larger scale, this kind of human inspection would be intractable, and more powerful methods are needed to validate the quality of the segmentation before its diffusion. - -We therefore develop a production module which augments rules-based semantic segmentation algorithms with deep learning neural network predictions and a public building vector database. - -Components are: - -- `application.py`: Fuse together rules-based classification, deep learning building probabilities, and building database, highlighting area of uncertainty for a final human inspection. -- `optimization.py`: Multi-objective hyperparameter optimization of the bulding validation decision thresholds. - -### Process - -The end goal is to edit the input (rules-based) classification as much as we confidently can, and to highlight remaining areas of uncertainty for human inspection. - -**Input**: point cloud that went through a first geometric algorithm that identified `candidates building points` based on geometric rules (e.g. plane surfaces, above 1.5m of the ground, etc.), and for which a semantic segmentation model produced a point-level probability of being a building. The default name for this extra dimension is `building`. You can leverage this [package for aerial lidar deep learning segmentation](https://github.com/IGNF/lidar-deep-segmentation). - -#### A) Building Validation - -Goal: Confirm or refute groups of candidate building points when possible, mark them as unsure elsewise. - -1) Clustering of _candidate buildings points_ into connected components. -2) Point-level decision - 1) Identification of points with ambiguous probability: `high entropy` if entropy $\geq$ E1 - 2) Identification of points that are `overlayed` by a building vector from the database. - 3) Decision at the point-level based on probabilities : - 1) `confirmed` if: - 1) p$\geq$`C1`, or - 2) `overlayed` and p$\geq$ (`C1` * `Cr`), where `Cr` is a relaxation factor that reduces the confidence we require to confirm when a point overlayed by a building vector. - 2) `refuted` if (1-p)$\geq$`R1` -3) Group-level decision : - 1) Uncertain due to high entropy: if proportion of `high entropy` points $\geq$ `E2` - 2) Confirmation: if proportion of `confirmed` points $\geq$ `C2` OR if proportion of `overlayed` points $\geq$ `O1` - 3) Refutation: if proportion of `refuted` points $\geq$ `R2` AND proportion of `overlayed` points < `O1` - 4) Uncertainty: elsewise (this is a safeguard: uncertain groups are supposed to be already captured via their entropy) -4) Update of the point cloud classification - -Decision thresholds `E1`, `E2` , `C1`, `C2`, `R1`, `R2`, `O1` are chosen via a multi-objective hyperparameter optimization that aims to maximize automation, precision, and recall of the decisions. Right now we have automation=91%, precision=98.5%, recall=98.1% on a validation dataset. 
Illustration comes from older version. - -![](assets/img/LidarBati-BuildingValidationM7.1V2.0.png) - -#### B) Building Completion - -Goal: Confirm points that were too isolated to make up a group but have high-enough probability nevertheless (e.g. walls) - -Among _candidate buildings points_ that have not been clustered in previous step due, identify those which nevertheless meet the requirement to be `confirmed`. -Cluster them together with previously confirmed building points in a relaxed, vertical fashion (higher tolerance, XY plan). -For each cluster, if some points were confirmed, the others are considered to belong to the same building, and are -therefore confirmed as well. - -![](assets/img/LidarBati-BuildingCompletion.png) - - -#### C) Building Identification - -Goal: Highlight potential buildings that were missed by the rule-based algorithm, for human inspection. - -Among points that were **not** _candidate buildings points_ identify those which meet the requirement to be `confirmed`, and cluster them. - -This clustering defines a LAS extra dimensions (`Group`) which indexes newly found cluster that may be some missed buildings. - -![](assets/img/LidarBati-BuildingIdentification.png) - - -## Usage - -### Install dependencies - -```yaml -# clone project -git clone https://github.com/IGNF/lidar-prod-quality-control -cd lidar-prod-quality-control - -# install conda -https://www.anaconda.com/products/individual - - -# create conda environment (you may need to run lines manually as conda may not activate properly from bash script) -source bash/setup_environment/setup_env.sh - -# install postgis to request building database -sudo apt-get install postgis - -# activate using -conda activate lidar_prod -``` - -### Use application as a package - -To run the module from anywhere, you can install as a package in a your virtual environment. - -```bash -# activate an env matching ./bash/setup_env.sh requirements. -conda activate lidar_prod - -# install the package -pip install --upgrade https://github.com/IGNF/lidar-prod-quality-control/tarball/prod # from github directly, using production branch -pip install -e . # from local sources -``` - -To run the module as a package, you will need a source cloud point in LAS format with an additional channel containing predicted building probabilities. The name of this channel is specified by `config.data_format.las_dimensions.ai_building_proba`. - -To run using default configurations of the installed package, use -```bash -python -m lidar_prod.run paths.src_las=[/path/to/file.las] -``` - -You can override the yaml file with flags `--config-path` and `--config-name`. You can also override specific parameters. By default, results are saved to a `./outputs/` folder, but this can be overriden with `paths.output_dir` parameter. See [hydra documentation](https://hydra.cc/docs/next/tutorials/basic/your_first_app/config_file/) for reference on overriding syntax. - -To print default configuration run `python -m lidar_prod.run -h`. For pretty colors, run `python -m lidar_prod.run print_config=true`. - -### Run sequentialy on multiple files - -Hydra supports running the python script with several different values for a parameter via a `--multiruns`|`-m` flag and values separated by a comma. - -```bash -python -m lidar_prod.run --multiruns paths.src_las=[file_1.las],[file_2.las],[file_3.las] -``` - -## Development - -### Use application from source - -Simply run from python sources directly -```bash -# activate an env matching ./bash/setup_env.sh requirements. 
-conda activate lidar_prod
-python lidar_prod/run.py paths.src_las=[/path/to/file.las]
-```
-
-### Optimization and evaluation of building validation decision thresholds
-
-Run a multi-objectives hyperparameters optimization of the decision thresholds, to maximize recall and precision directly while also maximizing automation. For this, you need a set of LAS with 1) a channel with predicted building probability, 2) a classification with labels that distinguish false positive, false negative, and true positive from a rules-based building classification.
+To produce this classification, geometric rules-based classifications are familiar and present advantages such as scalability, high geometric regularity, and predictability. But rules-based algorithms often lack the fine-grained understanding needed for complex Lidar scenes, which results in a need for time-consuming human correction.

-```bash
-conda activate lidar_prod
-python lidar_prod/run.py +task=optimize building_validation.optimization.todo='prepare+optimize+evaluate+update' building_validation.optimization.paths.input_las_dir=[path/to/labelled/val/dataset/] building_validation.optimization.paths.results_output_dir=[path/to/save/results]
-```
-Nota: to run on a single file during development, add a `+building_validation.optimization.debug=true` flag to the command line.
+Additionally, some valuable information exists in 2D public geographical databases, but leveraging it for point cloud classification is not straightforward considering database incompleteness, potential out-of-dateness, and frequent x-y offsets.

-Optimized decision threshold will be pickled inside the results directory.
+Considering the scale of this task, deep learning is leveraged as a production tool. A [deep learning library](https://github.com/IGNF/lidar-deep-segmentation) was developed with a focused scope: the multiclass semantic segmentation of large-scale, high-density aerial Lidar point clouds. Using a classification produced directly by a deep learning model might be tempting, but it usually presents limitations, including unexpected failure modes, inconsistent geometric regularity, and noise.

-To evaluate the optimized module on a test set, change input las folder, and rerun. You need to specify that no optimization is required using the `todo` params. You also need to give the path to the pickled decision trheshold.
+## Content

-```bash
-conda activate lidar_prod
-python lidar_prod/run.py +task=optimize building_validation.optimization.todo='prepare+evaluate+update' building_validation.optimization.paths.input_las_dir=[path/to/labelled/test/dataset/] building_validation.optimization.paths.results_output_dir=[path/to/save/results] building_validation.optimization.paths.building_validation_thresholds_pickle=[path/to/optimized_thresholds.pickle]
-```
+Lidar-Prod is a production library which aims at augmenting rules-based semantic segmentation algorithms with deep learning neural network predictions (probabilities) and a public building vector database (BDUni). Its main entry-points are:

-### CICD and versions
+- `application.py`: The application takes a point cloud and updates its Classification dimension based on its deep learning predictions and a public geographic database.
+- `optimization.py`: The right balance between automation of decisions and errors is found via a multi-objective optimization of the decision thresholds, by means of a simple genetic algorithm (see the sketch below).
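+
+To illustrate the kind of optimization involved, here is a minimal optuna sketch of a multi-objective study similar to the one used for thresholds optimization. The objective body and bounds are placeholders, not the actual production code:
+
+```python
+import optuna
+
+def objective(trial: optuna.Trial):
+    # Suggest a candidate set of decision thresholds (illustrative subset).
+    c1 = trial.suggest_float("min_confidence_confirmation", 0.0, 1.0)
+    r1 = trial.suggest_float("min_confidence_refutation", 0.0, 1.0)
+    # Placeholder metrics: the real objective replays decisions on prepared clusters.
+    automation = (c1 + r1) / 2
+    precision = 1.0 - abs(c1 - 0.9) / 2
+    recall = 1.0 - abs(r1 - 0.9) / 2
+    return automation, precision, recall
+
+# NSGA-II is the genetic algorithm used for this multi-objective search.
+study = optuna.create_study(
+    directions=["maximize", "maximize", "maximize"],
+    sampler=optuna.samplers.NSGAIISampler(),
+)
+study.optimize(objective, n_trials=50)
+print(study.best_trials)  # Pareto front of non-dominated threshold sets
+```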
-New features are staged in the `dev` branch, and CICD workflow is run when a pull requets to merge is created.
-In Actions, check the output of a full evaluation on a single LAS to spot potential regression. The app is also run
-on a subset of a LAS, which can be visually inspected before merging - there can always be surprises.
+Our strategy is to fuse together different sources of information (rules-based classification, deep learning predictions, databases), so that we can ensure a high-quality classification while minimizing the need for human correction. Deep learning probabilities might also be used to highlight areas of uncertainty, or to spot elements that were missed by the other approaches.

-Package version follows semantic versionning conventions and is defined in `setup.py`.
+Right now, the class `building` is the only one that is addressed. The extension to other classes depends on the training of a multiclass AI model, which requires high-quality training datasets that are currently being produced.

-Releases are generated when new high-level functionnality are implemented (e.g. a new step in the production process), with a documentation role. Production-ready code is fast-forwarded in the `prod` branch when needed.
\ No newline at end of file
+> Please refer to the documentation for [installation and usage](https://ignf.github.io/lidar-prod-quality-control/tutorials/install.html).
+
+> Please refer to the documentation to understand the [production process](https://ignf.github.io/lidar-prod-quality-control/background/production_process.html).
diff --git a/bash/setup_environment/requirements.txt b/bash/setup_environment/requirements.txt
index eac64ecf..a4178ae2 100755
--- a/bash/setup_environment/requirements.txt
+++ b/bash/setup_environment/requirements.txt
@@ -4,3 +4,16 @@ postgis-toolkit
 hydra-core==1.1.*
 hydra-colorlog==1.1.*
 optuna==2.10.*
+
+# --------- RST Linter --------- #
+rstcheck==3.3.*
+
+# --------- Documentation --------- #
+sphinx==4.5.*
+sphinx_rtd_theme==1.0.*
+myst_parser==0.17.*
+sphinx_paramlinks==0.5.*
+recommonmark==0.7.*
+sphinxnotes-mock==1.0.0b0 # still a beta
+sphinx-argparse==0.3.*
+docutils==0.17
\ No newline at end of file
diff --git a/bash/setup_environment/setup_env.sh b/bash/setup_environment/setup_env.sh
index 8ee1ab82..4009f400 100755
--- a/bash/setup_environment/setup_env.sh
+++ b/bash/setup_environment/setup_env.sh
@@ -3,6 +3,10 @@
 # Be sure that you are using last pip version
 # by running pip install --upgrade pip

+# Run this from a bash using
+# source bash/setup_environment/setup_env.sh
+
+
 set -e

 conda install mamba --yes -n base -c conda-forge # mamba is a conda on steroids
diff --git a/configs/building_validation/optimization/default.yaml b/configs/building_validation/optimization/default.yaml
index fc611634..5d1bc3d7 100644
--- a/configs/building_validation/optimization/default.yaml
+++ b/configs/building_validation/optimization/default.yaml
@@ -16,16 +16,19 @@ paths:
   building_validation_thresholds_pickle: ${.results_output_dir}/optimized_thresholds.pickle # Wher

-# CLASSIFICATION CODES of dataset 20211001_building_val, which was inspected and labeled post TerraSolid macro
+# CLASSIFICATION CODES of a dataset which was inspected
+# and labeled post TerraSolid macro
 # Those are used to override the app default codes.
-labels_from_20211001_building_val:
+buildings_correction_labels:
   codes:
     true_positives: [19] # building that was found by the macro
    false_positives: [20, 110, 112, 114, 115] # e.g. trees, hedge, trucks
    false_negatives: [21] # e.g. buildings under vegetation, low building
+  # Sometimes a cluster will be ambiguous and we need
+  # thresholds to decide if it is a TP or FP.
   min_frac:
-    true_positives: 0.95 # >=x% of confirmed points --> a building
-    false_positives: 0.05 # not a building
+    true_positives: 0.95 # >=x% of confirmed points --> cluster is a building
+    false_positives: 0.05 # cluster is not a building


 study:
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 00000000..2a52d303
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,24 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+
+# If you want to regenerate all docs, you can use the following command,
+# which WILL ERASE CREATED RST files:
+# sphinx-apidoc -o "$(SOURCEDIR)"/apidoc_new/ ./../
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/source/apidoc/lidar_prod.commons.rst b/docs/source/apidoc/lidar_prod.commons.rst
new file mode 100644
index 00000000..3e506b7f
--- /dev/null
+++ b/docs/source/apidoc/lidar_prod.commons.rst
@@ -0,0 +1,12 @@
+lidar\_prod.commons
+===========================
+
+
+lidar\_prod.commons.commons module
+----------------------------------
+
+.. automodule:: lidar_prod.commons.commons
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
diff --git a/docs/source/apidoc/lidar_prod.rst b/docs/source/apidoc/lidar_prod.rst
new file mode 100644
index 00000000..b093f9a9
--- /dev/null
+++ b/docs/source/apidoc/lidar_prod.rst
@@ -0,0 +1,7 @@
+lidar\_prod.run
+===================
+
+.. automodule:: lidar_prod.run
+    :members:
+    :undoc-members:
+    :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/apidoc/lidar_prod.tasks.rst b/docs/source/apidoc/lidar_prod.tasks.rst
new file mode 100644
index 00000000..91072de7
--- /dev/null
+++ b/docs/source/apidoc/lidar_prod.tasks.rst
@@ -0,0 +1,53 @@
+lidar\_prod.tasks
+=========================
+
+building\_validation
+---------------------------------------------
+
+.. automodule:: lidar_prod.tasks.building_validation
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+building\_validation\_optimization
+-----------------------------------------------------------
+
+.. automodule:: lidar_prod.tasks.building_validation_optimization
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+building\_completion
+---------------------------------------------
+
+.. automodule:: lidar_prod.tasks.building_completion
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+building\_identification
+-------------------------------------------------
+
+.. automodule:: lidar_prod.tasks.building_identification
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+
+
+cleaning
+---------------------------------
+
+.. automodule:: lidar_prod.tasks.cleaning
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+utils
+------------------------------
+
+.. automodule:: lidar_prod.tasks.utils
+    :members:
+    :undoc-members:
+    :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/background/production_process.md b/docs/source/background/production_process.md
new file mode 100644
index 00000000..1a07f13f
--- /dev/null
+++ b/docs/source/background/production_process.md
@@ -0,0 +1,57 @@
+# Production process used to transform point cloud classification
+
+The end goal of the tool is to edit the input (rules-based) classification as much as we confidently can, and to highlight remaining areas of uncertainty for human inspection.
+
+**Input**: point cloud that went through a first geometric algorithm that identified `candidate building points` based on geometric rules (e.g. plane surfaces, above 1.5m of the ground, etc.), and for which a semantic segmentation model produced a point-level probability of being a building. The default name for this extra dimension is `building`. You can leverage this [package for aerial lidar deep learning segmentation](https://github.com/IGNF/lidar-deep-segmentation).
+
+## A) Building Validation
+
+**Goal**: Confirm or refute groups of candidate building points when possible, mark them as unsure otherwise.
+
+1) Clustering of _candidate building points_ into connected components.
+2) Point-level decision
+   1) Identification of points with ambiguous probability: `high entropy` if entropy >= E1
+   2) Identification of points that are `overlayed` by a building vector from the database.
+   3) Decision at the point-level based on probabilities:
+      1) `confirmed` if:
+         1) p >= `C1`, or
+         2) `overlayed` and p >= (`C1` * `Cr`), where `Cr` is a relaxation factor that reduces the confidence we require to confirm a point that is overlayed by a building vector.
+      2) `refuted` if (1-p) >= `R1`
+3) Group-level decision:
+   1) Uncertain due to high entropy: if proportion of `high entropy` points >= `E2`
+   2) Confirmation: if proportion of `confirmed` points >= `C2` OR if proportion of `overlayed` points >= `O1`
+   3) Refutation: if proportion of `refuted` points >= `R2` AND proportion of `overlayed` points < `O1`
+   4) Uncertainty: otherwise (this is a safeguard: uncertain groups are supposed to be already captured via their entropy)
+4) Update of the point cloud classification
+
+Decision thresholds `E1`, `E2`, `C1`, `C2`, `R1`, `R2`, `O1` are chosen via a [multi-objective hyperparameter optimization](/background/thresholds_optimization_process.md) that aims to maximize automation, precision, and recall of the decisions.
+Current performances on a 15km² validation dataset, expressed as percentages of clusters, are:
+- Automation=91%
+- Precision=98.5%
+- Recall=98.1%
+
+![](/img/LidarBati-BuildingValidationM7.1V2.0.png)
+
+## B) Building Completion
+
+**Goal**: Confirm points that were too isolated to make up a group but have a high-enough probability nevertheless (e.g. walls).
+
+Among _candidate building points_ that have not been clustered in the previous step, identify those which nevertheless meet the requirement to be `confirmed`.
+Cluster them together with previously confirmed building points in a relaxed, vertical fashion (higher tolerance, XY plane).
+For each cluster, if some points were confirmed, the others are considered to belong to the same building, and are
+therefore confirmed as well.
+
+![](/img/LidarBati-BuildingCompletion.png)
+
+
+## C) Building Identification
+
+**Goal**: Highlight potential buildings that were missed by the rule-based algorithm, for human inspection.
+
+Among points that were **not** _candidate building points_, identify those which meet the requirement to be `confirmed`, and cluster them.
+
+This clustering defines a LAS extra dimension (`Group`) which indexes newly found clusters that may be missed buildings.
+
+![](/img/LidarBati-BuildingIdentification.png)
+
+
diff --git a/docs/source/background/thresholds_optimization_process.md b/docs/source/background/thresholds_optimization_process.md
new file mode 100644
index 00000000..1fa1101c
--- /dev/null
+++ b/docs/source/background/thresholds_optimization_process.md
@@ -0,0 +1,18 @@
+# Strategy to find optimal decision thresholds for Building validation
+
+## Motivations
+
+As described in the [Building Validation](./production_process.md) section of the production process, the decision to validate or not a group of candidate buildings is based on several decision thresholds. Those thresholds represent different levels of confidence, for different sources of data.
+
+They may depend on the AI model which produces the probabilities as well as on the rules-based classification from which the clusters of candidates are derived. They are highly coupled. For instance, if a lower probability is required at the point level to be confirmed as a building (threshold `C1`), we might require a higher percentage of confirmed points in a cluster of candidates (threshold `C2`) to validate it. They must therefore be optimized jointly.
+
+These thresholds define how much we automate decisions, but also the quantity of errors we may introduce: there is a balance to be found between `recall` (proportion of building groups that were confirmed), `precision` (proportion of buildings among confirmed groups), and `automation` (proportion of groups for which a decision was made, i.e. that are not flagged as "unsure").
+
+## Strategy
+
+We approach the choice of decision thresholds as a constrained multi-objective hyperparameter optimization.
+We use the [NSGA-II](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.samplers.NSGAIISampler.html#optuna.samplers.NSGAIISampler) algorithm from the optuna optimization library.
+
+The constraints are defined empirically: recall >= 98% and precision >= 98%. The genetic algorithm searches to maximize the three objectives, while focusing the search on solutions that meet these criteria.
+
+After a chosen number of generations, the genetic algorithm outputs the [Pareto front](https://en.wikipedia.org/wiki/Pareto_front), i.e. the set of Pareto-efficient solutions for which no objective criterion could be increased without another one being reduced. Among Pareto-efficient solutions compliant with the constraints, the final solution is the set of thresholds that maximizes the product of precision, recall, and automation.
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 00000000..4e222ad0
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,143 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+
+import os
+import sys
+import yaml
+
+# from unittest import mock
+
+from hydra.experimental import compose, initialize
+from omegaconf import OmegaConf
+
+
+rel_root_path = "./../../"
+abs_root_path = os.path.abspath(rel_root_path)
+sys.path.insert(0, abs_root_path)
+
+
+# -- Project information -----------------------------------------------------
+with open(os.path.join(abs_root_path, "package_metadata.yaml"), "r") as f:
+    pm = yaml.safe_load(f)
+
+release = pm["__version__"]
+project = pm["__name__"]
+author = pm["__author__"]
+copyright = pm["__copyright__"]
+
+# -- Compose and save the default hydra config to include it in the doc ------
+# We need to concatenate configs into a single file using hydra
+with initialize(config_path=os.path.join(rel_root_path, "configs/"), job_name="config"):
+    cfg = compose(config_name="config")
+    print(OmegaConf.to_yaml(cfg))
+    build_dir = "../build"
+    os.makedirs(build_dir, exist_ok=True)
+    OmegaConf.save(cfg, os.path.join(build_dir, "default_config.yml"), resolve=False)
+
+# -- General configuration ---------------------------------------------------
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+language = "en"
+
+# generate autosummary pages
+autosummary_generate = True
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+needs_sphinx = "4.5"
+extensions = [
+    "sphinx.ext.napoleon",  # Supports google-style docstrings
+    "sphinx.ext.autodoc",  # auto-generates doc from docstrings
+    "sphinx.ext.intersphinx",  # link to other docs
+    "sphinx.ext.viewcode",  # creates links to view code sources in a new web page
+    "sphinx.ext.githubpages",  # creates .nojekyll file to publish the doc on GitHub Pages.
+    "myst_parser",  # supports markdown syntax for doc pages, and links to markdown pages
+    "sphinx_paramlinks",  # allows referencing params, which is done in pytorch_lightning
+    "sphinxnotes.mock",  # ignores third-party directives such as "testcode" - see "mock_directives" args below
+]
+
+# See https://myst-parser.readthedocs.io/en/latest/syntax/optional.html
+# To augment markdown parsing
+# myst_enable_extensions = [
+#     "amsmath",
+#     "colon_fence",
+#     "deflist",
+#     "dollarmath",
+#     "fieldlist",
+#     "html_admonition",
+#     "html_image",
+#     "linkify",
+#     "replacements",
+#     "smartquotes",
+#     "strikethrough",
+#     "substitution",
+#     "tasklist",
+# ]
+
+# Generates slug names for markdown titles
+myst_heading_anchors = 3
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+
+html_theme = "sphinx_rtd_theme"
+
+html_theme_options = {
+    "collapse_navigation": False,
+    "display_version": True,
+    "navigation_depth": 2,
+}
+
+
+intersphinx_mapping = {
+    "python": ("https://docs.python.org/", None),
+    # TODO: "unknown or unsupported inventory version" error for numpy doc.
+    # 'numpy': ('http://docs.scipy.org/doc/numpy', None),
+    "pandas": ("http://pandas.pydata.org/pandas-docs/dev", None),
+}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
+modindex_common_prefix = ["lidar_prod."]
+
+# to_mock = [
+#     "tqdm",
+#     "pdal",
+#     "python-pdal",
+#     "hydra",
+#     "laspy",
+# ]
+
+
+# for m in to_mock:
+#     sys.modules[m] = mock.Mock(name=m)
+
+# autodoc_mock_imports = []
+# for m in ["numpy", "pdal", ]:
+#     autodoc_mock_imports.append(m)
+
+# mock_directives = ["testcode"]
diff --git a/docs/source/configs.rst b/docs/source/configs.rst
new file mode 100644
index 00000000..b06f863b
--- /dev/null
+++ b/docs/source/configs.rst
@@ -0,0 +1,11 @@
+Default configuration
+===============================
+
+Configurations are managed with hydra_. Here, we show the default configuration at a glance.
+
+Refer to the source configuration files in the `configs` folder for more information.
+
+.. _hydra: https://hydra.cc/
+
+.. literalinclude:: ../build/default_config.yml
+    :language: yaml
diff --git a/docs/source/guides/development.md b/docs/source/guides/development.md
new file mode 100644
index 00000000..073b468b
--- /dev/null
+++ b/docs/source/guides/development.md
@@ -0,0 +1,32 @@
+# Developer's guide
+
+## Code versioning
+
+Package version follows semantic versioning conventions and is defined in `setup.py`.
+
+Releases are generated when new high-level functionalities are implemented (e.g. a new step in the production process), with a documentation role. Production-ready code is fast-forwarded into the `prod` branch when needed to match the `main` branch.
+
+## Tests
+
+Tests can be run in an activated environment with:
+
+```bash
+conda activate lidar_prod
+python -m pytest
+```
+
+One test depends on a large, non-versioned file (665MB), which is accessible from the self-hosted action runner, but not publicly available at the moment. The absence of the file makes the test xfail so that it is not required for local development.
+
+## Continuous Integration (CI)
+
+New features are developed in ad-hoc branches (e.g. `refactor-database-query`), and merged into the `dev` branch. When ready, `dev` can be merged into `main`.
+
+CI tests are run for pull requests targeting either the `dev` or `main` branch, and on pushes to `dev`, `main`, and `prod`. The CI workflow builds a docker image, runs linting, and tests the code.
+
+## Continuous Delivery (CD)
+
+When the event is a push and not a pull request, this means that there was either a direct push to `dev`|`main`|`prod` or that a pull request was accepted. In this case, if the CI workflow passes, the docker image is tagged with the branch name, resulting in e.g. a `lidar_prod_im:prod` image that is up to date with the branch content. See the [usage tutorial](../tutorials/use.md) for how to leverage such an image to run the app.
+
+Additionally, pushes on the `main` branch build this library's documentation, which is hosted on Github pages.
+
+
diff --git a/docs/source/guides/thresholds_optimization.md b/docs/source/guides/thresholds_optimization.md
new file mode 100644
index 00000000..fbe289d6
--- /dev/null
+++ b/docs/source/guides/thresholds_optimization.md
@@ -0,0 +1,59 @@
+# How to optimize building validation decision thresholds?
+
+This guide explains how to optimize the decision thresholds following the strategy described in [this note](../background/thresholds_optimization_process.md).
+
+## Requirements
+
+To optimize the decision thresholds, you must be able to evaluate the level of automation that can be reached on data that matches production data. As a result, you need _corrected_ data, i.e. data for which a rules-based classification was corrected and for which you keep track of the corrections that were made. For building validation, the classification must have codes to distinguish false positives, false negatives, and true positives. These codes may be configured with parameter `buildings_correction_labels` under configuration group `building_validation.optimization`.
+
+Furthermore, the point cloud data must include predictions from the deep learning model trained to detect buildings. This consists of two channels: a `building` channel with predicted probabilities and an `entropy` channel.
+
+A large validation dataset might help getting a better sense of the app's performance. We used 15km² of corrected data to optimize thresholds, but a larger set might provide more diversity. This being said, performance on an unseen test set was almost equal to performance on the validation set, which indicates a robust evaluation for such a volume of data.
+
+
+## Running thresholds optimization
+
+### Finding optimal thresholds
+
+> Refer to the [installation tutorial](../tutorials/install.md) to set up your python environment.
+
+Your corrected data must live in a single `input_las_dir` directory as a set of LAS files.
+Prepared and updated files will be saved in subfolders of a `results_output_dir` directory (`./prepared/` and `./updated/`, respectively).
+They will keep the same basename as the original files.
+Be sure that the `data_format` configurations match your data, in particular the (classification) `codes` and `las_dimensions` configuration groups.
+A `todo` string parameter specifies the steps to run by including 1 or more of the following keywords: `prepare` | `optimize` | `evaluate` | `update`.
+
+Run the full optimization module with
+
+```bash
+conda activate lidar_prod
+
+python lidar_prod/run.py \
++task=optimize building_validation.optimization.todo='prepare+optimize+evaluate+update' \
+building_validation.optimization.paths.input_las_dir=[path/to/labelled/val/dataset/] \
+building_validation.optimization.paths.results_output_dir=[path/to/save/results]
+```
+
+### Evaluation of optimized thresholds on a test set
+
+Once an optimal solution was found, you may want to evaluate the decision process on unseen data to assess its generalization capability. For that, you will need another test folder of corrected data in the same format as before (a different `input_las_dir`). You need to specify that no optimization is required using the `todo` params. You also need to give the path to the pickled decision thresholds from the previous step, and specify a different `results_output_dir` so that prepared data from the test and validation sets are not pooled together.
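+
+The pickled thresholds are simply a serialized dataclass instance, so they can be inspected directly. A minimal sketch (the path is a placeholder to adapt to your own `results_output_dir`):
+
+```python
+import pickle
+
+# Hypothetical path: use the results_output_dir from the previous step.
+with open("path/to/save/results/optimized_thresholds.pickle", "rb") as f:
+    thresholds = pickle.load(f)
+
+print(thresholds)  # dataclass of optimized decision thresholds
+```
+
+Then, run the evaluation: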
+
+```bash
+conda activate lidar_prod
+
+python lidar_prod/run.py \
++task=optimize \
+building_validation.optimization.todo='prepare+evaluate+update' \
+building_validation.optimization.paths.input_las_dir=[path/to/labelled/test/dataset/] \
+building_validation.optimization.paths.results_output_dir=[path/to/save/results] \
+building_validation.optimization.paths.building_validation_thresholds_pickle=[path/to/optimized_thresholds.pickle]
+```
+
+### Utils
+
+Debug mode: to run on a single file during development, add a `+building_validation.optimization.debug=true` flag to the command line.
+
+
+Reference:
+- [Deb et al. (2002) - A fast and elitist multiobjective genetic algorithm\: NSGA-II](https://ieeexplore.ieee.org/document/996017).
\ No newline at end of file
diff --git a/assets/img/LidarBati-BuildingCompletion.png b/docs/source/img/LidarBati-BuildingCompletion.png
similarity index 100%
rename from assets/img/LidarBati-BuildingCompletion.png
rename to docs/source/img/LidarBati-BuildingCompletion.png
diff --git a/assets/img/LidarBati-BuildingIdentification.png b/docs/source/img/LidarBati-BuildingIdentification.png
similarity index 100%
rename from assets/img/LidarBati-BuildingIdentification.png
rename to docs/source/img/LidarBati-BuildingIdentification.png
diff --git a/assets/img/LidarBati-BuildingValidationM7.1V2.0.png b/docs/source/img/LidarBati-BuildingValidationM7.1V2.0.png
similarity index 100%
rename from assets/img/LidarBati-BuildingValidationM7.1V2.0.png
rename to docs/source/img/LidarBati-BuildingValidationM7.1V2.0.png
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 00000000..426c45a7
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,48 @@
+:github_url: https://github.com/IGNF/lidar-prod-quality-control
+
+Lidar-Prod > Documentation
+===================================================
+
+.. include:: introduction.md
+   :parser: myst_parser.sphinx_
+
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Background
+
+   background/production_process
+   background/thresholds_optimization_process
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Getting Started
+
+   tutorials/install
+   tutorials/use
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Guides
+
+   guides/thresholds_optimization
+   guides/development
+
+
+.. TODO: ensure that all docstrings are in third-person mode.
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Package Reference
+
+   apidoc/lidar_prod
+   apidoc/lidar_prod.tasks
+   apidoc/lidar_prod.commons
+   configs
+
+
+Indices and Tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
\ No newline at end of file
diff --git a/docs/source/introduction.md b/docs/source/introduction.md
new file mode 100644
index 00000000..27001dd3
--- /dev/null
+++ b/docs/source/introduction.md
@@ -0,0 +1,8 @@
+Lidar-Prod is a production library which aims at augmenting rules-based semantic segmentation algorithms with deep learning neural network predictions (probabilities) and a public building vector database (BDUni). Its main entry-points are:
+
+- `application.py`: The application takes a point cloud and updates its Classification dimension based on its deep learning predictions and a public geographic database.
+- `optimization.py`: The right balance between automation of decisions and errors is found via a multi-objective optimization of the decision thresholds, by means of a simple genetic algorithm.
+
+Our strategy is to fuse together different sources of information (rules-based classification, deep learning predictions, databases), so that we can ensure a high-quality classification while minimizing the need for human correction. Deep learning probabilities might also be used to highlight areas of uncertainty, or to spot elements that were missed by the other approaches.
+
+Right now, the class `building` is the only one that is addressed. The extension to other classes depends on the training of a multiclass AI model, which requires high-quality training datasets that are currently being produced.
\ No newline at end of file
diff --git a/docs/source/tutorials/install.md b/docs/source/tutorials/install.md
new file mode 100644
index 00000000..4b9122b2
--- /dev/null
+++ b/docs/source/tutorials/install.md
@@ -0,0 +1,45 @@
+# Installation
+
+## Set up a virtual environment
+
+We use [Anaconda](https://anaconda.org/) to manage and isolate dependencies.
+The provided environment setup script also installs [Mamba](https://mamba.readthedocs.io/en/latest/index.html),
+which sits on top of conda for faster environment installs.
+
+```bash
+# clone project
+git clone https://github.com/IGNF/lidar-prod-quality-control
+cd lidar-prod-quality-control
+
+# install conda
+# see https://www.anaconda.com/products/individual
+
+# you need to install postgis to request a public database
+sudo apt-get install postgis
+
+# create conda environment
+source bash/setup_environment/setup_env.sh
+
+# activate the virtual env
+conda activate lidar_prod
+```
+
+## Install the app as a python module
+
+To run the application from anywhere, you can install it as a module in your virtual environment.
+
+```bash
+# activate your env
+conda activate lidar_prod
+
+# install the package from github directly, using the production branch
+pip install --upgrade https://github.com/IGNF/lidar-prod-quality-control/tarball/prod
+
+```
+
+During development, install in editable mode directly from source with
+
+```bash
+pip install --editable .
+```
+
+Then, refer to the [usage page](./use.md).
\ No newline at end of file
diff --git a/docs/source/tutorials/use.md b/docs/source/tutorials/use.md
new file mode 100644
index 00000000..d485881c
--- /dev/null
+++ b/docs/source/tutorials/use.md
@@ -0,0 +1,43 @@
+# Using the app
+
+## Run within a docker container
+
+Up-to-date docker images (named `lidar_prod_im`) are created via Github Actions (see the [Developer's guide](../guides/development.md)).
+
+To run the app, use
+
+```bash
+docker run \
+-v {local_src_las_dir}:/inputs/ \
+-v {local_output_dir}:/outputs/ \
+lidar_prod_im \
+python lidar_prod/run.py \
+paths.src_las=/inputs/{src_las_basename}.las \
+paths.output_dir=/outputs/
+# + other options...
+
+```
+
+A docker image encapsulating the virtual environment and application sources can also be built using the provided Dockerfile. This Dockerfile is not standalone and must be part of the repository (whose content is copied into the image), at the git reference you want to build from.
+
+## Run as a python module
+
+To run the app as a python module, you will need a source point cloud in LAS format with an additional channel containing predicted building probabilities (`ai_building_proba`) and another channel containing prediction entropy (`entropy`). The names of these channels can be specified via the hydra config `config.data_format.las_dimensions`.
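+
+Before running, you may want to check that your input file actually carries these channels. A minimal sketch using laspy (the dimension names shown are the defaults and may be overridden in your configuration; the file path is a placeholder):
+
+```python
+import laspy
+
+las = laspy.read("/path/to/file.las")  # hypothetical input file
+dims = set(las.point_format.dimension_names)
+# Default channel names for probabilities and entropy.
+missing = {"building", "entropy"} - dims
+print("Missing channels:", missing or "none")
+```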
+
+To run using default configurations of the installed module, use
+
+```bash
+python -m lidar_prod.run paths.src_las=[/path/to/file.las]
+```
+
+You can specify a different yaml config file with the flags `--config-path` and `--config-name`. You can also override specific parameters. By default, results are saved to a `./outputs/` folder, but this can be overridden with the `paths.output_dir` parameter. Refer to the [hydra documentation](https://hydra.cc/docs/next/tutorials/basic/your_first_app/config_file/) for the overriding syntax.
+
+To print the default configuration, run `python -m lidar_prod.run -h`. For pretty colors, run `python -m lidar_prod.run print_config=true`.
+
+## Run from source directly
+
+For developments and debugging, you can run the package directly from python sources instead:
+
+```bash
+# activate an env matching ./bash/setup_env.sh requirements.
+conda activate lidar_prod
+python lidar_prod/run.py paths.src_las=[/path/to/file.las]
+```
diff --git a/lidar_prod/run.py b/lidar_prod/run.py
index 7dad4a00..c8f22bd1 100755
--- a/lidar_prod/run.py
+++ b/lidar_prod/run.py
@@ -6,6 +6,11 @@
 @hydra.main(config_path="../configs/", config_name="config.yaml")
 def main(config: DictConfig):
+    """Main entry point to either apply or optimize thresholds.
+
+    Check the configuration files for usage.
+
+    """
     # Imports should be nested inside @hydra.main to optimize tab completion
     # Read more here: https://github.com/facebookresearch/hydra/issues/934
diff --git a/lidar_prod/tasks/building_validation_optimization.py b/lidar_prod/tasks/building_validation_optimization.py
index 6fc58473..27d62c32 100644
--- a/lidar_prod/tasks/building_validation_optimization.py
+++ b/lidar_prod/tasks/building_validation_optimization.py
@@ -27,7 +27,33 @@ def constraints_func(trial):

 class BuildingValidationOptimizer:
-    """Optimization logic for the BuildingValidation decision thresholds."""
+    r"""Optimizer of the decision thresholds used by `BuildingValidator`.
+
+    In lidar-prod, each task is implemented by a dedicated python class.
+    Building Validation is implemented via a :class:`BuildingValidator` class.
+    We make sure that all parameters used for optimization are the ones we
+    actually use in production.
+
+    For a higher internal cohesion, `BuildingValidator` does not know
+    anything about optimization, which is taken care of by a
+    `BuildingValidationOptimizer` python class.
+    Two dataclasses are used to connect the two objects.
+    `BuildingValidationClusterInfo` describes the cluster-level information
+    necessary to perform a validation.
+    `thresholds` describes the different thresholds used
+    in `BuildingValidator` and optimized in `BuildingValidationOptimizer`.
+
+    In Building Validation, the most time-consuming step is the preparation
+    of data, including the clustering of candidate building points and the
+    overlay of vectors of buildings from a public database:
+    up to several minutes per km² of data.
+    The `BuildingValidationOptimizer` breaks down the Building Validation steps to make
+    sure that data preparation only occurs once.
+    All outputs and intermediary files are stored in a `results_output_dir`
+    directory, so that operations may be resumed at any step, for instance
+    to rerun a thresholds optimization with a different optimizer configuration.
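+
+    A minimal usage sketch (assuming an optimizer instance ``bvo`` built from
+    the hydra configuration; the call order mirrors the ``todo`` keywords)::
+
+        bvo.prepare()
+        bvo.optimize()
+        bvo.evaluate()
+        bvo.update()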
+
+    """

     def __init__(
         self,
@@ -36,7 +62,7 @@ def __init__(
         building_validator: BuildingValidator,
         study: optuna.Study,
         design: Any,
-        labels_from_20211001_building_val: Any,
+        buildings_correction_labels: Any,
         use_final_classification_codes: bool = False,
         debug=False,
     ):
@@ -46,7 +72,7 @@ def __init__(
         self.bv = building_validator
         self.study = study
         self.design = design
-        self.labels_from_20211001_building_val = labels_from_20211001_building_val
+        self.buildings_correction_labels = buildings_correction_labels
         self.use_final_classification_codes = use_final_classification_codes
         self.setup()
@@ -85,8 +111,8 @@ def setup(self):
         # We must adapt BuildingValidator to corrected data by specifying the codes to use as candidates
         self.bv.candidate_buildings_codes = (
-            self.labels_from_20211001_building_val.codes.true_positives
-            + self.labels_from_20211001_building_val.codes.false_positives
+            self.buildings_correction_labels.codes.true_positives
+            + self.buildings_correction_labels.codes.false_positives
         )
         # We also specify if, when updating corrected data (for inspection) we want final codes or detailed ones.
         self.bv.use_final_classification_codes = self.use_final_classification_codes
@@ -99,8 +125,10 @@ def setup(self):
     def prepare(self):
         """Preparation step.

-        Cluster clouds and cross with building vector database, then
-        extract cluster-level information.
+        Prepares and saves each point cloud in the specified directory,
+        and extracts all cluster information in a list of
+        `BuildingValidationClusterInfo` objects that is serialized into
+        a pickle object.

         """
         clusters = []
@@ -115,7 +143,15 @@ def prepare(self):
         self._dump_clusters(clusters)

     def optimize(self):
-        """Optimization step"""
+        """Optimization step.
+
+        Deserializes the cluster information.
+        Runs the genetic algorithm for N generations.
+        For each set of decision thresholds, computes the Recall, Precision,
+        and Automation of the `BuildingValidator`.
+        Finally, serializes the set of optimal thresholds.
+
+        """
         clusters = self._load_clusters()
         objective = functools.partial(self._objective, clusters=clusters)
         self.study.optimize(objective, n_trials=self.design.n_trials)
@@ -126,25 +162,29 @@ def optimize(self):
         self.thresholds: thresholds = best_thresholds

     def evaluate(self) -> dict:
-        """Evaluates application on prepared clusters and computes performance metrics.
+        """Evaluation step.

-        The optimization step results in optimized thresholds, stored in a pickle object.
-        To use them,specify parameter `building_validation.optimization.paths.building_validation_thresholds_pickle`.
-        Elsewise, default thresholds will be used.
+        Deserializes the set of optimal thresholds.
+        Deserializes the cluster information.
+        Computes the Recall, Precision, and Automation of the `BuildingValidator`
+        on the clusters using optimal thresholds, as well as other metrics
+        including confusion matrices.
+        If a validation dataset was used for optimization, this evaluation
+        may be run on a test dataset.

         Returns:
             dict: a dictionnary of metrics of schema {metric_name:metric_value}.

        """
         clusters = self._load_clusters()
-        self.set_thresholds_from_pickle_if_available()
+        self._set_thresholds_from_pickle_if_available()
         decisions = np.array([self.bv._make_group_decision(c) for c in clusters])
         mts_gt = np.array([c.target for c in clusters])
-        metrics_dict = self._evaluate_decisions(mts_gt, decisions)
+        metrics_dict = self.evaluate_decisions(mts_gt, decisions)
         log.info(f"\n Results:\n{self._get_results_logs_str(metrics_dict)}")
         return metrics_dict

-    def set_thresholds_from_pickle_if_available(self):
+    def _set_thresholds_from_pickle_if_available(self):
         try:
             with open(self.paths.building_validation_thresholds_pickle, "rb") as f:
                 self.bv.thresholds = pickle.load(f)
@@ -159,11 +199,13 @@ def set_thresholds_from_pickle_if_available(self):
     def update(self):
         """Update step.

-        Update point cloud classification using optimized decision thresholds.
+        Deserializes the set of optimal thresholds.
+        `BuildingValidator` updates each prepared point cloud classification
+        based on those thresholds and saves the result.

         """
         log.info(f"Updated las will be saved in {self.paths.results_output_dir}")
-        self.set_thresholds_from_pickle_if_available()
+        self._set_thresholds_from_pickle_if_available()
         for prepared_las_path, target_las_path in tqdm(
             zip(self.prepared_las_filepaths, self.out_las_filepaths),
             total=len(self.prepared_las_filepaths),
@@ -183,6 +225,7 @@ def _extract_clusters_from_las(
         Returns:
             List[BuildingValidationClusterInfo]: cluster information for each cluster of candidate buildings
+
         """
         las: laspy.LasData = laspy.read(prepared_las_path)
         dim_cluster_id = las[
@@ -212,12 +255,12 @@ def _define_MTS_ground_truth_flag(self, targets) -> int:
         tp_frac = np.mean(
             np.isin(
                 targets,
-                self.labels_from_20211001_building_val.codes.true_positives,
+                self.buildings_correction_labels.codes.true_positives,
             )
         )
-        if tp_frac >= self.labels_from_20211001_building_val.min_frac.true_positives:
+        if tp_frac >= self.buildings_correction_labels.min_frac.true_positives:
             return self.bv.codes.final.building
-        elif tp_frac < self.labels_from_20211001_building_val.min_frac.false_positives:
+        elif tp_frac < self.buildings_correction_labels.min_frac.false_positives:
             return self.bv.codes.final.not_building
         return self.bv.codes.final.unsure
@@ -242,6 +285,7 @@ def _objective(self, trial, clusters: List[BuildingValidationClusterInfo] = None
         Returns:
             float, float, float: automatisation, precision, recall
+
         """
         params = {
             "min_confidence_confirmation": trial.suggest_float(
@@ -270,7 +314,7 @@ def _objective(self, trial, clusters: List[BuildingValidationClusterInfo] = None
         self.bv.thresholds = thresholds(**params)
         decisions = np.array([self.bv._make_group_decision(c) for c in clusters])
         mts_gt = np.array([c.target for c in clusters])
-        metrics_dict = self._evaluate_decisions(mts_gt, decisions)
+        metrics_dict = self.evaluate_decisions(mts_gt, decisions)

         # WARNING: order should always be automation, precision, recall
         values = (
@@ -289,7 +333,7 @@ def _objective(self, trial, clusters: List[BuildingValidationClusterInfo] = None
         return auto, precision, recall

     def _select_best_rules(self, study):
-        """Find the trial that meets constraints and that maximizes automation."""
+        """Find the trial that meets constraints and maximizes automation."""
         trials = sorted(study.best_trials, key=lambda x: x.values[0], reverse=True)
         TRIALS_BELOW_ZERO_ARE_VALID = 0
         respect_constraints = [
@@ -311,6 +355,7 @@ def _select_best_rules(self, study):
         return best_rules

     def _dump_best_rules(self, best_trial_params):
+        """Serializes best thresholds."""
@@ -311,6 +355,7 @@ def _select_best_rules(self, study):
         return best_rules

     def _dump_best_rules(self, best_trial_params):
+        """Serializes best thresholds."""
         with open(self.paths.building_validation_thresholds_pickle, "wb") as f:
             pickle.dump(best_trial_params, f)
         log.info(
@@ -318,31 +363,36 @@ def _dump_best_rules(self, best_trial_params):
         )

     def _dump_clusters(self, clusters):
+        """Serializes the list of cluster-level information objects."""
         with open(self.paths.group_info_pickle_path, "wb") as f:
             pickle.dump(clusters, f)
         log.info(f"Pickled groups to {self.paths.group_info_pickle_path}")

     def _load_clusters(self):
+        """Deserializes the list of cluster-level information objects."""
         with open(self.paths.group_info_pickle_path, "rb") as f:
             clusters = pickle.load(f)
         log.info(f"Loading pickled groups from {self.paths.group_info_pickle_path}")
         return clusters

-    def _evaluate_decisions(self, mts_gt, ia_decision):
-        """Evaluate confirmation and refutation decisions.
+    def evaluate_decisions(self, mts_gt, ia_decision) -> Dict[str, Any]:
+        r"""Evaluate confirmation and refutation decisions.

         Get dict of metrics to evaluate how good module decisions were in reference to ground truths.
+
         Targets: U=Unsure, N=No (not a building), Y=Yes (building)
-        PRedictions : U=Unsure, C=Confirmation, R=Refutation
-        Confusion Matrix :
-        predictions
-        [Uu Ur Uc]
-        target [Nu Nr Nc]
-        [Yu Yr Yc]
-        Maximization criteria:
-        Proportion of each decision among total of candidate groups.
-        We want to maximize it.
+        Predictions: U=Unsure, C=Confirmation, R=Refutation
+
+        Confusion Matrix (rows: target, columns: prediction)
+
+        [Uu Ur Uc]
+
+        [Nu Nr Nc]
+
+        [Yu Yr Yc]
+
+        Automation: Proportion of each decision among the total of candidate groups.

         Accuracies: Confirmation/Refutation Accuracy.
@@ -351,8 +401,18 @@ def _evaluate_decisions(self, mts_gt, ia_decision):
         Quality: Precision and Recall, assuming perfect posterior decision for unsure predictions.
         Only candidate shapes with known ground truths are considered (ambiguous labels are ignored).
+
+        Precision: (Yu + Yc) / (Yu + Yc + Nc)
+
+        Recall: (Yu + Yc) / (Yu + Yr + Yc)
+
+        Args:
+            mts_gt (np.array): ground truth of rules-based classification (0, 1, 2)
+            ia_decision (np.array): AI application decision (0, 1, 2)
+
+        Returns:
+            dict: dictionary of metrics.
+
         """
         metrics_dict = dict()
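# --- Editor's note: illustrative sketch, not part of the diff above. ---
# Worked example of the metrics defined in the evaluate_decisions() docstring,
# on toy arrays. Rows of the confusion matrix are targets (U, N, Y) and columns
# are decisions (unsure, refutation, confirmation); the integer codes and the
# reading of "automation" as the share of automated decisions are assumptions.
import numpy as np
from sklearn.metrics import confusion_matrix

U, N, Y = 0, 1, 2  # targets: Unsure, No (not a building), Yes (building)
u, r, c = 0, 1, 2  # decisions: unsure, refutation, confirmation

mts_gt = np.array([Y, Y, Y, N, N, U, Y, N])
ia_decision = np.array([c, c, u, r, c, u, r, r])

cm = confusion_matrix(mts_gt, ia_decision, labels=[U, N, Y])
Yu, Yr, Yc = cm[Y, u], cm[Y, r], cm[Y, c]  # e.g. Yc = buildings confirmed
Nc = cm[N, c]  # non-buildings wrongly confirmed

precision = (Yu + Yc) / (Yu + Yc + Nc)  # perfect posterior decision on "unsure"
recall = (Yu + Yc) / (Yu + Yr + Yc)
automation = float(np.isin(ia_decision, [r, c]).mean())  # share decided automatically
print(cm)
print(precision, recall, automation)  # here: 0.75 0.75 0.75
# --- End of editor's note. ---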
diff --git a/package_metadata.yaml b/package_metadata.yaml
new file mode 100644
index 00000000..cb9cd4ec
--- /dev/null
+++ b/package_metadata.yaml
@@ -0,0 +1,6 @@
+__version__: "1.5.0"
+__name__: "lidar_prod"
+__url__: "https://github.com/IGNF/lidar-prod-quality-control"
+__description__: "A 3D semantic segmentation production tool to augment rules-based Lidar classification with AI and databases."
+__author__: "Charles GAYDON"
+__copyright__: "2022, Institut National de l'Information Géographique et Forestière"
diff --git a/setup.py b/setup.py
index 6f8b49f8..6b282f71 100755
--- a/setup.py
+++ b/setup.py
@@ -1,12 +1,17 @@
 from setuptools import find_packages, setup
+import yaml
+
+with open("package_metadata.yaml", "r") as f:
+    pm = yaml.safe_load(f)

 setup(
-    name="lidar_prod",
-    version="1.5.0",
-    description="A 3D semantic segmentation production tool to augment rules-based Lidar classification with AI and databases.",
-    author="Charles GAYDON",
-    author_email="charles.gaydon@gmail.com",
-    url="https://github.com/IGNF/lidar-prod-quality-control",
-    install_requires=[],  # env should match the one in bash/setup_environment/setup_env.sh
+    name=pm["__name__"],
+    version=pm["__version__"],
+    url=pm["__url__"],
+    description=pm["__description__"],
+    author=pm["__author__"],
+    install_requires=[
+        # assume an environment as described in ./bash/setup_env.sh
+    ],
     packages=find_packages(),
 )
diff --git a/tests/lidar_prod/test_optimization.py b/tests/lidar_prod/test_optimization.py
index e932f147..9d42cab3 100644
--- a/tests/lidar_prod/test_optimization.py
+++ b/tests/lidar_prod/test_optimization.py
@@ -14,7 +14,7 @@

 """We test the building validation optimizer against two LAS:

-These datasets must have the right classification codes, i.e. the ones defined in labels_from_20211001_building_val.
+These datasets must have the right classification codes, i.e. the ones defined in buildings_correction_labels.

 WARNING: The large LAS cannot be versioned by git. If it is absent from the environment, pytest expects the test to fail.
 This is to enable a shallower run of these tests without the file.
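# --- Editor's note: illustrative sketch, not part of the diff above. ---
# One way to implement "pytest expects the test to fail when the large LAS is
# absent", as described in the test module docstring: an xfail marker
# conditioned on the file's existence. The path and test name below are
# hypothetical, not the repository's actual values.
import os.path as osp

import pytest

LARGE_LAS_PATH = "tests/files/large/large_subset.las"  # hypothetical path


@pytest.mark.slow
@pytest.mark.xfail(
    not osp.isfile(LARGE_LAS_PATH),
    reason="Large LAS is not versioned by git and may be absent outside CICD.",
)
def test_optimization_on_large_las():
    assert osp.isfile(LARGE_LAS_PATH)
# --- End of editor's note. ---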