Use EarthEngine API for fetching data #107

Merged (54 commits, Oct 3, 2022)

Commits
da41b32 Reduce labels for testing (ivanzvonkov, Sep 22, 2022)
8267d0b Remove processed file for testing (ivanzvonkov, Sep 22, 2022)
5dc5951 Create separate function for ee_image and add EarthEngineAPI (ivanzvonkov, Sep 22, 2022)
3f25aa7 Optional start date (ivanzvonkov, Sep 22, 2022)
ceddaa1 dates passed directly (ivanzvonkov, Sep 22, 2022)
82898b1 Create datasets use ee API (ivanzvonkov, Sep 22, 2022)
1eee323 Attempt running data pipeline inside github action (ivanzvonkov, Sep 22, 2022)
2209ac2 Ensure dask is installed (ivanzvonkov, Sep 22, 2022)
8791e7d Authenticate ee attempt (ivanzvonkov, Sep 22, 2022)
dc22d9a Cache and authenticate gcloud (ivanzvonkov, Sep 22, 2022)
95dfe7f Quiet arg (ivanzvonkov, Sep 22, 2022)
435188e Try app default again (ivanzvonkov, Sep 22, 2022)
ddcb43f Fix yaml (ivanzvonkov, Sep 22, 2022)
c4fea32 Login by default (ivanzvonkov, Sep 22, 2022)
f73714d Remove CLI auth (ivanzvonkov, Sep 22, 2022)
d1bfb9c hardcover service account key credentials (ivanzvonkov, Sep 22, 2022)
24c3bca Login with sa credentials if available (ivanzvonkov, Sep 22, 2022)
8fb5255 Read email from file (ivanzvonkov, Sep 22, 2022)
99721fc Try a different npartitions (ivanzvonkov, Sep 22, 2022)
b4a8c4b 1 partition test (ivanzvonkov, Sep 22, 2022)
c0e248b mock initialize (ivanzvonkov, Sep 22, 2022)
8245784 Remove f string (ivanzvonkov, Sep 22, 2022)
a672693 Set ref (ivanzvonkov, Sep 22, 2022)
5a2b0ce back to 4 (ivanzvonkov, Sep 22, 2022)
1a609fc Only run tests if changes in own directories (ivanzvonkov, Sep 22, 2022)
1cf502a Automated dataset updates (your-username, Sep 22, 2022)
b2bafb0 Merge branch 'main' into ee-api (ivanzvonkov, Sep 22, 2022)
6a9917c Process 1000 points (ivanzvonkov, Sep 22, 2022)
a0b9d0e No ee api by default (ivanzvonkov, Sep 22, 2022)
4fa8e40 better bot name (ivanzvonkov, Sep 22, 2022)
d6bf537 missing pd.index (ivanzvonkov, Sep 22, 2022)
c05cb5f Automated dataset updates (DATASET-bot, Sep 22, 2022)
51b6234 Regenerate 1000 point dataset with new points (ivanzvonkov, Sep 22, 2022)
f094c41 Automated dataset updates (DATASET-bot, Sep 22, 2022)
0a42ff3 Skip interactive portion (ivanzvonkov, Sep 22, 2022)
4ba303c Automated dataset updates (DATASET-bot, Sep 22, 2022)
851e2a3 Automated dataset updates (DATASET-bot, Sep 22, 2022)
7cecb09 Automated dataset updates (DATASET-bot, Sep 22, 2022)
f4d76ba Automated dataset updates (DATASET-bot, Sep 23, 2022)
070249e Automated dataset updates (DATASET-bot, Sep 23, 2022)
05783cc test engineer test (ivanzvonkov, Sep 23, 2022)
b033c27 argparse and clean ee api code (ivanzvonkov, Sep 23, 2022)
f6e90c0 Remove date parameter (ivanzvonkov, Sep 23, 2022)
b605b74 Sort engineer imports (ivanzvonkov, Sep 23, 2022)
508291c Test for find_matching_point url (ivanzvonkov, Sep 23, 2022)
230c612 Improve CLI message (ivanzvonkov, Sep 23, 2022)
5f7a06c Merge branch 'ee-api' of github.com:nasaharvest/openmapflow into ee-api (ivanzvonkov, Sep 23, 2022)
ff9d447 Reduce line length (ivanzvonkov, Sep 23, 2022)
1395cc9 Set non-interactive mode (ivanzvonkov, Sep 23, 2022)
b588a8b Automated dataset updates (DATASET-bot, Sep 23, 2022)
8bfdae8 Allow git commit to fail (ivanzvonkov, Sep 23, 2022)
fddf213 Merge branch 'ee-api' of github.com:nasaharvest/openmapflow into ee-api (ivanzvonkov, Sep 23, 2022)
b120d3a Update version (ivanzvonkov, Sep 23, 2022)
d0ed10e Merge branch 'main' into ee-api (ivanzvonkov, Oct 3, 2022)
Changes from all commits
2 changes: 2 additions & 0 deletions .github/workflows/buildings-example-test.yaml
@@ -8,6 +8,8 @@ on:
branches: [ main ]
pull_request:
branches: [ main ]
paths:
- 'buildings-example/**'

jobs:
test:
2 changes: 2 additions & 0 deletions .github/workflows/crop-mask-example-test.yaml
@@ -8,6 +8,8 @@ on:
branches: [ main ]
pull_request:
branches: [ main ]
paths:
- 'crop-mask-example/**'

jobs:
test:
30 changes: 24 additions & 6 deletions .github/workflows/forest-example-test.yaml
@@ -8,6 +8,8 @@ on:
branches: [ main ]
pull_request:
branches: [ main ]
paths:
- 'forest-example/**'

jobs:
test:
@@ -18,19 +20,35 @@
steps:
- name: Clone repo
uses: actions/checkout@v2
with:
ref: ${{ github.event.pull_request.head.ref }}
- name: Set up python
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: 3.8
- name: Install dependencies
run: pip install -r requirements.txt
- run: pip install -r requirements.txt

- name: dvc pull data
- uses: google-github-actions/auth@v0
with:
credentials_json: ${{ secrets.GCP_SA_KEY }}
- name: Run data pipeline
env:
# https://dvc.org/doc/user-guide/setup-google-drive-remote#authorization
GDRIVE_CREDENTIALS_DATA: ${{ secrets.GDRIVE_CREDENTIALS_DATA }}
run: dvc pull -f

GCP_SA_KEY: ${{ secrets.GCP_SA_KEY }}
run: |
dvc pull -f
openmapflow create-datasets --non-interactive
dvc commit -f
dvc push
- name: Push automated dataset updates
run: |
git config --global user.name 'Dataset bot'
git config --global user.email '[email protected]'
git pull
git add data
git commit -m "Automated dataset updates" || echo "No updates to commit"
git push
- name: Integration test - Project
run: |
openmapflow cp templates/integration_test_project.py .
2 changes: 2 additions & 0 deletions .github/workflows/maize-example-test.yaml
@@ -8,6 +8,8 @@ on:
branches: [ main ]
pull_request:
branches: [ main ]
paths:
- 'maize-example/**'

jobs:
test:
4 changes: 2 additions & 2 deletions forest-example/data/datasets.dvc
@@ -1,5 +1,5 @@
outs:
- md5: 718c5017dec70570f87d1ca1941db208.dir
size: 5789085
- md5: 3d8ac3ef8c473bb3445b9b67a0fdbc33.dir
size: 5403436
nfiles: 1
path: datasets
2 changes: 1 addition & 1 deletion forest-example/datasets.py
@@ -24,7 +24,7 @@ def load_labels(self) -> pd.DataFrame:
PROJECT_ROOT / DataPaths.RAW_LABELS / "hansen_labelled_data.csv"
)

df = df.sample(n=1000, random_state=42)
Contributor:

Curious as to why?

Contributor (Author):

Because there are way too many points, so this was a way to test a few.

Contributor:

the change here is to the random_state - not super important, just curious why you changed it

Contributor (Author):

Oh! I think this was to force new points to be exported rather than the 1000 that already existed in Google Cloud Storage when I initially tested

df = df.sample(n=1000, random_state=43)

# Rename coordinate columns to be used for getting Earth observation data
df.rename(columns={"lon": LAT, "lat": LON}, inplace=True)
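
To make the thread above concrete, here is a toy sketch (synthetic data, not the project's labels) of why bumping random_state forces a fresh export: the new seed selects a mostly different set of rows, so most of the chosen points have no tif file waiting in Cloud Storage yet.

import pandas as pd

# Stand-in for the label CSV; only the row count matters here.
df = pd.DataFrame({"lat": range(10_000)})

old = df.sample(n=1000, random_state=42)
new = df.sample(n=1000, random_state=43)

# The two samples share only a small fraction of their rows.
overlap = len(set(old.index) & set(new.index))
print(f"{overlap} of 1000 sampled points overlap")  # typically around 100
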
2 changes: 1 addition & 1 deletion openmapflow/constants.py
@@ -13,7 +13,7 @@
TEMPLATE_README = TEMPLATES_DIR / "README.md"
TEMPLATE_DEPLOY_YML = TEMPLATES_DIR / "github-deploy.yaml"
TEMPLATE_TEST_YML = TEMPLATES_DIR / "github-test.yaml"
VERSION = "0.2.0rc1"
VERSION = "0.2.1rc1"

# -------------- Dataframe column names --------------------------------------
SOURCE = "source"
161 changes: 101 additions & 60 deletions openmapflow/ee_exporter.py
@@ -1,3 +1,5 @@
import json
import os
import warnings
from datetime import date, timedelta
from typing import Dict, List, Optional, Union
@@ -104,6 +106,69 @@ def ee_safe_str(s: str):
return s.replace(".", "-").replace("=", "-").replace("/", "-")[:100]


def create_ee_image(
polygon: "ee.Geometry.Polygon",
start_date: date,
end_date: date,
days_per_timestep: int = DAYS_PER_TIMESTEP,
):
image_collection_list: List[ee.Image] = []
cur_date = start_date
cur_end_date = cur_date + timedelta(days=days_per_timestep)

# first, we get all the S1 images in an exaggerated date range
vv_imcol, vh_imcol = get_s1_image_collection(
polygon, start_date - timedelta(days=31), end_date + timedelta(days=31)
)

while cur_end_date <= end_date:
image_list: List[ee.Image] = []

# first, the S1 image which gets the entire s1 collection
image_list.append(
get_single_s1_image(
region=polygon,
start_date=cur_date,
end_date=cur_end_date,
vv_imcol=vv_imcol,
vh_imcol=vh_imcol,
)
)
for image_function in DYNAMIC_IMAGE_FUNCTIONS:
image_list.append(
image_function(
region=polygon, start_date=cur_date, end_date=cur_end_date
)
)
image_collection_list.append(ee.Image.cat(image_list))

cur_date += timedelta(days=days_per_timestep)
cur_end_date += timedelta(days=days_per_timestep)

# now, we want to take our image collection and append the bands into a single image
imcoll = ee.ImageCollection(image_collection_list)
combine_bands_function = make_combine_bands_function(DYNAMIC_BANDS)
img = ee.Image(imcoll.iterate(combine_bands_function))

    # finally, we add the SRTM image separately since it's static in time
total_image_list: List[ee.Image] = [img]
for static_image_function in STATIC_IMAGE_FUNCTIONS:
total_image_list.append(static_image_function(region=polygon))

return ee.Image.cat(total_image_list)


def get_ee_credentials():
gcp_sa_key = os.environ.get("GCP_SA_KEY")
if gcp_sa_key is not None:
gcp_sa_email = json.loads(gcp_sa_key)["client_email"]
print(f"Logging into EarthEngine with {gcp_sa_email}")
return ee.ServiceAccountCredentials(gcp_sa_email, key_data=gcp_sa_key)
else:
print("Logging into EarthEngine with default credentials")
return "persistent"


class EarthEngineExporter:
"""
Export satellite data from Earth engine. It's called using the following
@@ -121,24 +186,10 @@
"""

def __init__(
self,
dest_bucket: str,
check_ee: bool = False,
check_gcp: bool = False,
credentials: Optional[str] = None,
days_per_timestep: int = DAYS_PER_TIMESTEP,
self, dest_bucket: str, check_ee: bool = False, check_gcp: bool = False
) -> None:
self.dest_bucket = dest_bucket
self.days_per_timestep = days_per_timestep
try:
if credentials:
ee.Initialize(credentials=credentials)
else:
ee.Initialize()
except Exception:
print(
"This code may not work if you have not authenticated your earthengine account"
)
ee.Initialize(get_ee_credentials())
self.check_ee = check_ee
self.ee_task_list = get_ee_task_list() if self.check_ee else []
self.check_gcp = check_gcp
@@ -172,50 +223,7 @@ def _export_for_polygon(
if len(self.ee_task_list) >= 3000:
return False

image_collection_list: List[ee.Image] = []
cur_date = start_date
cur_end_date = cur_date + timedelta(days=self.days_per_timestep)

# first, we get all the S1 images in an exaggerated date range
vv_imcol, vh_imcol = get_s1_image_collection(
polygon, start_date - timedelta(days=31), end_date + timedelta(days=31)
)

while cur_end_date <= end_date:
image_list: List[ee.Image] = []

# first, the S1 image which gets the entire s1 collection
image_list.append(
get_single_s1_image(
region=polygon,
start_date=cur_date,
end_date=cur_end_date,
vv_imcol=vv_imcol,
vh_imcol=vh_imcol,
)
)
for image_function in DYNAMIC_IMAGE_FUNCTIONS:
image_list.append(
image_function(
region=polygon, start_date=cur_date, end_date=cur_end_date
)
)
image_collection_list.append(ee.Image.cat(image_list))

cur_date += timedelta(days=self.days_per_timestep)
cur_end_date += timedelta(days=self.days_per_timestep)

# now, we want to take our image collection and append the bands into a single image
imcoll = ee.ImageCollection(image_collection_list)
combine_bands_function = make_combine_bands_function(DYNAMIC_BANDS)
img = ee.Image(imcoll.iterate(combine_bands_function))

# finally, we add the SRTM image seperately since its static in time
total_image_list: List[ee.Image] = [img]
for static_image_function in STATIC_IMAGE_FUNCTIONS:
total_image_list.append(static_image_function(region=polygon))

img = ee.Image.cat(total_image_list)
img = create_ee_image(polygon, start_date, end_date)

# and finally, export the image
if not test:
@@ -281,6 +289,9 @@ def export_for_labels(
for expected_column in [START, END, LAT, LON]:
assert expected_column in labels

labels[START] = pd.to_datetime(labels[START]).dt.date
labels[END] = pd.to_datetime(labels[END]).dt.date

exports_started = 0
print(f"Exporting {len(labels)} labels: ")

@@ -306,3 +317,33 @@
):
print(f"Started {exports_started} exports. Ending export")
return None


class EarthEngineAPI:
Contributor:

Is this class necessary for now? It seems like get_ee_url could just be a standalone function.

Contributor:

It might make sense to move ee.Initialize to the top of this file so that any import of ee_exporter forces an initialization?

Contributor (Author):

I lean towards keeping it as is:

  1. I am generally wary of automatically executing code on import, after the issue in CropHarvest where cartopy loads Natural Earth on almost every usage of the package.
  2. Right now the ee initializations differ (one has the high-volume API specified). Specifying the high-volume API for both is most likely fine, but it could also change some sort of behavior, so I want to minimize the possibility of that.

"""
Fetch satellite data from Earth engine by URL.
Credentials are resolved via get_ee_credentials(): a service account key
from the GCP_SA_KEY environment variable if set, otherwise the default
persistent credentials.
"""

def __init__(self) -> None:
ee.Initialize(
get_ee_credentials(),
opt_url="https://earthengine-highvolume.googleapis.com",
)

def get_ee_url(self, lat, lon, start_date, end_date):
ee_bbox = EEBoundingBox.from_centre(
mid_lat=lat,
mid_lon=lon,
surrounding_metres=80,
).to_ee_polygon()
img = create_ee_image(ee_bbox, start_date, end_date)
return img.getDownloadURL(
{
"region": ee_bbox,
"scale": 10,
"filePerBand": False,
"format": "GEO_TIFF",
}
)
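
A minimal usage sketch of the new URL-based flow, assuming Earth Engine credentials are available (the coordinates, dates, and use of requests here are illustrative, not part of this PR):

from datetime import date

import requests

from openmapflow.ee_exporter import EarthEngineAPI

api = EarthEngineAPI()  # initializes ee against the high-volume endpoint

url = api.get_ee_url(
    lat=7.72,
    lon=1.18,
    start_date=date(2020, 2, 1),
    end_date=date(2021, 2, 1),
)

# The URL serves a single multi-band GeoTIFF covering the ~160 m box
# around the point (surrounding_metres=80 in each direction).
with open("eo_data.tif", "wb") as f:
    f.write(requests.get(url).content)

Since each request is small and independent, the high-volume endpoint is the natural fit here, which matches the reviewer discussion above.
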
18 changes: 11 additions & 7 deletions openmapflow/engineer.py
@@ -19,7 +19,7 @@

def load_tif(
filepath: Path,
start_date: datetime,
start_date: Optional[datetime] = None,
num_timesteps: Optional[int] = None,
):
r"""
@@ -56,12 +56,16 @@ def load_tif(
time_specific_da["band"] = range(bands_per_timestep + len(STATIC_BANDS))
da_split_by_time.append(time_specific_da)

timesteps = [
start_date + timedelta(days=DAYS_PER_TIMESTEP) * i
for i in range(len(da_split_by_time))
]

dynamic_data = xr.concat(da_split_by_time, pd.Index(timesteps, name="time"))
if start_date:
timesteps = [
start_date + timedelta(days=DAYS_PER_TIMESTEP) * i
for i in range(len(da_split_by_time))
]
dynamic_data = xr.concat(da_split_by_time, pd.Index(timesteps, name="time"))
else:
dynamic_data = xr.concat(
da_split_by_time, pd.Index(range(len(da_split_by_time)), name="time")
)
dynamic_data.attrs["band_descriptions"] = BANDS

return dynamic_data, average_slope
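
A small sketch of the two call forms load_tif now supports (the file path is illustrative):

from datetime import datetime
from pathlib import Path

from openmapflow.engineer import load_tif

tif_path = Path("eo_data.tif")

# With a start date, the "time" coordinate holds real dates spaced
# DAYS_PER_TIMESTEP apart, as before.
data, average_slope = load_tif(tif_path, start_date=datetime(2020, 2, 1))

# Without a start date (new in this PR), the "time" coordinate is a
# plain integer index over the timesteps found in the file.
data, average_slope = load_tif(tif_path)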