agrc · jacobdadams · Nov 5, 2024 · Nov 4, 2024 · Nov 5, 2024 · Nov 5, 2024
diff --git a/src/wmrc/summarize.py b/src/wmrc/summarize.py
@@ -2,6 +2,7 @@
 Calender_Year__c to create dataframes of the reports that will be used to update the AGOL feature services.
 """
 
+import numpy as np
 import pandas as pd
 
 try:
@@ -198,8 +199,9 @@ def recovery_rates_by_tonnage(records: helpers.SalesForceRecords) -> pd.Series:
     """Calculates a yearly recovery rate based on the Salesforce records.
 
     Recovery rate is opposite of contaminated rate (5% contamination = 95% uncontaminated). Rate is
-    calculated by using the contamination rate to determine contaminated tonnage and comparing that to the total
-    tonnage handled by facilities reporting a contamination rate.
+    calculated from two per-facility values: the in-state MSW recycled and the total received (the recycled amount
+    divided by the facility's recovery rate). The yearly rate is the sum of the former divided by the sum of the
+    latter across all facilities.
 
     Args:
         records (helpers.SalesForceRecords): Helper object containing the Salesforce records
@@ -208,34 +210,31 @@ def recovery_rates_by_tonnage(records: helpers.SalesForceRecords) -> pd.Series:
         pd.Series: recovery rates per year with index name data_year and series name
             "annual_recycling_uncontaminated_rate"
     """
-    #: First, create a modifier to account for material from out-of-state
+    #: Create our various modifiers
     records.df["in_state_modifier"] = (100 - records.df["Out_of_State__c"]) / 100
+    records.df["msw_modifier"] = records.df["Municipal_Solid_Waste__c"] / 100
+    records.df["recovery_rate"] = (100 - records.df["Annual_Recycling_Contamination_Rate__c"]) / 100
 
-    #: Calculate contaminated tonnage
-    records.df["recycling_tons_contaminated"] = (
-        records.df["Annual_Recycling_Contamination_Rate__c"]
-        / 100
-        * records.df["Combined_Total_of_Material_Recycled__c"]
+    #: Amount of material recycled
+    records.df["in_state_msw_recycled"] = (
+        records.df["Combined_Total_of_Material_Recycled__c"]
         * records.df["in_state_modifier"]
+        * records.df["msw_modifier"]
     )
 
-    #: Calculate total tonnage from facilities reporting a contamination rate
-    records.df["recycling_tons_report_contamination_total"] = pd.NA
-    records.df.loc[~records.df["recycling_tons_contaminated"].isnull(), "recycling_tons_report_contamination_total"] = (
-        records.df["Combined_Total_of_Material_Recycled__c"] * records.df["in_state_modifier"]
+    #: Amount of material received derived from recovery rate
+    records.df["in_state_msw_received_for_recycling"] = (
+        records.df["in_state_msw_recycled"] / records.df["recovery_rate"]
     )
 
-    #: Invert to get uncontaminated rate
+    #: Uncontaminated rates by year
     clean_rates = records.df.groupby("Calendar_Year__c").apply(
         lambda year_df: (
-            1
-            - (
-                year_df["recycling_tons_contaminated"].sum()
-                / year_df["recycling_tons_report_contamination_total"].sum()
-            )
+            year_df["in_state_msw_recycled"].sum() / year_df["in_state_msw_received_for_recycling"].sum() * 100
         )
-        * 100
     )
+
+    clean_rates.replace([np.inf, -np.inf], np.nan, inplace=True)  #: inf occurs when the received sum is 0
     clean_rates.name = "annual_recycling_uncontaminated_rate"
     clean_rates.index.name = "data_year"
     clean_rates.index = clean_rates.index.map(helpers.convert_to_int)

diff --git a/tests/test_summarize.py b/tests/test_summarize.py
@@ -82,17 +82,43 @@ def test_recovery_rates_by_tonnage_happy_path(self, mocker):
             {
                 "Calendar_Year__c": [2022, 2022, 2023, 2023],
                 "Out_of_State__c": [0, 0, 0, 0],
-                "Annual_Recycling_Contamination_Rate__c": [10, 0, 10, 20],
-                "Combined_Total_of_Material_Recycled__c": [100, 100, 100, 100],
+                "Municipal_Solid_Waste__c": [100, 100, 100, 100],
+                "Annual_Recycling_Contamination_Rate__c": [50, 0, 50, 50],
+                "Combined_Total_of_Material_Recycled__c": [50, 100, 50, 40],
             }
         )
 
         output_series = summarize.recovery_rates_by_tonnage(records)
 
         test_df = pd.Series(
             {
-                2022: 95.0,
-                2023: 85.0,
+                2022: 75.0,
+                2023: 50.0,
+            },
+            name="annual_recycling_uncontaminated_rate",
+        )
+        test_df.index.name = "data_year"
+
+        pd.testing.assert_series_equal(output_series, test_df)
+
+    def test_recovery_rates_by_tonnage_replaces_inf_with_zero(self, mocker):
+        records = mocker.Mock()
+        records.df = pd.DataFrame(
+            {
+                "Calendar_Year__c": [2022, 2022, 2023, 2023],
+                "Out_of_State__c": [0, 0, 0, 0],
+                "Municipal_Solid_Waste__c": [100, 100, 100, 100],
+                "Annual_Recycling_Contamination_Rate__c": [np.nan, np.nan, 50, 50],
+                "Combined_Total_of_Material_Recycled__c": [50, 100, 50, 40],
+            }
+        )
+
+        output_series = summarize.recovery_rates_by_tonnage(records)
+
+        test_df = pd.Series(
+            {
+                2022: np.nan,
+                2023: 50.0,
             },
             name="annual_recycling_uncontaminated_rate",
         )
@@ -104,20 +130,20 @@ def test_recovery_rates_by_tonnage_uses_out_of_state_modifier(self, mocker):
         records = mocker.Mock()
         records.df = pd.DataFrame(
             {
-                "facility_name": ["foo", "bar", "foo", "bar"],
                 "Calendar_Year__c": [2022, 2022, 2023, 2023],
                 "Out_of_State__c": [0, 100, 0, 100],
-                "Annual_Recycling_Contamination_Rate__c": [10, 0, 10, 20],
-                "Combined_Total_of_Material_Recycled__c": [100, 100, 100, 100],
+                "Municipal_Solid_Waste__c": [100, 100, 100, 100],
+                "Annual_Recycling_Contamination_Rate__c": [50, 0, 50, 50],
+                "Combined_Total_of_Material_Recycled__c": [50, 100, 50, 40],
             }
         )
 
         output_series = summarize.recovery_rates_by_tonnage(records)
 
         test_df = pd.Series(
             {
-                2022: 90.0,
-                2023: 90.0,
+                2022: 50.0,
+                2023: 50.0,
             },
             name="annual_recycling_uncontaminated_rate",
         )