From b19fe809ea7324da14b7042eccc1db0b2b5049dc Mon Sep 17 00:00:00 2001
From: Tim Paine <3105306+timkpaine@users.noreply.github.com>
Date: Sat, 13 Jul 2024 13:10:24 -0400
Subject: [PATCH] NumPy 2.0 build support

Signed-off-by: Tim Paine <3105306+timkpaine@users.noreply.github.com>
---
 .github/actions/setup-dependencies/action.yml | 14 ---
 .github/workflows/build.yml                   | 93 ++++++++++++++++---
 cpp/csp/python/Common.h                       | 16 ++++
 cpp/csp/python/NumpyConversions.cpp           | 11 ++-
 cpp/csp/python/NumpyConversions.h             |  4 +-
 cpp/csp/python/NumpyInputAdapter.h            | 18 ++--
 .../python/adapters/parquetadapterimpl.cpp    |  4 +-
 csp/tests/impl/test_pandas.py                 | 14 +--
 8 files changed, 124 insertions(+), 50 deletions(-)

diff --git a/.github/actions/setup-dependencies/action.yml b/.github/actions/setup-dependencies/action.yml
index eeaa95f94..261dc70f8 100644
--- a/.github/actions/setup-dependencies/action.yml
+++ b/.github/actions/setup-dependencies/action.yml
@@ -25,10 +25,6 @@ runs:
 
     ################
     # Linux # NOTE: skip for manylinux image
-    # - name: Linux init steps
-    #   shell: bash
-    #   run: make dependencies-vcpkg
-    #   if: ${{ runner.os == 'Linux' }} # skip
 
     ################
     # Mac
@@ -37,16 +33,6 @@ runs:
       run: make dependencies-mac
       if: ${{ runner.os == 'macOS' }}
 
-    # - name: Setup vcpkg cache in shell
-    #   shell: bash
-    #   run: |
-    #     which -a gcc-12
-    #     echo "CC=/usr/local/bin/gcc-12" >> $GITHUB_ENV
-    #     echo "CMAKE_C_COMPILER=/usr/local/bin/gcc-12" >> $GITHUB_ENV
-    #     echo "CXX=/usr/local/bin/g++-12" >> $GITHUB_ENV
-    #     echo "CMAKE_CXX_COMPILER=/usr/local/bin/g++-12" >> $GITHUB_ENV
-    #   if: ${{ runner.os == 'macOS' }}
-
     ################
     # Windows
     - name: Windows init steps (vc143)
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index ebc00dcff..153ab7564 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -648,7 +648,68 @@ jobs:
     ####################################################
     # Test Dependencies/Regressions                    #
     ####################################################
-    test_dependencies:
+    test_buildtime_dependencies:
+      needs:
+        - initialize
+      strategy:
+        matrix:
+          os:
+            - ubuntu-20.04
+          python-version:
+            - "3.9"  # quoted so YAML keeps it a string (an unquoted 3.10 would become 3.1)
+          packages:
+            - '"numpy>=2" "pandas>=2.2" "pyarrow>=16.1"'
+
+      runs-on: ${{ matrix.os }}
+
+      steps:
+        - name: Checkout
+          uses: actions/checkout@v4
+          with:
+              submodules: recursive
+              fetch-depth: 0
+
+        - name: Set up Python ${{ matrix.python-version }}
+          uses: ./.github/actions/setup-python
+          with:
+            version: '${{ matrix.python-version }}'
+
+        - name: Set up Caches
+          uses: ./.github/actions/setup-caches
+
+        - name: Install python dependencies
+          run: make requirements
+
+        - name: Install test dependencies
+          shell: bash
+          run: sudo apt-get install -y graphviz
+
+        # If we're checking a build-time dependency, install
+        # the dependency, and then try to build
+        - name: Install packages - ${{ matrix.packages }} (build time dependency check)
+          run: python -m pip install -U ${{ matrix.packages }}
+
+        - name: Python Wheel Steps - ${{ matrix.packages }} (build time dependency check)
+          run: make dist-py-cibw
+          env:
+            CIBW_BUILD: "cp39-manylinux*"  # keep in sync with matrix.python-version
+            CIBW_ENVIRONMENT_LINUX: CSP_MANYLINUX="ON" CCACHE_DIR="/host/home/runner/work/csp/csp/.ccache" VCPKG_DEFAULT_BINARY_CACHE="/host${{ env.VCPKG_DEFAULT_BINARY_CACHE }}" VCPKG_DOWNLOADS="/host${{ env.VCPKG_DOWNLOADS }}"
+            CIBW_BUILD_VERBOSITY: 3
+
+        - name: Move Wheel
+          run: mv dist/*.whl .
+
+        - name: Install wheel (build time dependency check)
+          run: python -m pip install -U *manylinux*.whl --target .
+
+        - name: Reinstall packages after wheel install - ${{ matrix.packages }} (build time dependency check)
+          run: python -m pip install -U ${{ matrix.packages }}
+
+        # Run tests to check the build-time dependencies
+        - name: Python Test Steps (build time dependency check)
+          run: make test
+
+    test_runtime_dependencies:
       needs:
         - initialize
         - build
@@ -659,10 +720,10 @@ jobs:
             - ubuntu-20.04
           python-version:
             - 3.9
-          package:
-            - "sqlalchemy>=2"
-            - "sqlalchemy<2"
-            - "numpy==1.19.5"
+          packages:
+            - '"sqlalchemy>=2"'
+            - '"sqlalchemy<2"'
+            - '"numpy==1.19.5"'
 
       runs-on: ${{ matrix.os }}
 
@@ -671,6 +732,7 @@ jobs:
           uses: actions/checkout@v4
           with:
               submodules: recursive
+              fetch-depth: 0
 
         - name: Set up Python ${{ matrix.python-version }}
           uses: ./.github/actions/setup-python
@@ -684,24 +746,26 @@ jobs:
           shell: bash
           run: sudo apt-get install graphviz
 
-        - name: Download wheel
+        - name: Download wheel (run time dependency check)
           uses: actions/download-artifact@v4
           with:
             name: csp-dist-${{ runner.os }}-${{ runner.arch }}-${{ matrix.python-version }}
 
-        - name: Install wheel
+        - name: Install wheel (run time dependency check)
           run: python -m pip install -U *manylinux*.whl --target .
 
-        - name: Install package - ${{ matrix.package }}
-          run: python -m pip install -U "${{ matrix.package }}"
+        - name: Install packages - ${{ matrix.packages }} (run time dependency check)
+          run: python -m pip install -U ${{ matrix.packages }}
 
-        - name: Python Test Steps
+        # Run tests to check dependencies
+        - name: Python Test Steps (run time dependency check)
           run: make test TEST_ARGS="-k TestDBReader"
-          if: ${{ contains( 'sqlalchemy', matrix.package )}}
+          if: ${{ contains( matrix.packages, 'sqlalchemy' )}}
 
-        - name: Python Test Steps
+        # For other dependency changes (e.g. numpy), run all tests
+        - name: Python Test Steps (run time dependency check)
           run: make test
-          if: ${{ contains( 'numpy', matrix.package )}}
+          if: ${{ contains( matrix.packages, 'numpy' )}}
 
     ###########################################################################################################
     #.........................................................................................................#
@@ -750,7 +814,8 @@ jobs:
         - build
         - test
         - test_sdist
-        - test_dependencies
+        - test_buildtime_dependencies
+        - test_runtime_dependencies
 
       if: startsWith(github.ref, 'refs/tags/v')
       runs-on: ubuntu-22.04
diff --git a/cpp/csp/python/Common.h b/cpp/csp/python/Common.h
index e8ecb0b1f..f73eab349 100644
--- a/cpp/csp/python/Common.h
+++ b/cpp/csp/python/Common.h
@@ -7,6 +7,22 @@
 
 #define INIT_PYDATETIME if( !PyDateTimeAPI ) { PyDateTime_IMPORT; }
 
+// NumPy 2.0 Migration
+#include <numpy/numpyconfig.h>
+
+#if NPY_ABI_VERSION >= 0x02000000
+// Feature flag for NumPy >= 2.0 builds, for anything that
+// can't be handled by the accessor macros defined below
+#define CSP_NUMPY_2
+
+#else
+
+// NumPy 1.x fallbacks for the NumPy 2.0 descriptor accessor macros
+#define PyDataType_ELSIZE( descr ) ( ( descr ) -> elsize )
+#define PyDataType_C_METADATA( descr ) ( ( descr ) -> c_metadata )
+
+#endif
+
 namespace csp::python
 {
 
diff --git a/cpp/csp/python/NumpyConversions.cpp b/cpp/csp/python/NumpyConversions.cpp
index 982544038..99859c448 100644
--- a/cpp/csp/python/NumpyConversions.cpp
+++ b/cpp/csp/python/NumpyConversions.cpp
@@ -3,6 +3,7 @@
 
 
 #include <csp/core/Time.h>
+#include <csp/python/Common.h>
 #include <csp/python/NumpyConversions.h>
 
 #include <locale>
@@ -59,7 +60,7 @@ int64_t scalingFromNumpyDtUnit( NPY_DATETIMEUNIT base )
 
 NPY_DATETIMEUNIT datetimeUnitFromDescr( PyArray_Descr* descr )
 {
-    PyArray_DatetimeDTypeMetaData* dtypeMeta = (PyArray_DatetimeDTypeMetaData*)(descr -> c_metadata);
+    PyArray_DatetimeDTypeMetaData* dtypeMeta = (PyArray_DatetimeDTypeMetaData*)( PyDataType_C_METADATA( descr ) );
     PyArray_DatetimeMetaData* dtMeta = &(dtypeMeta -> meta);
     return dtMeta -> base;
 }
@@ -68,7 +69,7 @@ static std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> wstr_converte
 
 void stringFromNumpyStr( void* data, std::string& out, char numpy_type, int elem_size_bytes )
 {
-    // strings from numpy arrays are fixed width and zero filled.  
+    // strings from numpy arrays are fixed width and zero filled.
     // if the last char is 0, can treat as null terminated, else use full width
 
     if( numpy_type == NPY_UNICODELTR)
@@ -87,7 +88,11 @@ void stringFromNumpyStr( void* data, std::string& out, char numpy_type, int elem
             out = wstr_converter.to_bytes( wstr );
         }
     }
+#ifdef CSP_NUMPY_2
+    else if( numpy_type == NPY_STRINGLTR )
+#else
     else if( numpy_type == NPY_STRINGLTR || numpy_type == NPY_STRINGLTR2 )
+#endif
     {
         const char * const raw_value = (const char *) data;
 
@@ -144,7 +149,9 @@ void validateNumpyTypeVsCspType( const CspTypePtr & type, char numpy_type_char )
             // everything works as object
             break;
         case NPY_STRINGLTR:
+#ifndef CSP_NUMPY_2
         case NPY_STRINGLTR2:
+#endif
         case NPY_UNICODELTR:
         case NPY_CHARLTR:
             if( cspType != csp::CspType::Type::STRING )
diff --git a/cpp/csp/python/NumpyConversions.h b/cpp/csp/python/NumpyConversions.h
index 04ce787b8..e4d22b547 100644
--- a/cpp/csp/python/NumpyConversions.h
+++ b/cpp/csp/python/NumpyConversions.h
@@ -204,7 +204,7 @@ inline PyObject * createNumpyArray( ValueType valueType, const csp::TimeSeriesPr
     T lastValue;
     if( ts -> valid() )
         lastValue = ts -> lastValueTyped<T>();
-    
+
     DateTime lastTime = ( ts -> valid() ? ts -> lastTime() : DateTime() );
     switch( valueType )
     {
@@ -219,7 +219,7 @@ inline PyObject * createNumpyArray( ValueType valueType, const csp::TimeSeriesPr
         case ValueType::TIMESTAMP_VALUE_TUPLE:
         {
             PyObject * tuple = PyTuple_New( 2 );
-            PyTuple_SET_ITEM( tuple, 0, adjustStartAndEndTime( as_nparray( ts, ts -> timeline(), lastTime, startIndex, 
+            PyTuple_SET_ITEM( tuple, 0, adjustStartAndEndTime( as_nparray( ts, ts -> timeline(), lastTime, startIndex,
                                         endIndex, extrapolateEnd ), startPolicy, endPolicy, startDt, endDt ) );
             PyTuple_SET_ITEM( tuple, 1, as_nparray( ts, ts -> dataline<T>(), lastValue, startIndex, endIndex, extrapolateEnd ) );
             return tuple;
diff --git a/cpp/csp/python/NumpyInputAdapter.h b/cpp/csp/python/NumpyInputAdapter.h
index 0d4bd7c8b..049546eaf 100644
--- a/cpp/csp/python/NumpyInputAdapter.h
+++ b/cpp/csp/python/NumpyInputAdapter.h
@@ -29,18 +29,18 @@ class NumpyCurveAccessor
         m_descr = nullptr;
     }
 
-    NumpyCurveAccessor( PyArrayObject * arr ) 
+    NumpyCurveAccessor( PyArrayObject * arr )
     {
         m_nd = PyArray_NDIM( arr );
         if( m_nd < 2 )
             CSP_THROW( csp::TypeError, "NumpyCurveAccessor is inefficient for a 1-D Numpy array: use PyArray_GETPTR1 to access indexed values" );
-        
+
         // Preprocess strides and dimensions
         npy_intp* strides = PyArray_STRIDES( arr );
         npy_intp* dims = PyArray_DIMS( arr );
         m_outerStride = strides[0];
         m_outerDim = dims[0];
-        m_innerStrides = strides + 1; 
+        m_innerStrides = strides + 1;
         m_innerDims = dims + 1;
 
         m_arr = arr;
@@ -58,7 +58,7 @@ class NumpyCurveAccessor
     {
         if( index >= m_outerDim )
             CSP_THROW( csp::TypeError, "Requested data index out of range in NumpyCurveAccessor" );
-        
+
         // Create a view to the (n-1) dimensional array with (n-1) potentially unnatural strides
         /*
         A note on reference counting for the subarray: NewFromDescr will *steal* a reference to the type descr,
@@ -87,7 +87,7 @@ class NumpyCurveAccessor
 private:
     char* m_data;
     int m_nd;
-    
+
     npy_intp m_outerStride;
     npy_intp m_outerDim;
     npy_intp* m_innerStrides;
@@ -103,7 +103,7 @@ class NumpyInputAdapter : public PullInputAdapter<T>
     using PyArrayObjectPtr = PyPtr<PyArrayObject>;
 
 public:
-    NumpyInputAdapter( Engine * engine, CspTypePtr & type, PyArrayObject * datetimes, 
+    NumpyInputAdapter( Engine * engine, CspTypePtr & type, PyArrayObject * datetimes,
                        PyArrayObject * values ) : PullInputAdapter<T>( engine, type, PushMode::LAST_VALUE ),
                                                   m_datetimes( PyArrayObjectPtr::incref( datetimes ) ),
                                                   m_values( PyArrayObjectPtr::incref( values ) ),
@@ -113,7 +113,7 @@ class NumpyInputAdapter : public PullInputAdapter<T>
         PyArray_Descr* vals_descr = PyArray_DESCR(m_values.ptr());
 
         m_size = static_cast<int>(PyArray_SIZE( datetimes ));
-        m_elem_size = vals_descr -> elsize;
+        m_elem_size = PyDataType_ELSIZE(vals_descr);
         m_val_type = vals_descr -> type;
 
         char out_type = m_val_type;
@@ -123,7 +123,7 @@ class NumpyInputAdapter : public PullInputAdapter<T>
             m_valueAccessor = std::make_unique<NumpyCurveAccessor>( m_values.ptr() );
         }
         validateNumpyTypeVsCspType( type, out_type );
-        
+
 
         auto dt_type = dts_descr -> type;
         if( dt_type != NPY_DATETIMELTR && dt_type != NPY_OBJECTLTR )
@@ -166,7 +166,7 @@ class NumpyInputAdapter : public PullInputAdapter<T>
 
             ++m_index;
         }
- 
+
         PullInputAdapter<T>::start( start, end );
     }
 
diff --git a/cpp/csp/python/adapters/parquetadapterimpl.cpp b/cpp/csp/python/adapters/parquetadapterimpl.cpp
index a7f41aaef..ebe76d168 100644
--- a/cpp/csp/python/adapters/parquetadapterimpl.cpp
+++ b/cpp/csp/python/adapters/parquetadapterimpl.cpp
@@ -303,7 +303,7 @@ class NumpyUnicodeArrayWriter : public TypedDialectGenericListWriterInterface<st
                                                          PyObject_Repr( ( PyObject * ) PyArray_DESCR( arrayObject ) ) ) );
         }
 
-        auto elementSize = PyArray_DESCR( arrayObject ) -> elsize;
+        auto elementSize = PyDataType_ELSIZE( PyArray_DESCR( arrayObject ) );
         auto ndim        = PyArray_NDIM( arrayObject );
 
         CSP_TRUE_OR_THROW_RUNTIME( ndim == 1, "While writing to parquet expected numpy array with 1 dimension" << " got " << ndim );
@@ -451,7 +451,7 @@ class NumpyUnicodeReaderImpl final : public TypedDialectGenericListReaderInterfa
     {
         auto arrayObject = reinterpret_cast<PyArrayObject *>(csp::python::toPythonBorrowed( list ));
         std::wstring_convert<std::codecvt_utf8<char32_t>,char32_t> converter;
-        auto elementSize = PyArray_DESCR( arrayObject ) -> elsize;
+        auto elementSize = PyDataType_ELSIZE( PyArray_DESCR( arrayObject ) );
         auto wideValue = converter.from_bytes( value );
         auto nElementsToCopy = std::min( int(elementSize / sizeof(char32_t)), int( wideValue.size() + 1 ) );
         std::copy_n( wideValue.c_str(), nElementsToCopy, reinterpret_cast<char32_t*>(PyArray_GETPTR1( arrayObject, index )) );
diff --git a/csp/tests/impl/test_pandas.py b/csp/tests/impl/test_pandas.py
index ae9d6e592..bed701247 100644
--- a/csp/tests/impl/test_pandas.py
+++ b/csp/tests/impl/test_pandas.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import unittest
 from datetime import datetime, timedelta
-from numpy import NaN
+from numpy import nan
 
 import csp
 from csp.impl.pandas import make_pandas
@@ -37,7 +37,7 @@ def test_make_pandas_basic(self):
         # - pandas is a bit inconsistent with whether or not it sets freq on the DateTimeIndex, so we drop for comparison.
         # - when there is missing data in an integer column, it uses float NaN and hence the column becomes float type
         idx = pd.DatetimeIndex([start + dt2, start + dt2 + dt1, start + dt2 + 2 * dt1])
-        target = pd.DataFrame({"x": [2, 3, 4], "y": [1.0, NaN, 2.0]}, index=idx)
+        target = pd.DataFrame({"x": [2, 3, 4], "y": [1.0, nan, 2.0]}, index=idx)
         out1[1][1].index.freq = None
         pd.testing.assert_frame_equal(out1[1][1], target)
 
@@ -51,7 +51,7 @@ def test_make_pandas_basic(self):
         idx = pd.DatetimeIndex(
             [start + dt2, start + dt2 + dt1, start + dt2 + 2 * dt1, start + dt2 + 3 * dt1, start + dt2 + 4 * dt1]
         )
-        target = pd.DataFrame({"x": [2, 3, 4, 5, 6], "y": [1.0, NaN, 2.0, NaN, 3.0]}, index=idx)
+        target = pd.DataFrame({"x": [2, 3, 4, 5, 6], "y": [1.0, nan, 2.0, nan, 3.0]}, index=idx)
         out1[2][1].index.freq = None
         pd.testing.assert_frame_equal(out1[2][1], target)
 
@@ -93,12 +93,12 @@ def test_make_pandas_window(self):
         # - pandas is a bit inconsistent with whether or not it sets freq on the DateTimeIndex, so we drop for comparison.
         # - when there is missing data in an integer column, it uses float NaN and hence the column becomes float type
         idx = pd.DatetimeIndex([start + dt2, start + dt2 + dt1, start + dt2 + 2 * dt1])
-        target = pd.DataFrame({"x": [NaN, 3.0, 4.0], "y": [1.0, NaN, 2.0]}, index=idx)
+        target = pd.DataFrame({"x": [nan, 3.0, 4.0], "y": [1.0, nan, 2.0]}, index=idx)
         out1[1][1].index.freq = None
         pd.testing.assert_frame_equal(out1[1][1], target)
 
         idx = pd.DatetimeIndex([start + dt2 + dt1, start + dt2 + 2 * dt1])
-        target = pd.DataFrame({"x": [3, 4], "y": [NaN, 2.0]}, index=idx)
+        target = pd.DataFrame({"x": [3, 4], "y": [nan, 2.0]}, index=idx)
         out2[1][1].index.freq = None
         pd.testing.assert_frame_equal(out2[1][1], target)
 
@@ -135,12 +135,12 @@ def test_make_pandas_init(self):
         pd.testing.assert_frame_equal(out1[0][1], target)
 
         idx = pd.DatetimeIndex([start + dt1])
-        target = pd.DataFrame({"x": [1], "y": [NaN]}, index=idx)
+        target = pd.DataFrame({"x": [1], "y": [nan]}, index=idx)
         pd.testing.assert_frame_equal(out1[1][1], target)
         pd.testing.assert_frame_equal(out1[2][1], target)
 
         idx = pd.DatetimeIndex([start + dt1, start + dt2])
-        target = pd.DataFrame({"x": [1, 2], "y": [NaN, 1.0]}, index=idx)
+        target = pd.DataFrame({"x": [1, 2], "y": [nan, 1.0]}, index=idx)
         pd.testing.assert_frame_equal(out1[3][1], target)
 
         ## out2