From b19fe809ea7324da14b7042eccc1db0b2b5049dc Mon Sep 17 00:00:00 2001 From: Tim Paine <3105306+timkpaine@users.noreply.github.com> Date: Sat, 13 Jul 2024 13:10:24 -0400 Subject: [PATCH] NumPy 2.0 build support Signed-off-by: Tim Paine <3105306+timkpaine@users.noreply.github.com> --- .github/actions/setup-dependencies/action.yml | 14 --- .github/workflows/build.yml | 93 ++++++++++++++++--- cpp/csp/python/Common.h | 16 ++++ cpp/csp/python/NumpyConversions.cpp | 11 ++- cpp/csp/python/NumpyConversions.h | 4 +- cpp/csp/python/NumpyInputAdapter.h | 18 ++-- .../python/adapters/parquetadapterimpl.cpp | 4 +- csp/tests/impl/test_pandas.py | 14 +-- 8 files changed, 124 insertions(+), 50 deletions(-) diff --git a/.github/actions/setup-dependencies/action.yml b/.github/actions/setup-dependencies/action.yml index eeaa95f94..261dc70f8 100644 --- a/.github/actions/setup-dependencies/action.yml +++ b/.github/actions/setup-dependencies/action.yml @@ -25,10 +25,6 @@ runs: ################ # Linux # NOTE: skip for manylinux image - # - name: Linux init steps - # shell: bash - # run: make dependencies-vcpkg - # if: ${{ runner.os == 'Linux' }} # skip ################ # Mac @@ -37,16 +33,6 @@ runs: run: make dependencies-mac if: ${{ runner.os == 'macOS' }} - # - name: Setup vcpkg cache in shell - # shell: bash - # run: | - # which -a gcc-12 - # echo "CC=/usr/local/bin/gcc-12" >> $GITHUB_ENV - # echo "CMAKE_C_COMPILER=/usr/local/bin/gcc-12" >> $GITHUB_ENV - # echo "CXX=/usr/local/bin/g++-12" >> $GITHUB_ENV - # echo "CMAKE_CXX_COMPILER=/usr/local/bin/g++-12" >> $GITHUB_ENV - # if: ${{ runner.os == 'macOS' }} - ################ # Windows - name: Windows init steps (vc143) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ebc00dcff..153ab7564 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -648,7 +648,68 @@ jobs: #################################################### # Test Dependencies/Regressions # #################################################### - test_dependencies: + test_buildtime_dependencies: + needs: + - initialize + strategy: + matrix: + os: + - ubuntu-20.04 + python-version: + - 3.9 + packages: + - '"numpy>=2" "pandas>=2.2" "pyarrow>=16.1"' + + runs-on: ${{ matrix.os }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + fetch-depth: 0 + + - name: Set up Python ${{ matrix.python-version }} + uses: ./.github/actions/setup-python + with: + version: '${{ matrix.python-version }}' + + - name: Set up Caches + uses: ./.github/actions/setup-caches + + - name: Install python dependencies + run: make requirements + + - name: Install test dependencies + shell: bash + run: sudo apt-get install graphviz + + # If we're checking a build-time dependency, install + # the dependency, and then try to build + - name: Install packages - ${{ matrix.packages }} (build time dependency check) + run: python -m pip install -U ${{ matrix.packages }} + + - name: Python Wheel Steps - ${{ matrix.packages }} (build time dependency check) + run: make dist-py-cibw + env: + CIBW_BUILD: "cp39-manylinux*" + CIBW_ENVIRONMENT_LINUX: CSP_MANYLINUX="ON" CCACHE_DIR="/host/home/runner/work/csp/csp/.ccache" VCPKG_DEFAULT_BINARY_CACHE="/host${{ env.VCPKG_DEFAULT_BINARY_CACHE }}" VCPKG_DOWNLOADS="/host${{ env.VCPKG_DOWNLOADS }}" + CIBW_BUILD_VERBOSITY: 3 + + - name: Move Wheel + run: mv dist/*.whl . + + - name: Install wheel (build time dependency check) + run: python -m pip install -U *manylinux*.whl --target . + + - name: Install packages - ${{ matrix.packages }} (build time dependency check) + run: python -m pip install -U ${{ matrix.packages }} + + # Run tests to check dependencies + - name: Python Test Steps (build time dependency check) + run: make test + + test_runtime_dependencies: needs: - initialize - build @@ -659,10 +720,10 @@ jobs: - ubuntu-20.04 python-version: - 3.9 - package: - - "sqlalchemy>=2" - - "sqlalchemy<2" - - "numpy==1.19.5" + packages: + - '"sqlalchemy>=2"' + - '"sqlalchemy<2"' + - '"numpy==1.19.5"' runs-on: ${{ matrix.os }} @@ -671,6 +732,7 @@ jobs: uses: actions/checkout@v4 with: submodules: recursive + fetch-depth: 0 - name: Set up Python ${{ matrix.python-version }} uses: ./.github/actions/setup-python @@ -684,24 +746,26 @@ jobs: shell: bash run: sudo apt-get install graphviz - - name: Download wheel + - name: Download wheel (run time dependency check) uses: actions/download-artifact@v4 with: name: csp-dist-${{ runner.os }}-${{ runner.arch }}-${{ matrix.python-version }} - - name: Install wheel + - name: Install wheel (run time dependency check) run: python -m pip install -U *manylinux*.whl --target . - - name: Install package - ${{ matrix.package }} - run: python -m pip install -U "${{ matrix.package }}" + - name: Install packages - ${{ matrix.packages }} (run time dependency check) + run: python -m pip install -U ${{ matrix.packages }} - - name: Python Test Steps + # Run tests to check dependencies + - name: Python Test Steps (run time dependency check) run: make test TEST_ARGS="-k TestDBReader" - if: ${{ contains( 'sqlalchemy', matrix.package )}} + if: ${{ contains( matrix.packages, 'sqlalchemy' )}} - - name: Python Test Steps + # For e.g. numpy dep changes, run all tests + - name: Python Test Steps (run time dependency check) run: make test - if: ${{ contains( 'numpy', matrix.package )}} + if: ${{ contains( matrix.packages, 'numpy' )}} ########################################################################################################### #.........................................................................................................# @@ -750,7 +814,8 @@ jobs: - build - test - test_sdist - - test_dependencies + - test_buildtime_dependencies + - test_runtime_dependencies if: startsWith(github.ref, 'refs/tags/v') runs-on: ubuntu-22.04 diff --git a/cpp/csp/python/Common.h b/cpp/csp/python/Common.h index e8ecb0b1f..f73eab349 100644 --- a/cpp/csp/python/Common.h +++ b/cpp/csp/python/Common.h @@ -7,6 +7,22 @@ #define INIT_PYDATETIME if( !PyDateTimeAPI ) { PyDateTime_IMPORT; } +// NumPy 2.0 Migration +#include + +#if NPY_ABI_VERSION >= 0x02000000 +// Define helper for anything that can't +// be handled by the below helper macros +#define CSP_NUMPY_2 + +#else + +// Numpy 2.0 helpers +#define PyDataType_ELSIZE( descr ) ( ( descr ) -> elsize ) +#define PyDataType_C_METADATA( descr ) ( ( descr ) -> c_metadata ) + +#endif + namespace csp::python { diff --git a/cpp/csp/python/NumpyConversions.cpp b/cpp/csp/python/NumpyConversions.cpp index 982544038..99859c448 100644 --- a/cpp/csp/python/NumpyConversions.cpp +++ b/cpp/csp/python/NumpyConversions.cpp @@ -3,6 +3,7 @@ #include +#include #include #include @@ -59,7 +60,7 @@ int64_t scalingFromNumpyDtUnit( NPY_DATETIMEUNIT base ) NPY_DATETIMEUNIT datetimeUnitFromDescr( PyArray_Descr* descr ) { - PyArray_DatetimeDTypeMetaData* dtypeMeta = (PyArray_DatetimeDTypeMetaData*)(descr -> c_metadata); + PyArray_DatetimeDTypeMetaData* dtypeMeta = (PyArray_DatetimeDTypeMetaData*)( PyDataType_C_METADATA( descr ) ); PyArray_DatetimeMetaData* dtMeta = &(dtypeMeta -> meta); return dtMeta -> base; } @@ -68,7 +69,7 @@ static std::wstring_convert, char32_t> wstr_converte void stringFromNumpyStr( void* data, std::string& out, char numpy_type, int elem_size_bytes ) { - // strings from numpy arrays are fixed width and zero filled. + // strings from numpy arrays are fixed width and zero filled. // if the last char is 0, can treat as null terminated, else use full width if( numpy_type == NPY_UNICODELTR) @@ -87,7 +88,11 @@ void stringFromNumpyStr( void* data, std::string& out, char numpy_type, int elem out = wstr_converter.to_bytes( wstr ); } } +#ifdef CSP_NUMPY_2 + else if( numpy_type == NPY_STRINGLTR ) +#else else if( numpy_type == NPY_STRINGLTR || numpy_type == NPY_STRINGLTR2 ) +#endif { const char * const raw_value = (const char *) data; @@ -144,7 +149,9 @@ void validateNumpyTypeVsCspType( const CspTypePtr & type, char numpy_type_char ) // everything works as object break; case NPY_STRINGLTR: +#ifndef CSP_NUMPY_2 case NPY_STRINGLTR2: +#endif case NPY_UNICODELTR: case NPY_CHARLTR: if( cspType != csp::CspType::Type::STRING ) diff --git a/cpp/csp/python/NumpyConversions.h b/cpp/csp/python/NumpyConversions.h index 04ce787b8..e4d22b547 100644 --- a/cpp/csp/python/NumpyConversions.h +++ b/cpp/csp/python/NumpyConversions.h @@ -204,7 +204,7 @@ inline PyObject * createNumpyArray( ValueType valueType, const csp::TimeSeriesPr T lastValue; if( ts -> valid() ) lastValue = ts -> lastValueTyped(); - + DateTime lastTime = ( ts -> valid() ? ts -> lastTime() : DateTime() ); switch( valueType ) { @@ -219,7 +219,7 @@ inline PyObject * createNumpyArray( ValueType valueType, const csp::TimeSeriesPr case ValueType::TIMESTAMP_VALUE_TUPLE: { PyObject * tuple = PyTuple_New( 2 ); - PyTuple_SET_ITEM( tuple, 0, adjustStartAndEndTime( as_nparray( ts, ts -> timeline(), lastTime, startIndex, + PyTuple_SET_ITEM( tuple, 0, adjustStartAndEndTime( as_nparray( ts, ts -> timeline(), lastTime, startIndex, endIndex, extrapolateEnd ), startPolicy, endPolicy, startDt, endDt ) ); PyTuple_SET_ITEM( tuple, 1, as_nparray( ts, ts -> dataline(), lastValue, startIndex, endIndex, extrapolateEnd ) ); return tuple; diff --git a/cpp/csp/python/NumpyInputAdapter.h b/cpp/csp/python/NumpyInputAdapter.h index 0d4bd7c8b..049546eaf 100644 --- a/cpp/csp/python/NumpyInputAdapter.h +++ b/cpp/csp/python/NumpyInputAdapter.h @@ -29,18 +29,18 @@ class NumpyCurveAccessor m_descr = nullptr; } - NumpyCurveAccessor( PyArrayObject * arr ) + NumpyCurveAccessor( PyArrayObject * arr ) { m_nd = PyArray_NDIM( arr ); if( m_nd < 2 ) CSP_THROW( csp::TypeError, "NumpyCurveAccessor is inefficient for a 1-D Numpy array: use PyArray_GETPTR1 to access indexed values" ); - + // Preprocess strides and dimensions npy_intp* strides = PyArray_STRIDES( arr ); npy_intp* dims = PyArray_DIMS( arr ); m_outerStride = strides[0]; m_outerDim = dims[0]; - m_innerStrides = strides + 1; + m_innerStrides = strides + 1; m_innerDims = dims + 1; m_arr = arr; @@ -58,7 +58,7 @@ class NumpyCurveAccessor { if( index >= m_outerDim ) CSP_THROW( csp::TypeError, "Requested data index out of range in NumpyCurveAccessor" ); - + // Create a view to the (n-1) dimensional array with (n-1) potentially unnatural strides /* A note on reference counting for the subarray: NewFromDescr will *steal* a reference to the type descr, @@ -87,7 +87,7 @@ class NumpyCurveAccessor private: char* m_data; int m_nd; - + npy_intp m_outerStride; npy_intp m_outerDim; npy_intp* m_innerStrides; @@ -103,7 +103,7 @@ class NumpyInputAdapter : public PullInputAdapter using PyArrayObjectPtr = PyPtr; public: - NumpyInputAdapter( Engine * engine, CspTypePtr & type, PyArrayObject * datetimes, + NumpyInputAdapter( Engine * engine, CspTypePtr & type, PyArrayObject * datetimes, PyArrayObject * values ) : PullInputAdapter( engine, type, PushMode::LAST_VALUE ), m_datetimes( PyArrayObjectPtr::incref( datetimes ) ), m_values( PyArrayObjectPtr::incref( values ) ), @@ -113,7 +113,7 @@ class NumpyInputAdapter : public PullInputAdapter PyArray_Descr* vals_descr = PyArray_DESCR(m_values.ptr()); m_size = static_cast(PyArray_SIZE( datetimes )); - m_elem_size = vals_descr -> elsize; + m_elem_size = PyDataType_ELSIZE(vals_descr); m_val_type = vals_descr -> type; char out_type = m_val_type; @@ -123,7 +123,7 @@ class NumpyInputAdapter : public PullInputAdapter m_valueAccessor = std::make_unique( m_values.ptr() ); } validateNumpyTypeVsCspType( type, out_type ); - + auto dt_type = dts_descr -> type; if( dt_type != NPY_DATETIMELTR && dt_type != NPY_OBJECTLTR ) @@ -166,7 +166,7 @@ class NumpyInputAdapter : public PullInputAdapter ++m_index; } - + PullInputAdapter::start( start, end ); } diff --git a/cpp/csp/python/adapters/parquetadapterimpl.cpp b/cpp/csp/python/adapters/parquetadapterimpl.cpp index a7f41aaef..ebe76d168 100644 --- a/cpp/csp/python/adapters/parquetadapterimpl.cpp +++ b/cpp/csp/python/adapters/parquetadapterimpl.cpp @@ -303,7 +303,7 @@ class NumpyUnicodeArrayWriter : public TypedDialectGenericListWriterInterface elsize; + auto elementSize = PyDataType_ELSIZE( PyArray_DESCR( arrayObject ) ); auto ndim = PyArray_NDIM( arrayObject ); CSP_TRUE_OR_THROW_RUNTIME( ndim == 1, "While writing to parquet expected numpy array with 1 dimension" << " got " << ndim ); @@ -451,7 +451,7 @@ class NumpyUnicodeReaderImpl final : public TypedDialectGenericListReaderInterfa { auto arrayObject = reinterpret_cast(csp::python::toPythonBorrowed( list )); std::wstring_convert,char32_t> converter; - auto elementSize = PyArray_DESCR( arrayObject ) -> elsize; + auto elementSize = PyDataType_ELSIZE( PyArray_DESCR( arrayObject ) ); auto wideValue = converter.from_bytes( value ); auto nElementsToCopy = std::min( int(elementSize / sizeof(char32_t)), int( wideValue.size() + 1 ) ); std::copy_n( wideValue.c_str(), nElementsToCopy, reinterpret_cast(PyArray_GETPTR1( arrayObject, index )) ); diff --git a/csp/tests/impl/test_pandas.py b/csp/tests/impl/test_pandas.py index ae9d6e592..bed701247 100644 --- a/csp/tests/impl/test_pandas.py +++ b/csp/tests/impl/test_pandas.py @@ -1,7 +1,7 @@ import pandas as pd import unittest from datetime import datetime, timedelta -from numpy import NaN +from numpy import nan import csp from csp.impl.pandas import make_pandas @@ -37,7 +37,7 @@ def test_make_pandas_basic(self): # - pandas is a bit inconsistent with whether or not it sets freq on the DateTimeIndex, so we drop for comparison. # - when there is missing data in an integer column, it uses float NaN and hence the column becomes float type idx = pd.DatetimeIndex([start + dt2, start + dt2 + dt1, start + dt2 + 2 * dt1]) - target = pd.DataFrame({"x": [2, 3, 4], "y": [1.0, NaN, 2.0]}, index=idx) + target = pd.DataFrame({"x": [2, 3, 4], "y": [1.0, nan, 2.0]}, index=idx) out1[1][1].index.freq = None pd.testing.assert_frame_equal(out1[1][1], target) @@ -51,7 +51,7 @@ def test_make_pandas_basic(self): idx = pd.DatetimeIndex( [start + dt2, start + dt2 + dt1, start + dt2 + 2 * dt1, start + dt2 + 3 * dt1, start + dt2 + 4 * dt1] ) - target = pd.DataFrame({"x": [2, 3, 4, 5, 6], "y": [1.0, NaN, 2.0, NaN, 3.0]}, index=idx) + target = pd.DataFrame({"x": [2, 3, 4, 5, 6], "y": [1.0, nan, 2.0, nan, 3.0]}, index=idx) out1[2][1].index.freq = None pd.testing.assert_frame_equal(out1[2][1], target) @@ -93,12 +93,12 @@ def test_make_pandas_window(self): # - pandas is a bit inconsistent with whether or not it sets freq on the DateTimeIndex, so we drop for comparison. # - when there is missing data in an integer column, it uses float NaN and hence the column becomes float type idx = pd.DatetimeIndex([start + dt2, start + dt2 + dt1, start + dt2 + 2 * dt1]) - target = pd.DataFrame({"x": [NaN, 3.0, 4.0], "y": [1.0, NaN, 2.0]}, index=idx) + target = pd.DataFrame({"x": [nan, 3.0, 4.0], "y": [1.0, nan, 2.0]}, index=idx) out1[1][1].index.freq = None pd.testing.assert_frame_equal(out1[1][1], target) idx = pd.DatetimeIndex([start + dt2 + dt1, start + dt2 + 2 * dt1]) - target = pd.DataFrame({"x": [3, 4], "y": [NaN, 2.0]}, index=idx) + target = pd.DataFrame({"x": [3, 4], "y": [nan, 2.0]}, index=idx) out2[1][1].index.freq = None pd.testing.assert_frame_equal(out2[1][1], target) @@ -135,12 +135,12 @@ def test_make_pandas_init(self): pd.testing.assert_frame_equal(out1[0][1], target) idx = pd.DatetimeIndex([start + dt1]) - target = pd.DataFrame({"x": [1], "y": [NaN]}, index=idx) + target = pd.DataFrame({"x": [1], "y": [nan]}, index=idx) pd.testing.assert_frame_equal(out1[1][1], target) pd.testing.assert_frame_equal(out1[2][1], target) idx = pd.DatetimeIndex([start + dt1, start + dt2]) - target = pd.DataFrame({"x": [1, 2], "y": [NaN, 1.0]}, index=idx) + target = pd.DataFrame({"x": [1, 2], "y": [nan, 1.0]}, index=idx) pd.testing.assert_frame_equal(out1[3][1], target) ## out2