diff --git a/Common/include/linear_algebra/CPreconditioner.hpp b/Common/include/linear_algebra/CPreconditioner.hpp index e4fc7cf159f..4eff5f947fb 100644 --- a/Common/include/linear_algebra/CPreconditioner.hpp +++ b/Common/include/linear_algebra/CPreconditioner.hpp @@ -77,6 +77,18 @@ class CPreconditioner { template CPreconditioner::~CPreconditioner() {} +/*! + * \class CIdentityPreconditioner + * \brief No-op preconditioner used when Krylov solvers run without preconditioning. + */ +template +class CIdentityPreconditioner final : public CPreconditioner { + public: + inline void operator()(const CSysVector& u, CSysVector& v) const override { v = u; } + + inline bool IsIdentity() const override { return true; } +}; + /*! * \class CJacobiPreconditioner * \brief Specialization of preconditioner that uses CSysMatrix class. @@ -332,6 +344,9 @@ CPreconditioner* CPreconditioner::Create(ENUM_LINEAR_SOL CPreconditioner* prec = nullptr; switch (kind) { + case NO_PRECONDITIONER: + prec = new CIdentityPreconditioner(); + break; case JACOBI: prec = new CJacobiPreconditioner(jacobian, geometry, config); break; diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp index ecb26959a06..6368670587e 100644 --- a/Common/include/linear_algebra/CSysMatrix.hpp +++ b/Common/include/linear_algebra/CSysMatrix.hpp @@ -108,6 +108,16 @@ struct CSysMatrixComms { MPI_QUANTITIES commType = MPI_QUANTITIES::SOLUTION_MATRIX); }; +/*! + * \brief Opaque storage for CUDA/cuSPARSE resources used by the GPU SpMV path. + */ +struct CudaSpMVResources; + +/*! + * \brief Release cached CUDA/cuSPARSE resources used by the GPU SpMV path. + */ +void ReleaseCudaSpMVResources(CudaSpMVResources*& resources); + /*! * \class CSysMatrix * \ingroup SpLinSys @@ -148,8 +158,10 @@ class CSysMatrix { ScalarType* d_matrix; /*!< \brief Device Pointer to store the matrix values on the GPU. */ const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */ const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */ + ScalarType* d_invM = nullptr; /*!< \brief Device inverse diagonal blocks for the Jacobi preconditioner. */ bool useCuda = false; /*!< \brief Boolean that indicates whether user has enabled CUDA or not. Mainly used to conditionally free GPU memory in the class destructor. */ + mutable CudaSpMVResources* spmv_resources = nullptr; /*!< \brief Cached cuSPARSE resources for GPU SpMV. */ ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */ unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). */ @@ -924,6 +936,13 @@ class CSysMatrix { */ void BuildJacobiPreconditioner(); + /*! + * \brief Build the Jacobi preconditioner on the GPU/device side. + * \note This helper is intended as the implementation hook for GPU-resident Krylov solvers. + * The actual implementation belongs in CSysMatrixGPU.cu. + */ + void BuildJacobiPreconditionerGPU(); + /*! * \brief Multiply CSysVector by the preconditioner * \param[in] vec - CSysVector to be multiplied by the preconditioner. @@ -934,6 +953,14 @@ class CSysMatrix { void ComputeJacobiPreconditioner(const CSysVector& vec, CSysVector& prod, CGeometry* geometry, const CConfig* config) const; + /*! + * \brief Apply the Jacobi preconditioner on the GPU/device side. + * \note This helper is intended as the implementation hook for GPU-resident Krylov solvers. + * The actual implementation belongs in CSysMatrixGPU.cu. + */ + void ComputeJacobiPreconditionerGPU(const CSysVector& vec, CSysVector& prod, + CGeometry* geometry, const CConfig* config) const; + /*! * \brief Build the ILU preconditioner. */ diff --git a/Common/include/linear_algebra/CSysSolve.hpp b/Common/include/linear_algebra/CSysSolve.hpp index 2ea3cbf7df3..af76e93c07d 100644 --- a/Common/include/linear_algebra/CSysSolve.hpp +++ b/Common/include/linear_algebra/CSysSolve.hpp @@ -304,6 +304,20 @@ class CSysSolve { ScalarType& residual, bool monitoring, const CConfig* config, FgcrodrMode mode, unsigned long custom_m) const; + template + unsigned long FGMRES_LinSolverImpl(const VectorType& b, VectorType& x, const ProductType& mat_vec, + const PrecondType& precond, ScalarType tol, unsigned long m, ScalarType& residual, + bool monitoring, const CConfig* config, const FGMRESOps& ops) const; + + /*! + * \brief CUDA/GPU backend wrapper for the Flexible GMRES solver. + * \note The algorithmic implementation is shared with the host solver. This helper + * only provides GPU vector-operation dispatch and synchronization. + */ + unsigned long FGMRES_LinSolver_GPU(const VectorType& b, VectorType& x, const ProductType& mat_vec, + const PrecondType& precond, ScalarType tol, unsigned long m, ScalarType& residual, + bool monitoring, const CConfig* config) const; + /*! * \brief Creates the inner solver for nested preconditioning if the settings allow it. * \returns True if the inner solver can be used. diff --git a/Common/include/linear_algebra/CSysSolveFGMRES.inl b/Common/include/linear_algebra/CSysSolveFGMRES.inl new file mode 100644 index 00000000000..cef424a729b --- /dev/null +++ b/Common/include/linear_algebra/CSysSolveFGMRES.inl @@ -0,0 +1,267 @@ +/*! + * \file CSysSolveFGMRES.inl + * \brief Shared implementation of the Flexible GMRES linear solver. + * \author Jesse Li + * \version 8.5.0 "Harrier" + * + * SU2 Project Website: https://su2code.github.io + * + * The SU2 Project is maintained by the SU2 Foundation + * (http://su2foundation.org) + * + * Copyright 2012-2026, SU2 Contributors (cf. AUTHORS.md) + * + * SU2 is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * SU2 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with SU2. If not, see . + */ + +namespace fgmres_impl { + +template +bool Orthogonalize(unsigned long i, su2matrix& H, std::vector>& V, + const FGMRESOps& ops) { + for (unsigned long k = 0; k <= i; k++) { + H(k, i) = ops.Dot(V[k], V[i + 1]); + ops.AddScaled(V[i + 1], -H(k, i), V[k]); + } + + for (unsigned long k = 0; k <= i; k++) { + const ScalarType dh = ops.Dot(V[k], V[i + 1]); + H(k, i) += dh; + ops.AddScaled(V[i + 1], -dh, V[k]); + } + + const ScalarType nrm = ops.Norm(V[i + 1]); + if (nrm <= ScalarType(0) || nrm != nrm) { + H(i + 1, i) = ScalarType(0); + return false; + } + + H(i + 1, i) = nrm; + ops.Divide(V[i + 1], nrm); + return true; +} + +} // namespace fgmres_impl + +template +template +unsigned long CSysSolve::FGMRES_LinSolverImpl(const VectorType& b, VectorType& x, + const ProductType& mat_vec, const PrecondType& precond, + ScalarType tol, unsigned long m, ScalarType& residual, + bool monitoring, const CConfig* config, + const FGMRESOps& ops) const { + SU2_ZONE_SCOPED + + const bool masterRank = (SU2_MPI::GetRank() == MASTER_NODE); + const bool flexible = !precond.IsIdentity(); + const bool nestedParallel = ops.NestedParallel(); + + /*--- Check the subspace size. ---*/ + + if (m < 1) { + SU2_MPI::Error("Number of linear solver iterations must be greater than 0.", CURRENT_FUNCTION); + } + + if (m > 1000) { + SU2_MPI::Error("FGMRES subspace is too large (>1000).", CURRENT_FUNCTION); + } + + /*--- Allocate if not allocated yet. ---*/ + + if (V.size() <= m || (flexible && Z.size() <= m)) { + BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS { + V.resize(m + 1); + for (auto& w : V) w.Initialize(x.GetNBlk(), x.GetNBlkDomain(), x.GetNVar(), nullptr); + if (flexible) { + Z.resize(m + 1); + for (auto& z : Z) z.Initialize(x.GetNBlk(), x.GetNBlkDomain(), x.GetNVar(), nullptr); + } + } + END_SU2_OMP_SAFE_GLOBAL_ACCESS + } + + /*--- Define various arrays. In parallel, each thread of each rank has and works + on its own thread, since calculations on these arrays are based on dot products + (reduced across all threads and ranks) all threads do the same computations. ---*/ + + su2vector g(m + 1), sn(m + 1), cs(m + 1), y(m); + g = ScalarType(0); + sn = ScalarType(0); + cs = ScalarType(0); + y = ScalarType(0); + + su2matrix H(m + 1, m); + H = ScalarType(0); + + ops.PrepareInput(b, x, xIsZero); + + /*--- Calculate the norm of the rhs vector. ---*/ + + ScalarType norm0 = ops.Norm(b); + + /*--- Calculate the initial residual (actually the negative residual) and compute its norm. ---*/ + + if (!xIsZero) { + mat_vec(x, V[0]); + ops.Subtract(V[0], b); + } else { + ops.AssignNegative(V[0], b); + } + + ScalarType beta = xIsZero ? norm0 : ops.Norm(V[0]); + + /*--- FGMRES shares V and Z with FGCRODR, reset deflation because we're breaking + * relations between subspaces that FGCRODR relies on for recycling. ---*/ + + SU2_OMP_MASTER + ResetDeflation(); + END_SU2_OMP_MASTER + + /*--- Set the norm to the initial initial residual value. ---*/ + + if (tol_type == LinearToleranceType::RELATIVE) norm0 = beta; + + if (beta < tol * norm0 || beta < eps) { + /*--- System is already solved. ---*/ + + if (masterRank) { + SU2_OMP_MASTER + cout << "CSysSolve::FGMRES(): system solved by initial guess." << endl; + END_SU2_OMP_MASTER + } + residual = beta; + ops.FinalizeOutput(x); + return 0; + } + + /*--- Normalize residual to get w_{0} (the negative sign is because w[0] + holds the negative residual, as mentioned above). ---*/ + + ops.Divide(V[0], -beta); + + /*--- Initialize the RHS of the reduced system. ---*/ + + g[0] = beta; + + /*--- Output header information including initial residual. ---*/ + + unsigned long i = 0; + if (monitoring && masterRank) { + SU2_OMP_MASTER { + WriteHeader("FGMRES", tol, beta); + WriteHistory(i, beta / norm0); + } + END_SU2_OMP_MASTER + } + + /*--- Loop over all search directions. ---*/ + + for (i = 0; i < m; i++) { + /*--- Check if solution has converged. ---*/ + + if (beta < tol * norm0) break; + + if (flexible) { + /*--- Precondition the CSysVector v[i] and store result in z[i]. ---*/ + + precond(V[i], Z[i]); + + /*--- Add to Krylov subspace. ---*/ + + mat_vec(Z[i], V[i + 1]); + } else { + mat_vec(V[i], V[i + 1]); + } + + /*--- Modified Gram-Schmidt orthogonalization. ---*/ + + bool orthog_ok; + if constexpr (FGMRESOps::UseSolverModGramSchmidt) { + if (nestedParallel) { + /*--- "omp parallel if" does not work well here. ---*/ + SU2_OMP_PARALLEL + orthog_ok = ModGramSchmidt(true, i, H, V); + END_SU2_OMP_PARALLEL + } else { + orthog_ok = ModGramSchmidt(false, i, H, V); + } + } else { + orthog_ok = fgmres_impl::Orthogonalize(i, H, V, ops); + } + + if (!orthog_ok) { + if (masterRank) { + SU2_OMP_MASTER + cout << "WARNING: FGMRES orthogonalization failed, linear solver diverged." << endl; + END_SU2_OMP_MASTER + } + break; + } + + /*--- Apply old Givens rotations to new column of the Hessenberg matrix then generate the + new Givens rotation matrix and apply it to the last two elements of H[:][i] and g. ---*/ + + for (unsigned long k = 0; k < i; k++) ApplyGivens(sn[k], cs[k], H(k, i), H(k + 1, i)); + GenerateGivens(H(i, i), H(i + 1, i), sn[i], cs[i]); + ApplyGivens(sn[i], cs[i], g[i], g[i + 1]); + + /*--- Set L2 norm of residual and check if solution has converged. ---*/ + + beta = fabs(g[i + 1]); + + /*--- Output the relative residual if necessary. ---*/ + + if (monitoring && masterRank && ((i + 1) % monitorFreq == 0)) { + SU2_OMP_MASTER + WriteHistory(i + 1, beta / norm0); + END_SU2_OMP_MASTER + } + } + + /*--- Solve the least-squares system and update solution. ---*/ + + SolveReduced(i, H, g, y); + + const auto& basis = flexible ? Z : V; + + ops.UpdateSolution(i, basis, y, x, nestedParallel); + ops.FinalizeOutput(x); + + /*--- Recalculate final (neg.) residual (this should be optional). ---*/ + + if (monitoring && config->GetComm_Level() == COMM_FULL) { + if (masterRank) { + SU2_OMP_MASTER + WriteFinalResidual("FGMRES", i, beta / norm0); + END_SU2_OMP_MASTER + } + + if (recomputeRes) { + mat_vec(x, V[0]); + ops.Subtract(V[0], b); + ScalarType res = ops.Norm(V[0]); + + if (fabs(res - beta) > tol * 10) { + if (masterRank) { + SU2_OMP_MASTER + WriteWarning(beta, res, tol); + END_SU2_OMP_MASTER + } + } + } + } + + residual = beta / norm0; + return i; +} diff --git a/Common/include/linear_algebra/CSysVector.hpp b/Common/include/linear_algebra/CSysVector.hpp index 1498b549bbb..d67185156b9 100644 --- a/Common/include/linear_algebra/CSysVector.hpp +++ b/Common/include/linear_algebra/CSysVector.hpp @@ -37,6 +37,10 @@ #include "vector_expressions.hpp" #include "../../include/CConfig.hpp" +#ifdef __CUDACC__ +#include "GPUComms.cuh" +#endif + /*! * \brief OpenMP worksharing construct used in CSysVector for loops. * \note The loop will only run in parallel if methods are called from a @@ -59,6 +63,116 @@ #define END_CSYSVEC_PARFOR #endif +namespace VecExpr { + +enum class DeviceAssignOp { Assign, Add, Subtract, Multiply, Divide }; + +template +class CDeviceVectorView : public CVecExpr, Scalar> { + private: + Scalar* data = nullptr; + unsigned long size = 0; + + public: + static constexpr bool StoreAsRef = false; + + CDeviceVectorView(Scalar* data_, unsigned long size_) : data(data_), size(size_) {} + +#ifdef __CUDACC__ + __device__ const Scalar& operator[](size_t i) const { return data[i]; } + + template + CDeviceVectorView& operator=(const CVecExpr& expr); + + CDeviceVectorView& operator=(Scalar val); + +#define MAKE_DEVICE_COMPOUND(OP) \ + template \ + CDeviceVectorView& operator OP(const CVecExpr& expr); + MAKE_DEVICE_COMPOUND(+=) + MAKE_DEVICE_COMPOUND(-=) + MAKE_DEVICE_COMPOUND(*=) + MAKE_DEVICE_COMPOUND(/=) +#undef MAKE_DEVICE_COMPOUND + +#define MAKE_DEVICE_SCALAR_COMPOUND(OP) CDeviceVectorView& operator OP(Scalar val); + MAKE_DEVICE_SCALAR_COMPOUND(+=) + MAKE_DEVICE_SCALAR_COMPOUND(-=) + MAKE_DEVICE_SCALAR_COMPOUND(*=) + MAKE_DEVICE_SCALAR_COMPOUND(/=) +#undef MAKE_DEVICE_SCALAR_COMPOUND +#endif +}; + +#ifdef __CUDACC__ +template +__global__ void DeviceAssignKernel(Scalar* data, unsigned long size, T expr) { + const unsigned long i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= size) return; + + if constexpr (Op == DeviceAssignOp::Assign) { + data[i] = expr[i]; + } else if constexpr (Op == DeviceAssignOp::Add) { + data[i] += expr[i]; + } else if constexpr (Op == DeviceAssignOp::Subtract) { + data[i] -= expr[i]; + } else if constexpr (Op == DeviceAssignOp::Multiply) { + data[i] *= expr[i]; + } else { + data[i] /= expr[i]; + } +} + +template +inline void AssignDeviceExpression(Scalar* data, unsigned long size, const CVecExpr& expr) { + if (size == 0) return; + constexpr unsigned block_size = 256; + const auto grid_size = static_cast((size + block_size - 1) / block_size); + DeviceAssignKernel<<>>(data, size, expr.derived()); + gpuErrChk(cudaPeekAtLastError()); +} + +template +template +CDeviceVectorView& CDeviceVectorView::operator=(const CVecExpr& expr) { + AssignDeviceExpression(data, size, expr); + return *this; +} + +template +CDeviceVectorView& CDeviceVectorView::operator=(Scalar val) { + AssignDeviceExpression(data, size, Bcast(val)); + return *this; +} + +#define MAKE_DEVICE_COMPOUND(OP, ASSIGN_OP) \ + template \ + template \ + CDeviceVectorView& CDeviceVectorView::operator OP(const CVecExpr& expr) { \ + AssignDeviceExpression(data, size, expr); \ + return *this; \ + } +MAKE_DEVICE_COMPOUND(+=, DeviceAssignOp::Add) +MAKE_DEVICE_COMPOUND(-=, DeviceAssignOp::Subtract) +MAKE_DEVICE_COMPOUND(*=, DeviceAssignOp::Multiply) +MAKE_DEVICE_COMPOUND(/=, DeviceAssignOp::Divide) +#undef MAKE_DEVICE_COMPOUND + +#define MAKE_DEVICE_SCALAR_COMPOUND(OP, ASSIGN_OP) \ + template \ + CDeviceVectorView& CDeviceVectorView::operator OP(Scalar val) { \ + AssignDeviceExpression(data, size, Bcast(val)); \ + return *this; \ + } +MAKE_DEVICE_SCALAR_COMPOUND(+=, DeviceAssignOp::Add) +MAKE_DEVICE_SCALAR_COMPOUND(-=, DeviceAssignOp::Subtract) +MAKE_DEVICE_SCALAR_COMPOUND(*=, DeviceAssignOp::Multiply) +MAKE_DEVICE_SCALAR_COMPOUND(/=, DeviceAssignOp::Divide) +#undef MAKE_DEVICE_SCALAR_COMPOUND +#endif + +} // namespace VecExpr + /*! * \class CSysVector * \ingroup SpLinSys @@ -235,16 +349,32 @@ class CSysVector : public VecExpr::CVecExpr, ScalarType> void DtHTransfer(bool trigger = true) const; /*! - * \brief Sets all the elements of the GPU vector to a certain value - * \param[in] trigger - boolean value that decides whether to conduct the transfer or not. True by default. + * \brief Dot product between this vector and another vector on the device. + * \note Explicit GPU helper for solver-side reductions. + * \param[in] other - Input vector. + * \return Dot product result. */ - void GPUSetVal(ScalarType val, bool trigger = true) const; + ScalarType GPUDot(const CSysVector& other) const; + + /*! + * \brief L2 norm of this vector on the device. + * \note Explicit GPU helper for solver-side reductions. + * \return L2 norm result. + */ + ScalarType GPUNorm() const; /*! * \brief return device pointer that points to the CSysVector values in GPU memory */ inline ScalarType* GetDevicePointer() const { return d_vec_val; } + /*! + * \brief Return an expression-template view of the device values. + */ + inline VecExpr::CDeviceVectorView device() const { + return VecExpr::CDeviceVectorView(d_vec_val, nElm); + } + /*! * \brief return the number of local elements in the CSysVector */ diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh index 13854381877..eb727477f21 100644 --- a/Common/include/linear_algebra/GPUComms.cuh +++ b/Common/include/linear_algebra/GPUComms.cuh @@ -25,6 +25,11 @@ * License along with SU2. If not, see . */ +#pragma once + +#ifndef SU2_COMMON_LINEAR_ALGEBRA_GPUCOMMS_CUH +#define SU2_COMMON_LINEAR_ALGEBRA_GPUCOMMS_CUH + #include #include @@ -51,3 +56,5 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t } #define gpuErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); } + +#endif // SU2_COMMON_LINEAR_ALGEBRA_GPUCOMMS_CUH diff --git a/Common/include/linear_algebra/vector_expressions.hpp b/Common/include/linear_algebra/vector_expressions.hpp index a0d0ce28901..b2c6a50f5a7 100644 --- a/Common/include/linear_algebra/vector_expressions.hpp +++ b/Common/include/linear_algebra/vector_expressions.hpp @@ -39,6 +39,12 @@ namespace VecExpr { /// \addtogroup VecExpr /// @{ +#ifdef __CUDACC__ +#define SU2_CUDA_HOST_DEVICE __host__ __device__ +#else +#define SU2_CUDA_HOST_DEVICE +#endif + /*! * \brief Base vector expression class. * \ingroup BLAS @@ -59,7 +65,7 @@ class CVecExpr { /*! * \brief Cast the expression to Derived, usually to allow evaluation via operator[]. */ - FORCEINLINE const Derived& derived() const { return static_cast(*this); } + SU2_CUDA_HOST_DEVICE FORCEINLINE const Derived& derived() const { return static_cast(*this); } // Allowed from C++14, allows nested expression propagation without // manually calling derived() on the expression being evaluated. @@ -76,8 +82,8 @@ class Bcast : public CVecExpr, Scalar> { public: static constexpr bool StoreAsRef = false; - FORCEINLINE Bcast(const Scalar& x_) : x(x_) {} - FORCEINLINE const Scalar& operator[](size_t) const { return x; } + SU2_CUDA_HOST_DEVICE FORCEINLINE Bcast(const Scalar& x_) : x(x_) {} + SU2_CUDA_HOST_DEVICE FORCEINLINE const Scalar& operator[](size_t) const { return x; } }; /*! @@ -120,19 +126,19 @@ namespace math = ::std; /*--- Macro to create expression classes (EXPR) and overloads (FUN) for unary * functions, based on their coefficient-wise implementation (IMPL). ---*/ -#define MAKE_UNARY_FUN(FUN, EXPR, IMPL) \ - /*!--- Expression class. ---*/ \ - template \ - class EXPR : public CVecExpr, Scalar> { \ - store_t u; \ - \ - public: \ - static constexpr bool StoreAsRef = false; \ - FORCEINLINE EXPR(const U& u_) : u(u_) {} \ - FORCEINLINE auto operator[](size_t i) const RETURNS(IMPL(u[i])) \ - }; \ - /*!--- Function overload, returns an expression object. ---*/ \ - template \ +#define MAKE_UNARY_FUN(FUN, EXPR, IMPL) \ + /*!--- Expression class. ---*/ \ + template \ + class EXPR : public CVecExpr, Scalar> { \ + store_t u; \ + \ + public: \ + static constexpr bool StoreAsRef = false; \ + FORCEINLINE EXPR(const U& u_) : u(u_) {} \ + SU2_CUDA_HOST_DEVICE FORCEINLINE auto operator[](size_t i) const RETURNS(IMPL(u[i])) \ + }; \ + /*!--- Function overload, returns an expression object. ---*/ \ + template \ FORCEINLINE auto FUN(const CVecExpr& u) RETURNS(EXPR(u.derived())) #define sign_impl(x) Scalar(1 - 2 * (x < 0)) @@ -158,7 +164,7 @@ MAKE_UNARY_FUN(sign, sign_, sign_impl) public: \ static constexpr bool StoreAsRef = false; \ FORCEINLINE EXPR(const U& u_, const V& v_) : u(u_), v(v_) {} \ - FORCEINLINE auto operator[](size_t i) const RETURNS(IMPL(u[i], v[i])) \ + SU2_CUDA_HOST_DEVICE FORCEINLINE auto operator[](size_t i) const RETURNS(IMPL(u[i], v[i])) \ }; \ /*!--- Vector with vector function overload. ---*/ \ template \ @@ -241,4 +247,5 @@ MAKE_BINARY_FUN(operator>, gt_, gt_impl) #undef MAKE_BINARY_FUN /// @} +#undef SU2_CUDA_HOST_DEVICE } // namespace VecExpr diff --git a/Common/include/option_structure.hpp b/Common/include/option_structure.hpp index 4c521c29239..65ba6dfa1ab 100644 --- a/Common/include/option_structure.hpp +++ b/Common/include/option_structure.hpp @@ -2517,6 +2517,7 @@ enum ENUM_LINEAR_SOLVER_PREC { PASTIX_ILU=10, /*!< \brief PaStiX ILU(k) preconditioner. */ PASTIX_LU_P, /*!< \brief PaStiX LU as preconditioner. */ PASTIX_LDLT_P, /*!< \brief PaStiX LDLT as preconditioner. */ + NO_PRECONDITIONER=20, /*!< \brief No preconditioner. */ }; static const MapType Linear_Solver_Prec_Map = { MakePair("JACOBI", JACOBI) @@ -2526,6 +2527,7 @@ static const MapType Linear_Solver_Prec_Ma MakePair("PASTIX_ILU", PASTIX_ILU) MakePair("PASTIX_LU", PASTIX_LU_P) MakePair("PASTIX_LDLT", PASTIX_LDLT_P) + MakePair("NONE", NO_PRECONDITIONER) }; /*! diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp index 00ef51c2023..fafa093734e 100644 --- a/Common/src/linear_algebra/CSysMatrix.cpp +++ b/Common/src/linear_algebra/CSysMatrix.cpp @@ -54,6 +54,7 @@ CSysMatrix::CSysMatrix() : rank(SU2_MPI::GetRank()), size(SU2_MPI::G col_ind_ilu = nullptr; invM = nullptr; + d_invM = nullptr; #ifdef USE_MKL MatrixMatrixProductJitter = nullptr; @@ -67,6 +68,8 @@ template CSysMatrix::~CSysMatrix() { SU2_ZONE_SCOPED + ReleaseCudaSpMVResources(spmv_resources); + delete[] omp_partitions; MemoryAllocation::aligned_free(ILU_matrix); MemoryAllocation::aligned_free(matrix); @@ -76,6 +79,7 @@ CSysMatrix::~CSysMatrix() { GPUMemoryAllocation::gpu_free(d_matrix); GPUMemoryAllocation::gpu_free(d_row_ptr); GPUMemoryAllocation::gpu_free(d_col_ind); + GPUMemoryAllocation::gpu_free(d_invM); } #ifdef USE_MKL @@ -86,6 +90,10 @@ CSysMatrix::~CSysMatrix() { #endif } +#ifndef HAVE_CUDA +void ReleaseCudaSpMVResources(CudaSpMVResources*& resources) { resources = nullptr; } +#endif + template void CSysMatrix::Initialize(unsigned long npoint, unsigned long npointdomain, unsigned short nvar, unsigned short neqn, bool EdgeConnect, CGeometry* geometry, @@ -196,6 +204,10 @@ void CSysMatrix::Initialize(unsigned long npoint, unsigned long npoi if (diag_needed) allocAndInit(invM, nPointDomain * nVar * nEqn); + if (useCuda && diag_needed) { + d_invM = GPUMemoryAllocation::gpu_alloc(nPointDomain * nVar * nEqn * sizeof(ScalarType)); + } + /*--- Thread parallel initialization. ---*/ int num_threads = omp_get_max_threads(); @@ -672,6 +684,19 @@ void CSysMatrix::MatrixVectorProduct(const CSysVector& v template void CSysMatrix::BuildJacobiPreconditioner() { SU2_ZONE_SCOPED + + if (useCuda) { +#ifdef HAVE_CUDA + BuildJacobiPreconditionerGPU(); + return; +#else + SU2_MPI::Error( + "\nError in building Jacobi preconditioner\nENABLE_CUDA is set to YES\nPlease compile with CUDA options " + "enabled in Meson to access GPU Functions", + CURRENT_FUNCTION); +#endif + } + /*--- Build Jacobi preconditioner (M = D), compute and store the inverses of the diagonal blocks. ---*/ SU2_OMP_FOR_DYN(omp_heavy_size) for (unsigned long iPoint = 0; iPoint < nPointDomain; iPoint++) @@ -684,6 +709,19 @@ void CSysMatrix::ComputeJacobiPreconditioner(const CSysVector& prod, CGeometry* geometry, const CConfig* config) const { SU2_ZONE_SCOPED + + if (config->GetCUDA()) { +#ifdef HAVE_CUDA + ComputeJacobiPreconditionerGPU(vec, prod, geometry, config); + return; +#else + SU2_MPI::Error( + "\nError in applying Jacobi preconditioner\nENABLE_CUDA is set to YES\nPlease compile with CUDA options " + "enabled in Meson to access GPU Functions", + CURRENT_FUNCTION); +#endif + } + /*--- Apply Jacobi preconditioner, y = D^{-1} * x, the inverse of the diagonal is already known. ---*/ SU2_OMP_BARRIER SU2_OMP_FOR_DYN(omp_heavy_size) diff --git a/Common/src/linear_algebra/CSysMatrixGPU.cu b/Common/src/linear_algebra/CSysMatrixGPU.cu index a3f1a77ca40..b8bb16ce5a9 100644 --- a/Common/src/linear_algebra/CSysMatrixGPU.cu +++ b/Common/src/linear_algebra/CSysMatrixGPU.cu @@ -55,6 +55,38 @@ inline cusparseIndexType_t GetCusparseIndexType() { } } +struct CudaSpMVResources { + cusparseHandle_t handle = nullptr; + cusparseConstSpMatDescr_t mat = nullptr; + size_t buffer_size = 0; + void* buffer = nullptr; +}; + +void ReleaseCudaSpMVResources(CudaSpMVResources*& resources) { + if (resources == nullptr) { + return; + } + + if (resources->buffer != nullptr) { + gpuErrChk(cudaFree(resources->buffer)); + resources->buffer = nullptr; + resources->buffer_size = 0; + } + + if (resources->mat != nullptr) { + cusparseErrChk(cusparseDestroySpMat(resources->mat)); + resources->mat = nullptr; + } + + if (resources->handle != nullptr) { + cusparseErrChk(cusparseDestroy(resources->handle)); + resources->handle = nullptr; + } + + delete resources; + resources = nullptr; +} + template constexpr cudaDataType GetCudaDataType() { if constexpr (std::is_same::value) { @@ -83,8 +115,6 @@ void CSysMatrix::GPUMatrixVectorProduct(const CSysVector ScalarType* d_vec = vec.GetDevicePointer(); ScalarType* d_prod = prod.GetDevicePointer(); - vec.HtDTransfer(); - const auto indexType = GetCusparseIndexType(); const auto valueType = GetCudaDataType(); @@ -100,42 +130,56 @@ void CSysMatrix::GPUMatrixVectorProduct(const CSysVector const ScalarType alpha = 1.0; const ScalarType beta = 0.0; - cusparseHandle_t handle = nullptr; - cusparseConstSpMatDescr_t matA = nullptr; - cusparseDnVecDescr_t vecX = nullptr; - cusparseDnVecDescr_t vecY = nullptr; + if (spmv_resources == nullptr) { + spmv_resources = new CudaSpMVResources; - cusparseErrChk(cusparseCreate(&handle)); + cusparseErrChk(cusparseCreate(&spmv_resources->handle)); - cusparseErrChk(cusparseCreateConstBsr(&matA, brows, bcols, bnnz, blockSize, blockSize, d_row_ptr, d_col_ind, d_matrix, - indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, valueType, CUSPARSE_ORDER_ROW)); + cusparseErrChk(cusparseCreateConstBsr(&spmv_resources->mat, brows, bcols, bnnz, blockSize, blockSize, d_row_ptr, + d_col_ind, d_matrix, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, + valueType, CUSPARSE_ORDER_ROW)); + } + + cusparseDnVecDescr_t vecX = nullptr; + cusparseDnVecDescr_t vecY = nullptr; cusparseErrChk(cusparseCreateDnVec(&vecX, xSize, d_vec, valueType)); cusparseErrChk(cusparseCreateDnVec(&vecY, ySize, d_prod, valueType)); - size_t bufferSize = 0; - void* dBuffer = nullptr; + size_t required_buffer_size = 0; - cusparseErrChk(cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY, - valueType, CUSPARSE_SPMV_BSR_ALG1, &bufferSize)); + cusparseErrChk(cusparseSpMV_bufferSize(spmv_resources->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, + spmv_resources->mat, vecX, &beta, vecY, valueType, CUSPARSE_SPMV_BSR_ALG1, + &required_buffer_size)); - if (bufferSize > 0) { - gpuErrChk(cudaMalloc(&dBuffer, bufferSize)); - } + if (required_buffer_size > spmv_resources->buffer_size) { + if (spmv_resources->buffer != nullptr) { + gpuErrChk(cudaFree(spmv_resources->buffer)); + } - cusparseErrChk(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY, valueType, - CUSPARSE_SPMV_BSR_ALG1, dBuffer)); + if (required_buffer_size > 0) { + gpuErrChk(cudaMalloc(&spmv_resources->buffer, required_buffer_size)); + } else { + spmv_resources->buffer = nullptr; + } - if (dBuffer != nullptr) { - gpuErrChk(cudaFree(dBuffer)); + spmv_resources->buffer_size = required_buffer_size; } + cusparseErrChk(cusparseSpMV(spmv_resources->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmv_resources->mat, + vecX, &beta, vecY, valueType, CUSPARSE_SPMV_BSR_ALG1, spmv_resources->buffer)); + cusparseErrChk(cusparseDestroyDnVec(vecY)); cusparseErrChk(cusparseDestroyDnVec(vecX)); - cusparseErrChk(cusparseDestroySpMat(matA)); - cusparseErrChk(cusparseDestroy(handle)); - - prod.DtHTransfer(); } - -template class CSysMatrix; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits. +template void CSysMatrix::HtDTransfer(bool trigger) const; +template void CSysMatrix::GPUMatrixVectorProduct(const CSysVector& vec, + CSysVector& prod, CGeometry* geometry, + const CConfig* config) const; + +#if defined(USE_MIXED_PRECISION) && !defined(USE_SINGLE_PRECISION) +template void CSysMatrix::HtDTransfer(bool trigger) const; +template void CSysMatrix::GPUMatrixVectorProduct(const CSysVector& vec, + CSysVector& prod, CGeometry* geometry, + const CConfig* config) const; +#endif diff --git a/Common/src/linear_algebra/CSysPreconditionerGPU.cu b/Common/src/linear_algebra/CSysPreconditionerGPU.cu new file mode 100644 index 00000000000..adee93264eb --- /dev/null +++ b/Common/src/linear_algebra/CSysPreconditionerGPU.cu @@ -0,0 +1,109 @@ +/*! + * \file CSysPreconditionerGPU.cu + * \brief CUDA/GPU skeleton implementations for matrix-based preconditioners. + * \author Jesse Li + * \version 8.5.0 "Harrier" + * + * SU2 Project Website: https://su2code.github.io + * + * The SU2 Project is maintained by the SU2 Foundation + * (http://su2foundation.org) + * + * Copyright 2012-2026, SU2 Contributors (cf. AUTHORS.md) + * + * SU2 is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * SU2 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with SU2. If not, see . + */ + +#include "../../include/linear_algebra/CSysMatrix.inl" +#include "../../include/linear_algebra/GPUComms.cuh" + +namespace { + +template +__global__ void ApplyJacobiPreconditionerKernel(const ScalarType* invM, const ScalarType* vec, ScalarType* prod, + unsigned long nPointDomain, unsigned long nVar) { + const auto iPoint = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (iPoint >= nPointDomain) return; + + const auto block = &invM[iPoint * nVar * nVar]; + const auto rhs = &vec[iPoint * nVar]; + auto out = &prod[iPoint * nVar]; + + for (auto iVar = 0ul; iVar < nVar; ++iVar) { + ScalarType sum = ScalarType(0); + for (auto jVar = 0ul; jVar < nVar; ++jVar) { + sum += block[iVar * nVar + jVar] * rhs[jVar]; + } + out[iVar] = sum; + } +} + +} // namespace + +template +void CSysMatrix::BuildJacobiPreconditionerGPU() { + SU2_ZONE_SCOPED + + if (nVar != nEqn) { + SU2_MPI::Error("CUDA Jacobi preconditioner requires square blocks.", CURRENT_FUNCTION); + } + + if (invM == nullptr) { + SU2_MPI::Error("CUDA Jacobi preconditioner was requested without host inverse block storage.", CURRENT_FUNCTION); + } + + if (d_invM == nullptr) { + d_invM = GPUMemoryAllocation::gpu_alloc(nPointDomain * nVar * nEqn * sizeof(ScalarType)); + } + + for (auto iPoint = 0ul; iPoint < nPointDomain; ++iPoint) { + InverseDiagonalBlock(iPoint, &(invM[iPoint * nVar * nVar])); + } + + gpuErrChk(cudaMemcpy(d_invM, invM, nPointDomain * nVar * nVar * sizeof(ScalarType), cudaMemcpyHostToDevice)); +} + +template +void CSysMatrix::ComputeJacobiPreconditionerGPU(const CSysVector& vec, + CSysVector& prod, CGeometry* geometry, + const CConfig* config) const { + (void)geometry; + (void)config; + + SU2_ZONE_SCOPED + + if (d_invM == nullptr) { + SU2_MPI::Error("CUDA Jacobi preconditioner used before BuildJacobiPreconditionerGPU.", CURRENT_FUNCTION); + } + + constexpr unsigned threadsPerBlock = 128; + const auto blocks = static_cast((nPointDomain + threadsPerBlock - 1) / threadsPerBlock); + ApplyJacobiPreconditionerKernel<<>>(d_invM, vec.GetDevicePointer(), prod.GetDevicePointer(), + nPointDomain, nVar); + gpuErrChk(cudaPeekAtLastError()); +} + +template void CSysMatrix::BuildJacobiPreconditionerGPU(); +template void CSysMatrix::ComputeJacobiPreconditionerGPU(const CSysVector& vec, + CSysVector& prod, + CGeometry* geometry, + const CConfig* config) const; + +#if defined(USE_MIXED_PRECISION) && !defined(USE_SINGLE_PRECISION) +template void CSysMatrix::BuildJacobiPreconditionerGPU(); +template void CSysMatrix::ComputeJacobiPreconditionerGPU(const CSysVector& vec, + CSysVector& prod, + CGeometry* geometry, + const CConfig* config) const; +#endif diff --git a/Common/src/linear_algebra/CSysSolve.cpp b/Common/src/linear_algebra/CSysSolve.cpp index cd100f311ab..1045ae68bbc 100644 --- a/Common/src/linear_algebra/CSysSolve.cpp +++ b/Common/src/linear_algebra/CSysSolve.cpp @@ -131,8 +131,35 @@ void LinearCombination(bool parallel, Ts&&... args) { } } +template +class CHostFGMRESOps { + public: + static constexpr bool UseSolverModGramSchmidt = true; + + bool NestedParallel() const { return !omp_in_parallel() && omp_get_max_threads() > 1; } + + void PrepareInput(const CSysVector&, CSysVector&, bool) const {} + + void FinalizeOutput(CSysVector&) const {} + + ScalarType Norm(const CSysVector& v) const { return v.norm(); } + + void AssignNegative(CSysVector& dst, const CSysVector& src) const { dst = -src; } + + void Subtract(CSysVector& dst, const CSysVector& src) const { dst -= src; } + + void Divide(CSysVector& dst, ScalarType val) const { dst /= val; } + + void UpdateSolution(unsigned long n, const std::vector>& basis, const su2vector& y, + CSysVector& x, bool nestedParallel) const { + LinearCombination(nestedParallel, n, basis, y, x, true); + } +}; + } // namespace +#include "../../include/linear_algebra/CSysSolveFGMRES.inl" + template CSysSolve::CSysSolve(LINEAR_SOLVER_MODE linear_solver_mode) : eps(linSolEpsilon()), @@ -421,200 +448,19 @@ unsigned long CSysSolve::FGMRES_LinSolver(const CSysVector 1; - - /*--- Check the subspace size ---*/ - - if (m < 1) { - SU2_MPI::Error("Number of linear solver iterations must be greater than 0.", CURRENT_FUNCTION); - } - - if (m > 1000) { - SU2_MPI::Error("FGMRES subspace is too large (>1000).", CURRENT_FUNCTION); - } - - /*--- Allocate if not allocated yet ---*/ - - if (V.size() <= m || (flexible && Z.size() <= m)) { - BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS { - V.resize(m + 1); - for (auto& w : V) w.Initialize(x.GetNBlk(), x.GetNBlkDomain(), x.GetNVar(), nullptr); - if (flexible) { - Z.resize(m + 1); - for (auto& z : Z) z.Initialize(x.GetNBlk(), x.GetNBlkDomain(), x.GetNVar(), nullptr); - } - } - END_SU2_OMP_SAFE_GLOBAL_ACCESS - } - - /*--- Define various arrays. In parallel, each thread of each rank has and works - on its own thread, since calculations on these arrays are based on dot products - (reduced across all threads and ranks) all threads do the same computations. ---*/ - - su2vector g(m + 1), sn(m + 1), cs(m + 1), y(m); - g = ScalarType(0); - sn = ScalarType(0); - cs = ScalarType(0); - y = ScalarType(0); - su2matrix H(m + 1, m); - H = ScalarType(0); - - /*--- Calculate the norm of the rhs vector. ---*/ - - ScalarType norm0 = b.norm(); - - /*--- Calculate the initial residual (actually the negative residual) and compute its norm. ---*/ - - if (!xIsZero) { - mat_vec(x, V[0]); - V[0] -= b; - } else { - V[0] = -b; - } - - ScalarType beta = xIsZero ? norm0 : V[0].norm(); - - /*--- FGMRES shares V and Z with FGCRODR, reset deflation because we're breaking - * relations between subspaces that FGCRODR relies on for recycling. ---*/ - - SU2_OMP_MASTER - ResetDeflation(); - END_SU2_OMP_MASTER - - /*--- Set the norm to the initial initial residual value ---*/ - - if (tol_type == LinearToleranceType::RELATIVE) norm0 = beta; - - if (beta < tol * norm0 || beta < eps) { - /*--- System is already solved ---*/ - - if (masterRank) { - SU2_OMP_MASTER - cout << "CSysSolve::FGMRES(): system solved by initial guess." << endl; - END_SU2_OMP_MASTER - } - residual = beta; - return 0; - } - - /*--- Normalize residual to get w_{0} (the negative sign is because w[0] - holds the negative residual, as mentioned above). ---*/ - - V[0] /= -beta; - - /*--- Initialize the RHS of the reduced system ---*/ - - g[0] = beta; - - /*--- Output header information including initial residual ---*/ - - unsigned long i = 0; - if (monitoring && masterRank) { - SU2_OMP_MASTER { - WriteHeader("FGMRES", tol, beta); - WriteHistory(i, beta / norm0); - } - END_SU2_OMP_MASTER - } - - /*--- Loop over all search directions ---*/ - - for (i = 0; i < m; i++) { - /*--- Check if solution has converged ---*/ - - if (beta < tol * norm0) break; - - if (flexible) { - /*--- Precondition the CSysVector v[i] and store result in z[i] ---*/ - - precond(V[i], Z[i]); - - /*--- Add to Krylov subspace ---*/ - - mat_vec(Z[i], V[i + 1]); - } else { - mat_vec(V[i], V[i + 1]); - } - - /*--- Modified Gram-Schmidt orthogonalization ---*/ - - bool orthog_ok; - if (nestedParallel) { - /*--- "omp parallel if" does not work well here ---*/ - SU2_OMP_PARALLEL - orthog_ok = ModGramSchmidt(true, i, H, V); - END_SU2_OMP_PARALLEL - } else { - orthog_ok = ModGramSchmidt(false, i, H, V); - } - - if (!orthog_ok) { - if (masterRank) { - SU2_OMP_MASTER - cout << "WARNING: FGMRES orthogonalization failed, linear solver diverged." << endl; - END_SU2_OMP_MASTER - } - break; - } - - /*--- Apply old Givens rotations to new column of the Hessenberg matrix then generate the - new Givens rotation matrix and apply it to the last two elements of H[:][i] and g ---*/ - - for (unsigned long k = 0; k < i; k++) ApplyGivens(sn[k], cs[k], H(k, i), H(k + 1, i)); - GenerateGivens(H(i, i), H(i + 1, i), sn[i], cs[i]); - ApplyGivens(sn[i], cs[i], g[i], g[i + 1]); - - /*--- Set L2 norm of residual and check if solution has converged ---*/ - - beta = fabs(g[i + 1]); - - /*--- Output the relative residual if necessary ---*/ - - if (monitoring && masterRank && ((i + 1) % monitorFreq == 0)) { - SU2_OMP_MASTER - WriteHistory(i + 1, beta / norm0); - END_SU2_OMP_MASTER - } - } - - /*--- Solve the least-squares system and update solution ---*/ - - SolveReduced(i, H, g, y); - - const auto& basis = flexible ? Z : V; - - LinearCombination(nestedParallel, i, basis, y, x, true); - - /*--- Recalculate final (neg.) residual (this should be optional) ---*/ - - if (monitoring && config->GetComm_Level() == COMM_FULL) { - if (masterRank) { - SU2_OMP_MASTER - WriteFinalResidual("FGMRES", i, beta / norm0); - END_SU2_OMP_MASTER - } - - if (recomputeRes) { - mat_vec(x, V[0]); - V[0] -= b; - ScalarType res = V[0].norm(); - - if (fabs(res - beta) > tol * 10) { - if (masterRank) { - SU2_OMP_MASTER - WriteWarning(beta, res, tol); - END_SU2_OMP_MASTER - } - } - } + if (config->GetCUDA()) { +#ifdef HAVE_CUDA + return FGMRES_LinSolver_GPU(b, x, mat_vec, precond, tol, m, residual, monitoring, config); +#else + SU2_MPI::Error( + "\nError in launching FGMRES solver\nENABLE_CUDA is set to YES\nPlease compile with CUDA options enabled " + "in Meson to access GPU Functions", + CURRENT_FUNCTION); +#endif } - residual = beta / norm0; - return i; + return FGMRES_LinSolverImpl(b, x, mat_vec, precond, tol, m, residual, monitoring, config, + CHostFGMRESOps()); } template diff --git a/Common/src/linear_algebra/CSysSolveGPU.cu b/Common/src/linear_algebra/CSysSolveGPU.cu new file mode 100644 index 00000000000..d618cc223b5 --- /dev/null +++ b/Common/src/linear_algebra/CSysSolveGPU.cu @@ -0,0 +1,120 @@ +/*! + * \file CSysSolveGPU.cu + * \brief CUDA/GPU backend dispatch for Krylov solvers. + * \author Jesse Li + * \version 8.5.0 "Harrier" + * + * SU2 Project Website: https://su2code.github.io + * + * The SU2 Project is maintained by the SU2 Foundation + * (http://su2foundation.org) + * + * Copyright 2012-2026, SU2 Contributors (cf. AUTHORS.md) + * + * SU2 is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * SU2 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with SU2. If not, see . + */ + +#include "../../include/linear_algebra/CSysSolve.hpp" +#include "../../include/linear_algebra/CMatrixVectorProduct.hpp" +#include "../../include/linear_algebra/CPreconditioner.hpp" +#include "../../include/linear_algebra/GPUComms.cuh" + +void SU2_GPU_BeginSolverBLASContext(); +void SU2_GPU_EndSolverBLASContext(); + +namespace { + +class CGPUSolverBLASContextGuard { + public: + CGPUSolverBLASContextGuard() { SU2_GPU_BeginSolverBLASContext(); } + + ~CGPUSolverBLASContextGuard() { SU2_GPU_EndSolverBLASContext(); } + + CGPUSolverBLASContextGuard(const CGPUSolverBLASContextGuard&) = delete; + CGPUSolverBLASContextGuard& operator=(const CGPUSolverBLASContextGuard&) = delete; +}; + +template +class CDeviceFGMRESOps { + public: + static constexpr bool UseSolverModGramSchmidt = false; + + bool NestedParallel() const { return false; } + + void PrepareInput(const CSysVector& b, CSysVector& x, bool x_is_zero) const { + b.HtDTransfer(); + if (x_is_zero) { + x.device() = ScalarType(0); + } else { + x.HtDTransfer(); + } + } + + void FinalizeOutput(CSysVector& x) const { x.DtHTransfer(); } + + ScalarType Norm(const CSysVector& v) const { return v.GPUNorm(); } + + void AssignNegative(CSysVector& dst, const CSysVector& src) const { + dst.device() = -src.device(); + } + + void Subtract(CSysVector& dst, const CSysVector& src) const { + dst.device() -= src.device(); + } + + void Divide(CSysVector& dst, ScalarType val) const { dst.device() /= val; } + + ScalarType Dot(const CSysVector& lhs, const CSysVector& rhs) const { + return lhs.GPUDot(rhs); + } + + void AddScaled(CSysVector& dst, ScalarType alpha, const CSysVector& src) const { + dst.device() += alpha * src.device(); + } + + void UpdateSolution(unsigned long n, const std::vector>& basis, + const su2vector& y, CSysVector& x, bool) const { + for (unsigned long k = 0; k < n; k++) { + AddScaled(x, y[k], basis[k]); + } + } +}; + +} // namespace + +#include "../../include/linear_algebra/CSysSolveFGMRES.inl" + +template +unsigned long CSysSolve::FGMRES_LinSolver_GPU(const VectorType& b, VectorType& x, + const ProductType& mat_vec, const PrecondType& precond, + ScalarType tol, unsigned long m, ScalarType& residual, + bool monitoring, const CConfig* config) const { + CGPUSolverBLASContextGuard blas_context; + return FGMRES_LinSolverImpl(b, x, mat_vec, precond, tol, m, residual, monitoring, config, + CDeviceFGMRESOps()); +} + +/*--- Explicit instantiations for the GPU backend wrapper. + * Keep these aligned with the scalar types instantiated for CSysSolve. ---*/ +template unsigned long CSysSolve::FGMRES_LinSolver_GPU( + const CSysVector& b, CSysVector& x, + const CMatrixVectorProduct& mat_vec, const CPreconditioner& precond, + su2mixedfloat tol, unsigned long m, su2mixedfloat& residual, bool monitoring, const CConfig* config) const; + +#if defined(USE_MIXED_PRECISION) && !defined(USE_SINGLE_PRECISION) +template unsigned long CSysSolve::FGMRES_LinSolver_GPU( + const CSysVector& b, CSysVector& x, + const CMatrixVectorProduct& mat_vec, const CPreconditioner& precond, + passivedouble tol, unsigned long m, passivedouble& residual, bool monitoring, const CConfig* config) const; +#endif diff --git a/Common/src/linear_algebra/CSysVectorGPU.cu b/Common/src/linear_algebra/CSysVectorGPU.cu index 94ec17bb88f..dca72c0d76b 100644 --- a/Common/src/linear_algebra/CSysVectorGPU.cu +++ b/Common/src/linear_algebra/CSysVectorGPU.cu @@ -27,23 +27,129 @@ #include "../../include/linear_algebra/CSysVector.hpp" #include "../../include/linear_algebra/GPUComms.cuh" +#include +#include -template -void CSysVector::HtDTransfer(bool trigger) const -{ - if(trigger) gpuErrChk(cudaMemcpy((void*)(d_vec_val), (void*)&vec_val[0], (sizeof(ScalarType)*nElm), cudaMemcpyHostToDevice)); +namespace { + +thread_local cublasHandle_t active_solver_blas_handle = nullptr; +thread_local unsigned active_solver_blas_depth = 0; + +cublasHandle_t GetBlasHandle(const char* creation_error, bool& owns_handle) { + if (active_solver_blas_handle != nullptr) { + owns_handle = false; + return active_solver_blas_handle; + } + + cublasHandle_t handle = nullptr; + owns_handle = true; + if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { + SU2_MPI::Error(creation_error, CURRENT_FUNCTION); + } + return handle; +} + +void ReleaseBlasHandle(cublasHandle_t handle, bool owns_handle, const char* destruction_error) { + if (owns_handle && handle != nullptr && cublasDestroy(handle) != CUBLAS_STATUS_SUCCESS) { + SU2_MPI::Error(destruction_error, CURRENT_FUNCTION); + } +} + +} // namespace + +void SU2_GPU_BeginSolverBLASContext() { + if (active_solver_blas_depth == 0) { + cublasHandle_t handle = nullptr; + if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { + SU2_MPI::Error("cuBLAS handle creation failed for the GPU linear solver context.", CURRENT_FUNCTION); + } + active_solver_blas_handle = handle; + } + ++active_solver_blas_depth; +} + +void SU2_GPU_EndSolverBLASContext() { + if (active_solver_blas_depth == 0) { + SU2_MPI::Error("GPU linear solver BLAS context ended without a matching begin.", CURRENT_FUNCTION); + } + + --active_solver_blas_depth; + if (active_solver_blas_depth == 0) { + auto status = cublasDestroy(active_solver_blas_handle); + active_solver_blas_handle = nullptr; + if (status != CUBLAS_STATUS_SUCCESS) { + SU2_MPI::Error("cuBLAS handle destruction failed for the GPU linear solver context.", CURRENT_FUNCTION); + } + } } -template -void CSysVector::DtHTransfer(bool trigger) const -{ - if(trigger) gpuErrChk(cudaMemcpy((void*)(&vec_val[0]), (void*)d_vec_val, (sizeof(ScalarType)*nElm), cudaMemcpyDeviceToHost)); +template +void CSysVector::HtDTransfer(bool trigger) const { + if (trigger) + gpuErrChk(cudaMemcpy((void*)(d_vec_val), (void*)&vec_val[0], (sizeof(ScalarType) * nElm), cudaMemcpyHostToDevice)); } -template -void CSysVector::GPUSetVal(ScalarType val, bool trigger) const -{ - if(trigger) gpuErrChk(cudaMemset((void*)(d_vec_val), val, (sizeof(ScalarType)*nElm))); +template +void CSysVector::DtHTransfer(bool trigger) const { + if (trigger) + gpuErrChk(cudaMemcpy((void*)(&vec_val[0]), (void*)d_vec_val, (sizeof(ScalarType) * nElm), cudaMemcpyDeviceToHost)); } -template class CSysVector; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits. +template +ScalarType CSysVector::GPUDot(const CSysVector& other) const { + bool owns_handle = false; + cublasHandle_t handle = GetBlasHandle("cuBLAS handle creation failed in CSysVector::GPUDot.", owns_handle); + cublasStatus_t status = CUBLAS_STATUS_SUCCESS; + + ScalarType local_dot = ScalarType(0); + + if constexpr (std::is_same_v) { + status = cublasSdot(handle, static_cast(nElmDomain), GetDevicePointer(), 1, other.GetDevicePointer(), 1, + &local_dot); + } else if constexpr (std::is_same_v) { + status = cublasDdot(handle, static_cast(nElmDomain), GetDevicePointer(), 1, other.GetDevicePointer(), 1, + &local_dot); + } else { + ReleaseBlasHandle(handle, owns_handle, "cuBLAS handle destruction failed in CSysVector::GPUDot."); + SU2_MPI::Error("Unsupported ScalarType in CSysVector::GPUDot.", CURRENT_FUNCTION); + return ScalarType(0); + } + + if (status != CUBLAS_STATUS_SUCCESS) { + ReleaseBlasHandle(handle, owns_handle, "cuBLAS handle destruction failed in CSysVector::GPUDot."); + SU2_MPI::Error("cuBLAS dot failed in CSysVector::GPUDot.", CURRENT_FUNCTION); + return ScalarType(0); + } + + ReleaseBlasHandle(handle, owns_handle, "cuBLAS handle destruction failed in CSysVector::GPUDot."); + + ScalarType global_dot = ScalarType(0); + const auto mpi_type = (sizeof(ScalarType) < sizeof(double)) ? MPI_FLOAT : MPI_DOUBLE; + SelectMPIWrapper::W::Allreduce(&local_dot, &global_dot, 1, mpi_type, MPI_SUM, SU2_MPI::GetComm()); + + return global_dot; +} + +template +ScalarType CSysVector::GPUNorm() const { + return sqrt(GPUDot(*this)); +} + +template void CSysVector::HtDTransfer(bool trigger) const; +template void CSysVector::DtHTransfer(bool trigger) const; +template su2mixedfloat CSysVector::GPUDot(const CSysVector& other) const; +template su2mixedfloat CSysVector::GPUNorm() const; + +#if defined(USE_MIXED_PRECISION) && !defined(USE_SINGLE_PRECISION) +template void CSysVector::HtDTransfer(bool trigger) const; +template void CSysVector::DtHTransfer(bool trigger) const; +template passivedouble CSysVector::GPUDot(const CSysVector& other) const; +template passivedouble CSysVector::GPUNorm() const; +#endif + +#ifdef CODI_REVERSE_TYPE +template void CSysVector::HtDTransfer(bool trigger) const; +template void CSysVector::DtHTransfer(bool trigger) const; +template su2double CSysVector::GPUDot(const CSysVector& other) const; +template su2double CSysVector::GPUNorm() const; +#endif diff --git a/Common/src/linear_algebra/meson.build b/Common/src/linear_algebra/meson.build index 7b880b29c1e..29bb63cd680 100644 --- a/Common/src/linear_algebra/meson.build +++ b/Common/src/linear_algebra/meson.build @@ -6,5 +6,5 @@ common_src += files(['CSysSolve_b.cpp', 'blas_structure.cpp']) if get_option('enable-cuda') - common_src += files(['CSysMatrixGPU.cu', 'CSysVectorGPU.cu',]) + common_src += files(['CSysMatrixGPU.cu', 'CSysVectorGPU.cu', 'CSysSolveGPU.cu', 'CSysPreconditionerGPU.cu',]) endif diff --git a/Common/src/meson.build b/Common/src/meson.build index f385c4a32ed..710e764303f 100644 --- a/Common/src/meson.build +++ b/Common/src/meson.build @@ -6,6 +6,27 @@ common_src =files(['graph_coloring_structure.cpp', '../include/parallelization/mpi_structure.cpp', '../include/parallelization/omp_structure.cpp']) +common_cuda_cpp_args = [] +foreach arg : su2_cpp_args + if arg.startswith('-D') or arg.startswith('-U') + common_cuda_cpp_args += arg + endif +endforeach + +common_cuda_rev_args = common_cuda_cpp_args +foreach arg : codi_rev_args + if arg.startswith('-D') or arg.startswith('-U') + common_cuda_rev_args += arg + endif +endforeach + +common_cuda_for_args = common_cuda_cpp_args +foreach arg : codi_for_args + if arg.startswith('-D') or arg.startswith('-U') + common_cuda_for_args += arg + endif +endforeach + subdir('linear_algebra') subdir('toolboxes') subdir('geometry') @@ -24,7 +45,8 @@ if get_option('enable-normal') common_src, install : false, dependencies : su2_deps, - cpp_args: [default_warning_flags, su2_cpp_args]) + cpp_args: [default_warning_flags, su2_cpp_args], + cuda_args: common_cuda_cpp_args) common_dep = declare_dependency(link_with: common, include_directories : common_include) @@ -36,7 +58,8 @@ if get_option('enable-autodiff') common_src, install : false, dependencies : [su2_deps, codi_dep], - cpp_args: [default_warning_flags, su2_cpp_args, codi_rev_args]) + cpp_args: [default_warning_flags, su2_cpp_args, codi_rev_args], + cuda_args: common_cuda_rev_args) commonAD_dep = declare_dependency(link_with: commonAD, include_directories : common_include) @@ -49,7 +72,8 @@ if get_option('enable-directdiff') common_src, install : false, dependencies : [su2_deps, codi_dep], - cpp_args: [default_warning_flags, su2_cpp_args, codi_for_args]) + cpp_args: [default_warning_flags, su2_cpp_args, codi_for_args], + cuda_args: common_cuda_for_args) commonDD_dep = declare_dependency(link_with: commonDD, include_directories : common_include) diff --git a/meson.build b/meson.build index d43e2b031d8..43786c100e9 100644 --- a/meson.build +++ b/meson.build @@ -20,13 +20,18 @@ python = pymod.find_installation() if get_option('enable-cuda') add_languages('cuda') add_global_arguments('-arch=sm_86', language : 'cuda') - cuda_deps = [meson.get_compiler('cuda').find_library('cusparse', required : true)] + cuda_deps = [ + meson.get_compiler('cuda').find_library('cusparse', required : true), + meson.get_compiler('cuda').find_library('cublas', required : true), + ] else cuda_deps = [] endif su2_cpp_args = [] su2_deps = [declare_dependency(include_directories: 'externals/CLI11')] + cuda_deps +codi_rev_args = [] +codi_for_args = [] default_warning_flags = [] if build_machine.system() != 'windows'