Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions Common/include/linear_algebra/CPreconditioner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,18 @@ class CPreconditioner {
template <class ScalarType>
CPreconditioner<ScalarType>::~CPreconditioner() {}

/*!
* \class CIdentityPreconditioner
* \brief No-op preconditioner used when Krylov solvers run without preconditioning.
*/
template <class ScalarType>
class CIdentityPreconditioner final : public CPreconditioner<ScalarType> {
public:
inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override { v = u; }

inline bool IsIdentity() const override { return true; }
};

/*!
* \class CJacobiPreconditioner
* \brief Specialization of preconditioner that uses CSysMatrix class.
Expand Down Expand Up @@ -332,6 +344,9 @@ CPreconditioner<ScalarType>* CPreconditioner<ScalarType>::Create(ENUM_LINEAR_SOL
CPreconditioner<ScalarType>* prec = nullptr;

switch (kind) {
case NO_PRECONDITIONER:
prec = new CIdentityPreconditioner<ScalarType>();
break;
case JACOBI:
prec = new CJacobiPreconditioner<ScalarType>(jacobian, geometry, config);
break;
Expand Down
27 changes: 27 additions & 0 deletions Common/include/linear_algebra/CSysMatrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,16 @@ struct CSysMatrixComms {
MPI_QUANTITIES commType = MPI_QUANTITIES::SOLUTION_MATRIX);
};

/*!
* \brief Opaque storage for CUDA/cuSPARSE resources used by the GPU SpMV path.
*/
struct CudaSpMVResources;

/*!
* \brief Release cached CUDA/cuSPARSE resources used by the GPU SpMV path.
*/
void ReleaseCudaSpMVResources(CudaSpMVResources*& resources);

/*!
* \class CSysMatrix
* \ingroup SpLinSys
Expand Down Expand Up @@ -148,8 +158,10 @@ class CSysMatrix {
ScalarType* d_matrix; /*!< \brief Device Pointer to store the matrix values on the GPU. */
const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
ScalarType* d_invM = nullptr; /*!< \brief Device inverse diagonal blocks for the Jacobi preconditioner. */
bool useCuda = false; /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
Mainly used to conditionally free GPU memory in the class destructor. */
mutable CudaSpMVResources* spmv_resources = nullptr; /*!< \brief Cached cuSPARSE resources for GPU SpMV. */

ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */
unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
Expand Down Expand Up @@ -924,6 +936,13 @@ class CSysMatrix {
*/
void BuildJacobiPreconditioner();

/*!
* \brief Build the Jacobi preconditioner on the GPU/device side.
* \note This helper is intended as the implementation hook for GPU-resident Krylov solvers.
* The actual implementation belongs in CSysMatrixGPU.cu.
*/
void BuildJacobiPreconditionerGPU();

/*!
* \brief Multiply CSysVector by the preconditioner
* \param[in] vec - CSysVector to be multiplied by the preconditioner.
Expand All @@ -934,6 +953,14 @@ class CSysMatrix {
void ComputeJacobiPreconditioner(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod, CGeometry* geometry,
const CConfig* config) const;

/*!
* \brief Apply the Jacobi preconditioner on the GPU/device side.
* \note This helper is intended as the implementation hook for GPU-resident Krylov solvers.
* The actual implementation belongs in CSysMatrixGPU.cu.
*/
void ComputeJacobiPreconditionerGPU(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod,
CGeometry* geometry, const CConfig* config) const;

/*!
* \brief Build the ILU preconditioner.
*/
Expand Down
14 changes: 14 additions & 0 deletions Common/include/linear_algebra/CSysSolve.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,20 @@ class CSysSolve {
ScalarType& residual, bool monitoring, const CConfig* config, FgcrodrMode mode,
unsigned long custom_m) const;

template <class FGMRESOps>
unsigned long FGMRES_LinSolverImpl(const VectorType& b, VectorType& x, const ProductType& mat_vec,
const PrecondType& precond, ScalarType tol, unsigned long m, ScalarType& residual,
bool monitoring, const CConfig* config, const FGMRESOps& ops) const;

/*!
* \brief CUDA/GPU backend wrapper for the Flexible GMRES solver.
* \note The algorithmic implementation is shared with the host solver. This helper
* only provides GPU vector-operation dispatch and synchronization.
*/
unsigned long FGMRES_LinSolver_GPU(const VectorType& b, VectorType& x, const ProductType& mat_vec,
const PrecondType& precond, ScalarType tol, unsigned long m, ScalarType& residual,
bool monitoring, const CConfig* config) const;

/*!
* \brief Creates the inner solver for nested preconditioning if the settings allow it.
* \returns True if the inner solver can be used.
Expand Down
267 changes: 267 additions & 0 deletions Common/include/linear_algebra/CSysSolveFGMRES.inl
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
/*!
* \file CSysSolveFGMRES.inl
* \brief Shared implementation of the Flexible GMRES linear solver.
* \author Jesse Li
* \version 8.5.0 "Harrier"
*
* SU2 Project Website: https://su2code.github.io
*
* The SU2 Project is maintained by the SU2 Foundation
* (http://su2foundation.org)
*
* Copyright 2012-2026, SU2 Contributors (cf. AUTHORS.md)
*
* SU2 is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* SU2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with SU2. If not, see <http://www.gnu.org/licenses/>.
*/

namespace fgmres_impl {

template <class ScalarType, class FGMRESOps>
bool Orthogonalize(unsigned long i, su2matrix<ScalarType>& H, std::vector<CSysVector<ScalarType>>& V,
const FGMRESOps& ops) {
for (unsigned long k = 0; k <= i; k++) {
H(k, i) = ops.Dot(V[k], V[i + 1]);
ops.AddScaled(V[i + 1], -H(k, i), V[k]);
}

for (unsigned long k = 0; k <= i; k++) {
const ScalarType dh = ops.Dot(V[k], V[i + 1]);
H(k, i) += dh;
ops.AddScaled(V[i + 1], -dh, V[k]);
}

const ScalarType nrm = ops.Norm(V[i + 1]);
if (nrm <= ScalarType(0) || nrm != nrm) {
H(i + 1, i) = ScalarType(0);
return false;
}

H(i + 1, i) = nrm;
ops.Divide(V[i + 1], nrm);
return true;
}

} // namespace fgmres_impl

template <class ScalarType>
template <class FGMRESOps>
unsigned long CSysSolve<ScalarType>::FGMRES_LinSolverImpl(const VectorType& b, VectorType& x,
const ProductType& mat_vec, const PrecondType& precond,
ScalarType tol, unsigned long m, ScalarType& residual,
bool monitoring, const CConfig* config,
const FGMRESOps& ops) const {
SU2_ZONE_SCOPED

const bool masterRank = (SU2_MPI::GetRank() == MASTER_NODE);
const bool flexible = !precond.IsIdentity();
const bool nestedParallel = ops.NestedParallel();

/*--- Check the subspace size. ---*/

if (m < 1) {
SU2_MPI::Error("Number of linear solver iterations must be greater than 0.", CURRENT_FUNCTION);
}

if (m > 1000) {
SU2_MPI::Error("FGMRES subspace is too large (>1000).", CURRENT_FUNCTION);
}

/*--- Allocate if not allocated yet. ---*/

if (V.size() <= m || (flexible && Z.size() <= m)) {
BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS {
V.resize(m + 1);
for (auto& w : V) w.Initialize(x.GetNBlk(), x.GetNBlkDomain(), x.GetNVar(), nullptr);
if (flexible) {
Z.resize(m + 1);
for (auto& z : Z) z.Initialize(x.GetNBlk(), x.GetNBlkDomain(), x.GetNVar(), nullptr);
}
}
END_SU2_OMP_SAFE_GLOBAL_ACCESS
}

/*--- Define various arrays. In parallel, each thread of each rank has and works
on its own thread, since calculations on these arrays are based on dot products
(reduced across all threads and ranks) all threads do the same computations. ---*/

su2vector<ScalarType> g(m + 1), sn(m + 1), cs(m + 1), y(m);
g = ScalarType(0);
sn = ScalarType(0);
cs = ScalarType(0);
y = ScalarType(0);

su2matrix<ScalarType> H(m + 1, m);
H = ScalarType(0);

ops.PrepareInput(b, x, xIsZero);

/*--- Calculate the norm of the rhs vector. ---*/

ScalarType norm0 = ops.Norm(b);

/*--- Calculate the initial residual (actually the negative residual) and compute its norm. ---*/

if (!xIsZero) {
mat_vec(x, V[0]);
ops.Subtract(V[0], b);
} else {
ops.AssignNegative(V[0], b);
}

ScalarType beta = xIsZero ? norm0 : ops.Norm(V[0]);

/*--- FGMRES shares V and Z with FGCRODR, reset deflation because we're breaking
* relations between subspaces that FGCRODR relies on for recycling. ---*/

SU2_OMP_MASTER
ResetDeflation();
END_SU2_OMP_MASTER

/*--- Set the norm to the initial initial residual value. ---*/

if (tol_type == LinearToleranceType::RELATIVE) norm0 = beta;

if (beta < tol * norm0 || beta < eps) {
/*--- System is already solved. ---*/

if (masterRank) {
SU2_OMP_MASTER
cout << "CSysSolve::FGMRES(): system solved by initial guess." << endl;
END_SU2_OMP_MASTER
}
residual = beta;
ops.FinalizeOutput(x);
return 0;
}

/*--- Normalize residual to get w_{0} (the negative sign is because w[0]
holds the negative residual, as mentioned above). ---*/

ops.Divide(V[0], -beta);

/*--- Initialize the RHS of the reduced system. ---*/

g[0] = beta;

/*--- Output header information including initial residual. ---*/

unsigned long i = 0;
if (monitoring && masterRank) {
SU2_OMP_MASTER {
WriteHeader("FGMRES", tol, beta);
WriteHistory(i, beta / norm0);
}
END_SU2_OMP_MASTER
}

/*--- Loop over all search directions. ---*/

for (i = 0; i < m; i++) {
/*--- Check if solution has converged. ---*/

if (beta < tol * norm0) break;

if (flexible) {
/*--- Precondition the CSysVector v[i] and store result in z[i]. ---*/

precond(V[i], Z[i]);

/*--- Add to Krylov subspace. ---*/

mat_vec(Z[i], V[i + 1]);
} else {
mat_vec(V[i], V[i + 1]);
}

/*--- Modified Gram-Schmidt orthogonalization. ---*/

bool orthog_ok;
if constexpr (FGMRESOps::UseSolverModGramSchmidt) {
if (nestedParallel) {
/*--- "omp parallel if" does not work well here. ---*/
SU2_OMP_PARALLEL
orthog_ok = ModGramSchmidt(true, i, H, V);
END_SU2_OMP_PARALLEL
} else {
orthog_ok = ModGramSchmidt(false, i, H, V);
}
} else {
orthog_ok = fgmres_impl::Orthogonalize(i, H, V, ops);
}

if (!orthog_ok) {
if (masterRank) {
SU2_OMP_MASTER
cout << "WARNING: FGMRES orthogonalization failed, linear solver diverged." << endl;
END_SU2_OMP_MASTER
}
break;
}

/*--- Apply old Givens rotations to new column of the Hessenberg matrix then generate the
new Givens rotation matrix and apply it to the last two elements of H[:][i] and g. ---*/

for (unsigned long k = 0; k < i; k++) ApplyGivens(sn[k], cs[k], H(k, i), H(k + 1, i));
GenerateGivens(H(i, i), H(i + 1, i), sn[i], cs[i]);
ApplyGivens(sn[i], cs[i], g[i], g[i + 1]);

/*--- Set L2 norm of residual and check if solution has converged. ---*/

beta = fabs(g[i + 1]);

/*--- Output the relative residual if necessary. ---*/

if (monitoring && masterRank && ((i + 1) % monitorFreq == 0)) {
SU2_OMP_MASTER
WriteHistory(i + 1, beta / norm0);
END_SU2_OMP_MASTER
}
}

/*--- Solve the least-squares system and update solution. ---*/

SolveReduced(i, H, g, y);

const auto& basis = flexible ? Z : V;

ops.UpdateSolution(i, basis, y, x, nestedParallel);
ops.FinalizeOutput(x);

/*--- Recalculate final (neg.) residual (this should be optional). ---*/

if (monitoring && config->GetComm_Level() == COMM_FULL) {
if (masterRank) {
SU2_OMP_MASTER
WriteFinalResidual("FGMRES", i, beta / norm0);
END_SU2_OMP_MASTER
}

if (recomputeRes) {
mat_vec(x, V[0]);
ops.Subtract(V[0], b);
ScalarType res = ops.Norm(V[0]);

if (fabs(res - beta) > tol * 10) {
if (masterRank) {
SU2_OMP_MASTER
WriteWarning(beta, res, tol);
END_SU2_OMP_MASTER
}
}
}
}

residual = beta / norm0;
return i;
}
Loading
Loading