diff --git a/Common/include/linear_algebra/CPreconditioner.hpp b/Common/include/linear_algebra/CPreconditioner.hpp
index e4fc7cf159f..4eff5f947fb 100644
--- a/Common/include/linear_algebra/CPreconditioner.hpp
+++ b/Common/include/linear_algebra/CPreconditioner.hpp
@@ -77,6 +77,18 @@ class CPreconditioner {
 template <class ScalarType>
 CPreconditioner<ScalarType>::~CPreconditioner() {}
 
+/*!
+ * \class CIdentityPreconditioner
+ * \brief No-op preconditioner used when Krylov solvers run without preconditioning.
+ */
+template <class ScalarType>
+class CIdentityPreconditioner final : public CPreconditioner<ScalarType> {
+ public:
+  inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override { v = u; }
+
+  inline bool IsIdentity() const override { return true; }
+};
+
 /*!
  * \class CJacobiPreconditioner
  * \brief Specialization of preconditioner that uses CSysMatrix class.
@@ -332,6 +344,9 @@ CPreconditioner<ScalarType>* CPreconditioner<ScalarType>::Create(ENUM_LINEAR_SOL
   CPreconditioner<ScalarType>* prec = nullptr;
 
   switch (kind) {
+    case NO_PRECONDITIONER:
+      prec = new CIdentityPreconditioner<ScalarType>();
+      break;
     case JACOBI:
       prec = new CJacobiPreconditioner<ScalarType>(jacobian, geometry, config);
       break;
diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp
index ecb26959a06..6368670587e 100644
--- a/Common/include/linear_algebra/CSysMatrix.hpp
+++ b/Common/include/linear_algebra/CSysMatrix.hpp
@@ -108,6 +108,16 @@ struct CSysMatrixComms {
                        MPI_QUANTITIES commType = MPI_QUANTITIES::SOLUTION_MATRIX);
 };
 
+/*!
+ * \brief Opaque storage for CUDA/cuSPARSE resources used by the GPU SpMV path.
+ */
+struct CudaSpMVResources;
+
+/*!
+ * \brief Release cached CUDA/cuSPARSE resources used by the GPU SpMV path.
+ */
+void ReleaseCudaSpMVResources(CudaSpMVResources*& resources);
+
 /*!
  * \class CSysMatrix
  * \ingroup SpLinSys
@@ -148,8 +158,10 @@ class CSysMatrix {
   ScalarType* d_matrix;           /*!< \brief Device Pointer to store the matrix values on the GPU. */
   const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
   const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
+  ScalarType* d_invM = nullptr;   /*!< \brief Device inverse diagonal blocks for the Jacobi preconditioner. */
   bool useCuda = false;           /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
                                      Mainly used to conditionally free GPU memory in the class destructor. */
+  mutable CudaSpMVResources* spmv_resources = nullptr; /*!< \brief Cached cuSPARSE resources for GPU SpMV. */
 
   ScalarType* ILU_matrix;           /*!< \brief Entries of the ILU sparse matrix. */
   unsigned long nnz_ilu;            /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
@@ -924,6 +936,13 @@ class CSysMatrix {
    */
   void BuildJacobiPreconditioner();
 
+  /*!
+   * \brief Build the Jacobi preconditioner on the GPU/device side.
+   * \note This helper is intended as the implementation hook for GPU-resident Krylov solvers.
+   *       The actual implementation belongs in CSysMatrixGPU.cu.
+   */
+  void BuildJacobiPreconditionerGPU();
+
   /*!
    * \brief Multiply CSysVector by the preconditioner
    * \param[in] vec - CSysVector to be multiplied by the preconditioner.
@@ -934,6 +953,14 @@ class CSysMatrix {
   void ComputeJacobiPreconditioner(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod, CGeometry* geometry,
                                    const CConfig* config) const;
 
+  /*!
+   * \brief Apply the Jacobi preconditioner on the GPU/device side.
+   * \note This helper is intended as the implementation hook for GPU-resident Krylov solvers.
+   *       The actual implementation belongs in CSysMatrixGPU.cu.
+   */
+  void ComputeJacobiPreconditionerGPU(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod,
+                                      CGeometry* geometry, const CConfig* config) const;
+
   /*!
    * \brief Build the ILU preconditioner.
    */
diff --git a/Common/include/linear_algebra/CSysSolve.hpp b/Common/include/linear_algebra/CSysSolve.hpp
index 2ea3cbf7df3..af76e93c07d 100644
--- a/Common/include/linear_algebra/CSysSolve.hpp
+++ b/Common/include/linear_algebra/CSysSolve.hpp
@@ -304,6 +304,20 @@ class CSysSolve {
                                       ScalarType& residual, bool monitoring, const CConfig* config, FgcrodrMode mode,
                                       unsigned long custom_m) const;
 
+  template <class FGMRESOps>
+  unsigned long FGMRES_LinSolverImpl(const VectorType& b, VectorType& x, const ProductType& mat_vec,
+                                     const PrecondType& precond, ScalarType tol, unsigned long m, ScalarType& residual,
+                                     bool monitoring, const CConfig* config, const FGMRESOps& ops) const;
+
+  /*!
+   * \brief CUDA/GPU backend wrapper for the Flexible GMRES solver.
+   * \note The algorithmic implementation is shared with the host solver. This helper
+   *       only provides GPU vector-operation dispatch and synchronization.
+   */
+  unsigned long FGMRES_LinSolver_GPU(const VectorType& b, VectorType& x, const ProductType& mat_vec,
+                                     const PrecondType& precond, ScalarType tol, unsigned long m, ScalarType& residual,
+                                     bool monitoring, const CConfig* config) const;
+
   /*!
    * \brief Creates the inner solver for nested preconditioning if the settings allow it.
    * \returns True if the inner solver can be used.
diff --git a/Common/include/linear_algebra/CSysSolveFGMRES.inl b/Common/include/linear_algebra/CSysSolveFGMRES.inl
new file mode 100644
index 00000000000..cef424a729b
--- /dev/null
+++ b/Common/include/linear_algebra/CSysSolveFGMRES.inl
@@ -0,0 +1,267 @@
+/*!
+ * \file CSysSolveFGMRES.inl
+ * \brief Shared implementation of the Flexible GMRES linear solver.
+ * \author Jesse Li
+ * \version 8.5.0 "Harrier"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2026, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+namespace fgmres_impl {
+
+template <class ScalarType, class FGMRESOps>
+bool Orthogonalize(unsigned long i, su2matrix<ScalarType>& H, std::vector<CSysVector<ScalarType>>& V,
+                   const FGMRESOps& ops) {
+  for (unsigned long k = 0; k <= i; k++) {
+    H(k, i) = ops.Dot(V[k], V[i + 1]);
+    ops.AddScaled(V[i + 1], -H(k, i), V[k]);
+  }
+
+  for (unsigned long k = 0; k <= i; k++) {
+    const ScalarType dh = ops.Dot(V[k], V[i + 1]);
+    H(k, i) += dh;
+    ops.AddScaled(V[i + 1], -dh, V[k]);
+  }
+
+  const ScalarType nrm = ops.Norm(V[i + 1]);
+  if (nrm <= ScalarType(0) || nrm != nrm) {
+    H(i + 1, i) = ScalarType(0);
+    return false;
+  }
+
+  H(i + 1, i) = nrm;
+  ops.Divide(V[i + 1], nrm);
+  return true;
+}
+
+}  // namespace fgmres_impl
+
+template <class ScalarType>
+template <class FGMRESOps>
+unsigned long CSysSolve<ScalarType>::FGMRES_LinSolverImpl(const VectorType& b, VectorType& x,
+                                                          const ProductType& mat_vec, const PrecondType& precond,
+                                                          ScalarType tol, unsigned long m, ScalarType& residual,
+                                                          bool monitoring, const CConfig* config,
+                                                          const FGMRESOps& ops) const {
+  SU2_ZONE_SCOPED
+
+  const bool masterRank = (SU2_MPI::GetRank() == MASTER_NODE);
+  const bool flexible = !precond.IsIdentity();
+  const bool nestedParallel = ops.NestedParallel();
+
+  /*--- Check the subspace size. ---*/
+
+  if (m < 1) {
+    SU2_MPI::Error("Number of linear solver iterations must be greater than 0.", CURRENT_FUNCTION);
+  }
+
+  if (m > 1000) {
+    SU2_MPI::Error("FGMRES subspace is too large (>1000).", CURRENT_FUNCTION);
+  }
+
+  /*--- Allocate if not allocated yet. ---*/
+
+  if (V.size() <= m || (flexible && Z.size() <= m)) {
+    BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS {
+      V.resize(m + 1);
+      for (auto& w : V) w.Initialize(x.GetNBlk(), x.GetNBlkDomain(), x.GetNVar(), nullptr);
+      if (flexible) {
+        Z.resize(m + 1);
+        for (auto& z : Z) z.Initialize(x.GetNBlk(), x.GetNBlkDomain(), x.GetNVar(), nullptr);
+      }
+    }
+    END_SU2_OMP_SAFE_GLOBAL_ACCESS
+  }
+
+  /*--- Define various arrays. In parallel, each thread of each rank has and works
+   on its own thread, since calculations on these arrays are based on dot products
+   (reduced across all threads and ranks) all threads do the same computations. ---*/
+
+  su2vector<ScalarType> g(m + 1), sn(m + 1), cs(m + 1), y(m);
+  g = ScalarType(0);
+  sn = ScalarType(0);
+  cs = ScalarType(0);
+  y = ScalarType(0);
+
+  su2matrix<ScalarType> H(m + 1, m);
+  H = ScalarType(0);
+
+  ops.PrepareInput(b, x, xIsZero);
+
+  /*--- Calculate the norm of the rhs vector. ---*/
+
+  ScalarType norm0 = ops.Norm(b);
+
+  /*--- Calculate the initial residual (actually the negative residual) and compute its norm. ---*/
+
+  if (!xIsZero) {
+    mat_vec(x, V[0]);
+    ops.Subtract(V[0], b);
+  } else {
+    ops.AssignNegative(V[0], b);
+  }
+
+  ScalarType beta = xIsZero ? norm0 : ops.Norm(V[0]);
+
+  /*--- FGMRES shares V and Z with FGCRODR, reset deflation because we're breaking
+   * relations between subspaces that FGCRODR relies on for recycling. ---*/
+
+  SU2_OMP_MASTER
+  ResetDeflation();
+  END_SU2_OMP_MASTER
+
+  /*--- Set the norm to the initial initial residual value. ---*/
+
+  if (tol_type == LinearToleranceType::RELATIVE) norm0 = beta;
+
+  if (beta < tol * norm0 || beta < eps) {
+    /*--- System is already solved. ---*/
+
+    if (masterRank) {
+      SU2_OMP_MASTER
+      cout << "CSysSolve::FGMRES(): system solved by initial guess." << endl;
+      END_SU2_OMP_MASTER
+    }
+    residual = beta;
+    ops.FinalizeOutput(x);
+    return 0;
+  }
+
+  /*--- Normalize residual to get w_{0} (the negative sign is because w[0]
+        holds the negative residual, as mentioned above). ---*/
+
+  ops.Divide(V[0], -beta);
+
+  /*--- Initialize the RHS of the reduced system. ---*/
+
+  g[0] = beta;
+
+  /*--- Output header information including initial residual. ---*/
+
+  unsigned long i = 0;
+  if (monitoring && masterRank) {
+    SU2_OMP_MASTER {
+      WriteHeader("FGMRES", tol, beta);
+      WriteHistory(i, beta / norm0);
+    }
+    END_SU2_OMP_MASTER
+  }
+
+  /*--- Loop over all search directions. ---*/
+
+  for (i = 0; i < m; i++) {
+    /*--- Check if solution has converged. ---*/
+
+    if (beta < tol * norm0) break;
+
+    if (flexible) {
+      /*--- Precondition the CSysVector v[i] and store result in z[i]. ---*/
+
+      precond(V[i], Z[i]);
+
+      /*--- Add to Krylov subspace. ---*/
+
+      mat_vec(Z[i], V[i + 1]);
+    } else {
+      mat_vec(V[i], V[i + 1]);
+    }
+
+    /*--- Modified Gram-Schmidt orthogonalization. ---*/
+
+    bool orthog_ok;
+    if constexpr (FGMRESOps::UseSolverModGramSchmidt) {
+      if (nestedParallel) {
+        /*--- "omp parallel if" does not work well here. ---*/
+        SU2_OMP_PARALLEL
+        orthog_ok = ModGramSchmidt(true, i, H, V);
+        END_SU2_OMP_PARALLEL
+      } else {
+        orthog_ok = ModGramSchmidt(false, i, H, V);
+      }
+    } else {
+      orthog_ok = fgmres_impl::Orthogonalize(i, H, V, ops);
+    }
+
+    if (!orthog_ok) {
+      if (masterRank) {
+        SU2_OMP_MASTER
+        cout << "WARNING: FGMRES orthogonalization failed, linear solver diverged." << endl;
+        END_SU2_OMP_MASTER
+      }
+      break;
+    }
+
+    /*--- Apply old Givens rotations to new column of the Hessenberg matrix then generate the
+     new Givens rotation matrix and apply it to the last two elements of H[:][i] and g. ---*/
+
+    for (unsigned long k = 0; k < i; k++) ApplyGivens(sn[k], cs[k], H(k, i), H(k + 1, i));
+    GenerateGivens(H(i, i), H(i + 1, i), sn[i], cs[i]);
+    ApplyGivens(sn[i], cs[i], g[i], g[i + 1]);
+
+    /*--- Set L2 norm of residual and check if solution has converged. ---*/
+
+    beta = fabs(g[i + 1]);
+
+    /*--- Output the relative residual if necessary. ---*/
+
+    if (monitoring && masterRank && ((i + 1) % monitorFreq == 0)) {
+      SU2_OMP_MASTER
+      WriteHistory(i + 1, beta / norm0);
+      END_SU2_OMP_MASTER
+    }
+  }
+
+  /*--- Solve the least-squares system and update solution. ---*/
+
+  SolveReduced(i, H, g, y);
+
+  const auto& basis = flexible ? Z : V;
+
+  ops.UpdateSolution(i, basis, y, x, nestedParallel);
+  ops.FinalizeOutput(x);
+
+  /*--- Recalculate final (neg.) residual (this should be optional). ---*/
+
+  if (monitoring && config->GetComm_Level() == COMM_FULL) {
+    if (masterRank) {
+      SU2_OMP_MASTER
+      WriteFinalResidual("FGMRES", i, beta / norm0);
+      END_SU2_OMP_MASTER
+    }
+
+    if (recomputeRes) {
+      mat_vec(x, V[0]);
+      ops.Subtract(V[0], b);
+      ScalarType res = ops.Norm(V[0]);
+
+      if (fabs(res - beta) > tol * 10) {
+        if (masterRank) {
+          SU2_OMP_MASTER
+          WriteWarning(beta, res, tol);
+          END_SU2_OMP_MASTER
+        }
+      }
+    }
+  }
+
+  residual = beta / norm0;
+  return i;
+}
diff --git a/Common/include/linear_algebra/CSysVector.hpp b/Common/include/linear_algebra/CSysVector.hpp
index 1498b549bbb..d67185156b9 100644
--- a/Common/include/linear_algebra/CSysVector.hpp
+++ b/Common/include/linear_algebra/CSysVector.hpp
@@ -37,6 +37,10 @@
 #include "vector_expressions.hpp"
 #include "../../include/CConfig.hpp"
 
+#ifdef __CUDACC__
+#include "GPUComms.cuh"
+#endif
+
 /*!
  * \brief OpenMP worksharing construct used in CSysVector for loops.
  * \note The loop will only run in parallel if methods are called from a
@@ -59,6 +63,116 @@
 #define END_CSYSVEC_PARFOR
 #endif
 
+namespace VecExpr {
+
+enum class DeviceAssignOp { Assign, Add, Subtract, Multiply, Divide };
+
+template <class Scalar>
+class CDeviceVectorView : public CVecExpr<CDeviceVectorView<Scalar>, Scalar> {
+ private:
+  Scalar* data = nullptr;
+  unsigned long size = 0;
+
+ public:
+  static constexpr bool StoreAsRef = false;
+
+  CDeviceVectorView(Scalar* data_, unsigned long size_) : data(data_), size(size_) {}
+
+#ifdef __CUDACC__
+  __device__ const Scalar& operator[](size_t i) const { return data[i]; }
+
+  template <class T>
+  CDeviceVectorView& operator=(const CVecExpr<T, Scalar>& expr);
+
+  CDeviceVectorView& operator=(Scalar val);
+
+#define MAKE_DEVICE_COMPOUND(OP) \
+  template <class T>             \
+  CDeviceVectorView& operator OP(const CVecExpr<T, Scalar>& expr);
+  MAKE_DEVICE_COMPOUND(+=)
+  MAKE_DEVICE_COMPOUND(-=)
+  MAKE_DEVICE_COMPOUND(*=)
+  MAKE_DEVICE_COMPOUND(/=)
+#undef MAKE_DEVICE_COMPOUND
+
+#define MAKE_DEVICE_SCALAR_COMPOUND(OP) CDeviceVectorView& operator OP(Scalar val);
+  MAKE_DEVICE_SCALAR_COMPOUND(+=)
+  MAKE_DEVICE_SCALAR_COMPOUND(-=)
+  MAKE_DEVICE_SCALAR_COMPOUND(*=)
+  MAKE_DEVICE_SCALAR_COMPOUND(/=)
+#undef MAKE_DEVICE_SCALAR_COMPOUND
+#endif
+};
+
+#ifdef __CUDACC__
+template <DeviceAssignOp Op, class Scalar, class T>
+__global__ void DeviceAssignKernel(Scalar* data, unsigned long size, T expr) {
+  const unsigned long i = static_cast<unsigned long>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (i >= size) return;
+
+  if constexpr (Op == DeviceAssignOp::Assign) {
+    data[i] = expr[i];
+  } else if constexpr (Op == DeviceAssignOp::Add) {
+    data[i] += expr[i];
+  } else if constexpr (Op == DeviceAssignOp::Subtract) {
+    data[i] -= expr[i];
+  } else if constexpr (Op == DeviceAssignOp::Multiply) {
+    data[i] *= expr[i];
+  } else {
+    data[i] /= expr[i];
+  }
+}
+
+template <DeviceAssignOp Op, class Scalar, class T>
+inline void AssignDeviceExpression(Scalar* data, unsigned long size, const CVecExpr<T, Scalar>& expr) {
+  if (size == 0) return;
+  constexpr unsigned block_size = 256;
+  const auto grid_size = static_cast<unsigned>((size + block_size - 1) / block_size);
+  DeviceAssignKernel<Op><<<grid_size, block_size>>>(data, size, expr.derived());
+  gpuErrChk(cudaPeekAtLastError());
+}
+
+template <class Scalar>
+template <class T>
+CDeviceVectorView<Scalar>& CDeviceVectorView<Scalar>::operator=(const CVecExpr<T, Scalar>& expr) {
+  AssignDeviceExpression<DeviceAssignOp::Assign>(data, size, expr);
+  return *this;
+}
+
+template <class Scalar>
+CDeviceVectorView<Scalar>& CDeviceVectorView<Scalar>::operator=(Scalar val) {
+  AssignDeviceExpression<DeviceAssignOp::Assign>(data, size, Bcast<Scalar>(val));
+  return *this;
+}
+
+#define MAKE_DEVICE_COMPOUND(OP, ASSIGN_OP)                                                            \
+  template <class Scalar>                                                                              \
+  template <class T>                                                                                   \
+  CDeviceVectorView<Scalar>& CDeviceVectorView<Scalar>::operator OP(const CVecExpr<T, Scalar>& expr) { \
+    AssignDeviceExpression<ASSIGN_OP>(data, size, expr);                                               \
+    return *this;                                                                                      \
+  }
+MAKE_DEVICE_COMPOUND(+=, DeviceAssignOp::Add)
+MAKE_DEVICE_COMPOUND(-=, DeviceAssignOp::Subtract)
+MAKE_DEVICE_COMPOUND(*=, DeviceAssignOp::Multiply)
+MAKE_DEVICE_COMPOUND(/=, DeviceAssignOp::Divide)
+#undef MAKE_DEVICE_COMPOUND
+
+#define MAKE_DEVICE_SCALAR_COMPOUND(OP, ASSIGN_OP)                                \
+  template <class Scalar>                                                         \
+  CDeviceVectorView<Scalar>& CDeviceVectorView<Scalar>::operator OP(Scalar val) { \
+    AssignDeviceExpression<ASSIGN_OP>(data, size, Bcast<Scalar>(val));            \
+    return *this;                                                                 \
+  }
+MAKE_DEVICE_SCALAR_COMPOUND(+=, DeviceAssignOp::Add)
+MAKE_DEVICE_SCALAR_COMPOUND(-=, DeviceAssignOp::Subtract)
+MAKE_DEVICE_SCALAR_COMPOUND(*=, DeviceAssignOp::Multiply)
+MAKE_DEVICE_SCALAR_COMPOUND(/=, DeviceAssignOp::Divide)
+#undef MAKE_DEVICE_SCALAR_COMPOUND
+#endif
+
+}  // namespace VecExpr
+
 /*!
  * \class CSysVector
  * \ingroup SpLinSys
@@ -235,16 +349,32 @@ class CSysVector : public VecExpr::CVecExpr<CSysVector<ScalarType>, ScalarType>
   void DtHTransfer(bool trigger = true) const;
 
   /*!
-   * \brief Sets all the elements of the GPU vector to a certain value
-   * \param[in] trigger - boolean value that decides whether to conduct the transfer or not. True by default.
+   * \brief Dot product between this vector and another vector on the device.
+   * \note Explicit GPU helper for solver-side reductions.
+   * \param[in] other - Input vector.
+   * \return Dot product result.
    */
-  void GPUSetVal(ScalarType val, bool trigger = true) const;
+  ScalarType GPUDot(const CSysVector& other) const;
+
+  /*!
+   * \brief L2 norm of this vector on the device.
+   * \note Explicit GPU helper for solver-side reductions.
+   * \return L2 norm result.
+   */
+  ScalarType GPUNorm() const;
 
   /*!
    * \brief return device pointer that points to the CSysVector values in GPU memory
    */
   inline ScalarType* GetDevicePointer() const { return d_vec_val; }
 
+  /*!
+   * \brief Return an expression-template view of the device values.
+   */
+  inline VecExpr::CDeviceVectorView<ScalarType> device() const {
+    return VecExpr::CDeviceVectorView<ScalarType>(d_vec_val, nElm);
+  }
+
   /*!
    * \brief return the number of local elements in the CSysVector
    */
diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh
index 13854381877..eb727477f21 100644
--- a/Common/include/linear_algebra/GPUComms.cuh
+++ b/Common/include/linear_algebra/GPUComms.cuh
@@ -25,6 +25,11 @@
 * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
 */
 
+#pragma once
+
+#ifndef SU2_COMMON_LINEAR_ALGEBRA_GPUCOMMS_CUH
+#define SU2_COMMON_LINEAR_ALGEBRA_GPUCOMMS_CUH
+
 #include<cuda_runtime.h>
 #include<iostream>
 
@@ -51,3 +56,5 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t
 }
 
 #define gpuErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+
+#endif  // SU2_COMMON_LINEAR_ALGEBRA_GPUCOMMS_CUH
diff --git a/Common/include/linear_algebra/vector_expressions.hpp b/Common/include/linear_algebra/vector_expressions.hpp
index a0d0ce28901..b2c6a50f5a7 100644
--- a/Common/include/linear_algebra/vector_expressions.hpp
+++ b/Common/include/linear_algebra/vector_expressions.hpp
@@ -39,6 +39,12 @@ namespace VecExpr {
 /// \addtogroup VecExpr
 /// @{
 
+#ifdef __CUDACC__
+#define SU2_CUDA_HOST_DEVICE __host__ __device__
+#else
+#define SU2_CUDA_HOST_DEVICE
+#endif
+
 /*!
  * \brief Base vector expression class.
  * \ingroup BLAS
@@ -59,7 +65,7 @@ class CVecExpr {
   /*!
    * \brief Cast the expression to Derived, usually to allow evaluation via operator[].
    */
-  FORCEINLINE const Derived& derived() const { return static_cast<const Derived&>(*this); }
+  SU2_CUDA_HOST_DEVICE FORCEINLINE const Derived& derived() const { return static_cast<const Derived&>(*this); }
 
   // Allowed from C++14, allows nested expression propagation without
   // manually calling derived() on the expression being evaluated.
@@ -76,8 +82,8 @@ class Bcast : public CVecExpr<Bcast<Scalar>, Scalar> {
 
  public:
   static constexpr bool StoreAsRef = false;
-  FORCEINLINE Bcast(const Scalar& x_) : x(x_) {}
-  FORCEINLINE const Scalar& operator[](size_t) const { return x; }
+  SU2_CUDA_HOST_DEVICE FORCEINLINE Bcast(const Scalar& x_) : x(x_) {}
+  SU2_CUDA_HOST_DEVICE FORCEINLINE const Scalar& operator[](size_t) const { return x; }
 };
 
 /*!
@@ -120,19 +126,19 @@ namespace math = ::std;
 /*--- Macro to create expression classes (EXPR) and overloads (FUN) for unary
  * functions, based on their coefficient-wise implementation (IMPL). ---*/
 
-#define MAKE_UNARY_FUN(FUN, EXPR, IMPL)                             \
-  /*!--- Expression class. ---*/                                    \
-  template <class U, class Scalar>                                  \
-  class EXPR : public CVecExpr<EXPR<U, Scalar>, Scalar> {           \
-    store_t<const U> u;                                             \
-                                                                    \
-   public:                                                          \
-    static constexpr bool StoreAsRef = false;                       \
-    FORCEINLINE EXPR(const U& u_) : u(u_) {}                        \
-    FORCEINLINE auto operator[](size_t i) const RETURNS(IMPL(u[i])) \
-  };                                                                \
-  /*!--- Function overload, returns an expression object. ---*/     \
-  template <class U, class S>                                       \
+#define MAKE_UNARY_FUN(FUN, EXPR, IMPL)                                                  \
+  /*!--- Expression class. ---*/                                                         \
+  template <class U, class Scalar>                                                       \
+  class EXPR : public CVecExpr<EXPR<U, Scalar>, Scalar> {                                \
+    store_t<const U> u;                                                                  \
+                                                                                         \
+   public:                                                                               \
+    static constexpr bool StoreAsRef = false;                                            \
+    FORCEINLINE EXPR(const U& u_) : u(u_) {}                                             \
+    SU2_CUDA_HOST_DEVICE FORCEINLINE auto operator[](size_t i) const RETURNS(IMPL(u[i])) \
+  };                                                                                     \
+  /*!--- Function overload, returns an expression object. ---*/                          \
+  template <class U, class S>                                                            \
   FORCEINLINE auto FUN(const CVecExpr<U, S>& u) RETURNS(EXPR<U, S>(u.derived()))
 
 #define sign_impl(x) Scalar(1 - 2 * (x < 0))
@@ -158,7 +164,7 @@ MAKE_UNARY_FUN(sign, sign_, sign_impl)
    public:                                                                                                            \
     static constexpr bool StoreAsRef = false;                                                                         \
     FORCEINLINE EXPR(const U& u_, const V& v_) : u(u_), v(v_) {}                                                      \
-    FORCEINLINE auto operator[](size_t i) const RETURNS(IMPL(u[i], v[i]))                                             \
+    SU2_CUDA_HOST_DEVICE FORCEINLINE auto operator[](size_t i) const RETURNS(IMPL(u[i], v[i]))                       \
   };                                                                                                                  \
   /*!--- Vector with vector function overload. ---*/                                                                  \
   template <class U, class V, class S>                                                                                \
@@ -241,4 +247,5 @@ MAKE_BINARY_FUN(operator>, gt_, gt_impl)
 #undef MAKE_BINARY_FUN
 
 /// @}
+#undef SU2_CUDA_HOST_DEVICE
 }  // namespace VecExpr
diff --git a/Common/include/option_structure.hpp b/Common/include/option_structure.hpp
index 4c521c29239..65ba6dfa1ab 100644
--- a/Common/include/option_structure.hpp
+++ b/Common/include/option_structure.hpp
@@ -2517,6 +2517,7 @@ enum ENUM_LINEAR_SOLVER_PREC {
   PASTIX_ILU=10,  /*!< \brief PaStiX ILU(k) preconditioner. */
   PASTIX_LU_P,    /*!< \brief PaStiX LU as preconditioner. */
   PASTIX_LDLT_P,  /*!< \brief PaStiX LDLT as preconditioner. */
+  NO_PRECONDITIONER=20, /*!< \brief No preconditioner. */
 };
 static const MapType<std::string, ENUM_LINEAR_SOLVER_PREC> Linear_Solver_Prec_Map = {
   MakePair("JACOBI", JACOBI)
@@ -2526,6 +2527,7 @@ static const MapType<std::string, ENUM_LINEAR_SOLVER_PREC> Linear_Solver_Prec_Ma
   MakePair("PASTIX_ILU", PASTIX_ILU)
   MakePair("PASTIX_LU", PASTIX_LU_P)
   MakePair("PASTIX_LDLT", PASTIX_LDLT_P)
+  MakePair("NONE", NO_PRECONDITIONER)
 };
 
 /*!
diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp
index 00ef51c2023..fafa093734e 100644
--- a/Common/src/linear_algebra/CSysMatrix.cpp
+++ b/Common/src/linear_algebra/CSysMatrix.cpp
@@ -54,6 +54,7 @@ CSysMatrix<ScalarType>::CSysMatrix() : rank(SU2_MPI::GetRank()), size(SU2_MPI::G
   col_ind_ilu = nullptr;
 
   invM = nullptr;
+  d_invM = nullptr;
 
 #ifdef USE_MKL
   MatrixMatrixProductJitter = nullptr;
@@ -67,6 +68,8 @@ template <class ScalarType>
 CSysMatrix<ScalarType>::~CSysMatrix() {
   SU2_ZONE_SCOPED
 
+  ReleaseCudaSpMVResources(spmv_resources);
+
   delete[] omp_partitions;
   MemoryAllocation::aligned_free(ILU_matrix);
   MemoryAllocation::aligned_free(matrix);
@@ -76,6 +79,7 @@ CSysMatrix<ScalarType>::~CSysMatrix() {
     GPUMemoryAllocation::gpu_free(d_matrix);
     GPUMemoryAllocation::gpu_free(d_row_ptr);
     GPUMemoryAllocation::gpu_free(d_col_ind);
+    GPUMemoryAllocation::gpu_free(d_invM);
   }
 
 #ifdef USE_MKL
@@ -86,6 +90,10 @@ CSysMatrix<ScalarType>::~CSysMatrix() {
 #endif
 }
 
+#ifndef HAVE_CUDA
+void ReleaseCudaSpMVResources(CudaSpMVResources*& resources) { resources = nullptr; }
+#endif
+
 template <class ScalarType>
 void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npointdomain, unsigned short nvar,
                                         unsigned short neqn, bool EdgeConnect, CGeometry* geometry,
@@ -196,6 +204,10 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
 
   if (diag_needed) allocAndInit(invM, nPointDomain * nVar * nEqn);
 
+  if (useCuda && diag_needed) {
+    d_invM = GPUMemoryAllocation::gpu_alloc<ScalarType, true>(nPointDomain * nVar * nEqn * sizeof(ScalarType));
+  }
+
   /*--- Thread parallel initialization. ---*/
 
   int num_threads = omp_get_max_threads();
@@ -672,6 +684,19 @@ void CSysMatrix<ScalarType>::MatrixVectorProduct(const CSysVector<ScalarType>& v
 template <class ScalarType>
 void CSysMatrix<ScalarType>::BuildJacobiPreconditioner() {
   SU2_ZONE_SCOPED
+
+  if (useCuda) {
+#ifdef HAVE_CUDA
+    BuildJacobiPreconditionerGPU();
+    return;
+#else
+    SU2_MPI::Error(
+        "\nError in building Jacobi preconditioner\nENABLE_CUDA is set to YES\nPlease compile with CUDA options "
+        "enabled in Meson to access GPU Functions",
+        CURRENT_FUNCTION);
+#endif
+  }
+
   /*--- Build Jacobi preconditioner (M = D), compute and store the inverses of the diagonal blocks. ---*/
   SU2_OMP_FOR_DYN(omp_heavy_size)
   for (unsigned long iPoint = 0; iPoint < nPointDomain; iPoint++)
@@ -684,6 +709,19 @@ void CSysMatrix<ScalarType>::ComputeJacobiPreconditioner(const CSysVector<Scalar
                                                          CSysVector<ScalarType>& prod, CGeometry* geometry,
                                                          const CConfig* config) const {
   SU2_ZONE_SCOPED
+
+  if (config->GetCUDA()) {
+#ifdef HAVE_CUDA
+    ComputeJacobiPreconditionerGPU(vec, prod, geometry, config);
+    return;
+#else
+    SU2_MPI::Error(
+        "\nError in applying Jacobi preconditioner\nENABLE_CUDA is set to YES\nPlease compile with CUDA options "
+        "enabled in Meson to access GPU Functions",
+        CURRENT_FUNCTION);
+#endif
+  }
+
   /*--- Apply Jacobi preconditioner, y = D^{-1} * x, the inverse of the diagonal is already known. ---*/
   SU2_OMP_BARRIER
   SU2_OMP_FOR_DYN(omp_heavy_size)
diff --git a/Common/src/linear_algebra/CSysMatrixGPU.cu b/Common/src/linear_algebra/CSysMatrixGPU.cu
index a3f1a77ca40..b8bb16ce5a9 100644
--- a/Common/src/linear_algebra/CSysMatrixGPU.cu
+++ b/Common/src/linear_algebra/CSysMatrixGPU.cu
@@ -55,6 +55,38 @@ inline cusparseIndexType_t GetCusparseIndexType() {
   }
 }
 
+struct CudaSpMVResources {
+  cusparseHandle_t handle = nullptr;
+  cusparseConstSpMatDescr_t mat = nullptr;
+  size_t buffer_size = 0;
+  void* buffer = nullptr;
+};
+
+void ReleaseCudaSpMVResources(CudaSpMVResources*& resources) {
+  if (resources == nullptr) {
+    return;
+  }
+
+  if (resources->buffer != nullptr) {
+    gpuErrChk(cudaFree(resources->buffer));
+    resources->buffer = nullptr;
+    resources->buffer_size = 0;
+  }
+
+  if (resources->mat != nullptr) {
+    cusparseErrChk(cusparseDestroySpMat(resources->mat));
+    resources->mat = nullptr;
+  }
+
+  if (resources->handle != nullptr) {
+    cusparseErrChk(cusparseDestroy(resources->handle));
+    resources->handle = nullptr;
+  }
+
+  delete resources;
+  resources = nullptr;
+}
+
 template <class ScalarType>
 constexpr cudaDataType GetCudaDataType() {
   if constexpr (std::is_same<ScalarType, float>::value) {
@@ -83,8 +115,6 @@ void CSysMatrix<ScalarType>::GPUMatrixVectorProduct(const CSysVector<ScalarType>
   ScalarType* d_vec = vec.GetDevicePointer();
   ScalarType* d_prod = prod.GetDevicePointer();
 
-  vec.HtDTransfer();
-
   const auto indexType = GetCusparseIndexType();
   const auto valueType = GetCudaDataType<ScalarType>();
 
@@ -100,42 +130,56 @@ void CSysMatrix<ScalarType>::GPUMatrixVectorProduct(const CSysVector<ScalarType>
   const ScalarType alpha = 1.0;
   const ScalarType beta = 0.0;
 
-  cusparseHandle_t handle = nullptr;
-  cusparseConstSpMatDescr_t matA = nullptr;
-  cusparseDnVecDescr_t vecX = nullptr;
-  cusparseDnVecDescr_t vecY = nullptr;
+  if (spmv_resources == nullptr) {
+    spmv_resources = new CudaSpMVResources;
 
-  cusparseErrChk(cusparseCreate(&handle));
+    cusparseErrChk(cusparseCreate(&spmv_resources->handle));
 
-  cusparseErrChk(cusparseCreateConstBsr(&matA, brows, bcols, bnnz, blockSize, blockSize, d_row_ptr, d_col_ind, d_matrix,
-                                        indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, valueType, CUSPARSE_ORDER_ROW));
+    cusparseErrChk(cusparseCreateConstBsr(&spmv_resources->mat, brows, bcols, bnnz, blockSize, blockSize, d_row_ptr,
+                                          d_col_ind, d_matrix, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO,
+                                          valueType, CUSPARSE_ORDER_ROW));
+  }
+
+  cusparseDnVecDescr_t vecX = nullptr;
+  cusparseDnVecDescr_t vecY = nullptr;
 
   cusparseErrChk(cusparseCreateDnVec(&vecX, xSize, d_vec, valueType));
   cusparseErrChk(cusparseCreateDnVec(&vecY, ySize, d_prod, valueType));
 
-  size_t bufferSize = 0;
-  void* dBuffer = nullptr;
+  size_t required_buffer_size = 0;
 
-  cusparseErrChk(cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY,
-                                         valueType, CUSPARSE_SPMV_BSR_ALG1, &bufferSize));
+  cusparseErrChk(cusparseSpMV_bufferSize(spmv_resources->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
+                                         spmv_resources->mat, vecX, &beta, vecY, valueType, CUSPARSE_SPMV_BSR_ALG1,
+                                         &required_buffer_size));
 
-  if (bufferSize > 0) {
-    gpuErrChk(cudaMalloc(&dBuffer, bufferSize));
-  }
+  if (required_buffer_size > spmv_resources->buffer_size) {
+    if (spmv_resources->buffer != nullptr) {
+      gpuErrChk(cudaFree(spmv_resources->buffer));
+    }
 
-  cusparseErrChk(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY, valueType,
-                              CUSPARSE_SPMV_BSR_ALG1, dBuffer));
+    if (required_buffer_size > 0) {
+      gpuErrChk(cudaMalloc(&spmv_resources->buffer, required_buffer_size));
+    } else {
+      spmv_resources->buffer = nullptr;
+    }
 
-  if (dBuffer != nullptr) {
-    gpuErrChk(cudaFree(dBuffer));
+    spmv_resources->buffer_size = required_buffer_size;
   }
 
+  cusparseErrChk(cusparseSpMV(spmv_resources->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmv_resources->mat,
+                              vecX, &beta, vecY, valueType, CUSPARSE_SPMV_BSR_ALG1, spmv_resources->buffer));
+
   cusparseErrChk(cusparseDestroyDnVec(vecY));
   cusparseErrChk(cusparseDestroyDnVec(vecX));
-  cusparseErrChk(cusparseDestroySpMat(matA));
-  cusparseErrChk(cusparseDestroy(handle));
-
-  prod.DtHTransfer();
 }
-
-template class CSysMatrix<su2mixedfloat>; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits.
+template void CSysMatrix<su2mixedfloat>::HtDTransfer(bool trigger) const;
+template void CSysMatrix<su2mixedfloat>::GPUMatrixVectorProduct(const CSysVector<su2mixedfloat>& vec,
+                                                                CSysVector<su2mixedfloat>& prod, CGeometry* geometry,
+                                                                const CConfig* config) const;
+
+#if defined(USE_MIXED_PRECISION) && !defined(USE_SINGLE_PRECISION)
+template void CSysMatrix<passivedouble>::HtDTransfer(bool trigger) const;
+template void CSysMatrix<passivedouble>::GPUMatrixVectorProduct(const CSysVector<passivedouble>& vec,
+                                                                CSysVector<passivedouble>& prod, CGeometry* geometry,
+                                                                const CConfig* config) const;
+#endif
diff --git a/Common/src/linear_algebra/CSysPreconditionerGPU.cu b/Common/src/linear_algebra/CSysPreconditionerGPU.cu
new file mode 100644
index 00000000000..adee93264eb
--- /dev/null
+++ b/Common/src/linear_algebra/CSysPreconditionerGPU.cu
@@ -0,0 +1,109 @@
+/*!
+ * \file CSysPreconditionerGPU.cu
+ * \brief CUDA/GPU skeleton implementations for matrix-based preconditioners.
+ * \author Jesse Li
+ * \version 8.5.0 "Harrier"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2026, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "../../include/linear_algebra/CSysMatrix.inl"
+#include "../../include/linear_algebra/GPUComms.cuh"
+
+namespace {
+
+template <class ScalarType>
+__global__ void ApplyJacobiPreconditionerKernel(const ScalarType* invM, const ScalarType* vec, ScalarType* prod,
+                                                unsigned long nPointDomain, unsigned long nVar) {
+  const auto iPoint = static_cast<unsigned long>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (iPoint >= nPointDomain) return;
+
+  const auto block = &invM[iPoint * nVar * nVar];
+  const auto rhs = &vec[iPoint * nVar];
+  auto out = &prod[iPoint * nVar];
+
+  for (auto iVar = 0ul; iVar < nVar; ++iVar) {
+    ScalarType sum = ScalarType(0);
+    for (auto jVar = 0ul; jVar < nVar; ++jVar) {
+      sum += block[iVar * nVar + jVar] * rhs[jVar];
+    }
+    out[iVar] = sum;
+  }
+}
+
+}  // namespace
+
+template <class ScalarType>
+void CSysMatrix<ScalarType>::BuildJacobiPreconditionerGPU() {
+  SU2_ZONE_SCOPED
+
+  if (nVar != nEqn) {
+    SU2_MPI::Error("CUDA Jacobi preconditioner requires square blocks.", CURRENT_FUNCTION);
+  }
+
+  if (invM == nullptr) {
+    SU2_MPI::Error("CUDA Jacobi preconditioner was requested without host inverse block storage.", CURRENT_FUNCTION);
+  }
+
+  if (d_invM == nullptr) {
+    d_invM = GPUMemoryAllocation::gpu_alloc<ScalarType, true>(nPointDomain * nVar * nEqn * sizeof(ScalarType));
+  }
+
+  for (auto iPoint = 0ul; iPoint < nPointDomain; ++iPoint) {
+    InverseDiagonalBlock(iPoint, &(invM[iPoint * nVar * nVar]));
+  }
+
+  gpuErrChk(cudaMemcpy(d_invM, invM, nPointDomain * nVar * nVar * sizeof(ScalarType), cudaMemcpyHostToDevice));
+}
+
+template <class ScalarType>
+void CSysMatrix<ScalarType>::ComputeJacobiPreconditionerGPU(const CSysVector<ScalarType>& vec,
+                                                            CSysVector<ScalarType>& prod, CGeometry* geometry,
+                                                            const CConfig* config) const {
+  (void)geometry;
+  (void)config;
+
+  SU2_ZONE_SCOPED
+
+  if (d_invM == nullptr) {
+    SU2_MPI::Error("CUDA Jacobi preconditioner used before BuildJacobiPreconditionerGPU.", CURRENT_FUNCTION);
+  }
+
+  constexpr unsigned threadsPerBlock = 128;
+  const auto blocks = static_cast<unsigned>((nPointDomain + threadsPerBlock - 1) / threadsPerBlock);
+  ApplyJacobiPreconditionerKernel<<<blocks, threadsPerBlock>>>(d_invM, vec.GetDevicePointer(), prod.GetDevicePointer(),
+                                                               nPointDomain, nVar);
+  gpuErrChk(cudaPeekAtLastError());
+}
+
+template void CSysMatrix<su2mixedfloat>::BuildJacobiPreconditionerGPU();
+template void CSysMatrix<su2mixedfloat>::ComputeJacobiPreconditionerGPU(const CSysVector<su2mixedfloat>& vec,
+                                                                        CSysVector<su2mixedfloat>& prod,
+                                                                        CGeometry* geometry,
+                                                                        const CConfig* config) const;
+
+#if defined(USE_MIXED_PRECISION) && !defined(USE_SINGLE_PRECISION)
+template void CSysMatrix<passivedouble>::BuildJacobiPreconditionerGPU();
+template void CSysMatrix<passivedouble>::ComputeJacobiPreconditionerGPU(const CSysVector<passivedouble>& vec,
+                                                                        CSysVector<passivedouble>& prod,
+                                                                        CGeometry* geometry,
+                                                                        const CConfig* config) const;
+#endif
diff --git a/Common/src/linear_algebra/CSysSolve.cpp b/Common/src/linear_algebra/CSysSolve.cpp
index cd100f311ab..1045ae68bbc 100644
--- a/Common/src/linear_algebra/CSysSolve.cpp
+++ b/Common/src/linear_algebra/CSysSolve.cpp
@@ -131,8 +131,35 @@ void LinearCombination(bool parallel, Ts&&... args) {
   }
 }
 
+template <class ScalarType>
+class CHostFGMRESOps {
+ public:
+  static constexpr bool UseSolverModGramSchmidt = true;
+
+  bool NestedParallel() const { return !omp_in_parallel() && omp_get_max_threads() > 1; }
+
+  void PrepareInput(const CSysVector<ScalarType>&, CSysVector<ScalarType>&, bool) const {}
+
+  void FinalizeOutput(CSysVector<ScalarType>&) const {}
+
+  ScalarType Norm(const CSysVector<ScalarType>& v) const { return v.norm(); }
+
+  void AssignNegative(CSysVector<ScalarType>& dst, const CSysVector<ScalarType>& src) const { dst = -src; }
+
+  void Subtract(CSysVector<ScalarType>& dst, const CSysVector<ScalarType>& src) const { dst -= src; }
+
+  void Divide(CSysVector<ScalarType>& dst, ScalarType val) const { dst /= val; }
+
+  void UpdateSolution(unsigned long n, const std::vector<CSysVector<ScalarType>>& basis, const su2vector<ScalarType>& y,
+                      CSysVector<ScalarType>& x, bool nestedParallel) const {
+    LinearCombination(nestedParallel, n, basis, y, x, true);
+  }
+};
+
 }  // namespace
 
+#include "../../include/linear_algebra/CSysSolveFGMRES.inl"
+
 template <class ScalarType>
 CSysSolve<ScalarType>::CSysSolve(LINEAR_SOLVER_MODE linear_solver_mode)
     : eps(linSolEpsilon<ScalarType>()),
@@ -421,200 +448,19 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
                                                       const CConfig* config) const {
   SU2_ZONE_SCOPED
 
-  const bool masterRank = (SU2_MPI::GetRank() == MASTER_NODE);
-  const bool flexible = !precond.IsIdentity();
-  /*--- If we call the solver outside of a parallel region, but the number of threads allows,
-   * we still want to parallelize some of the expensive operations. ---*/
-  const bool nestedParallel = !omp_in_parallel() && omp_get_max_threads() > 1;
-
-  /*---  Check the subspace size ---*/
-
-  if (m < 1) {
-    SU2_MPI::Error("Number of linear solver iterations must be greater than 0.", CURRENT_FUNCTION);
-  }
-
-  if (m > 1000) {
-    SU2_MPI::Error("FGMRES subspace is too large (>1000).", CURRENT_FUNCTION);
-  }
-
-  /*--- Allocate if not allocated yet ---*/
-
-  if (V.size() <= m || (flexible && Z.size() <= m)) {
-    BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS {
-      V.resize(m + 1);
-      for (auto& w : V) w.Initialize(x.GetNBlk(), x.GetNBlkDomain(), x.GetNVar(), nullptr);
-      if (flexible) {
-        Z.resize(m + 1);
-        for (auto& z : Z) z.Initialize(x.GetNBlk(), x.GetNBlkDomain(), x.GetNVar(), nullptr);
-      }
-    }
-    END_SU2_OMP_SAFE_GLOBAL_ACCESS
-  }
-
-  /*--- Define various arrays. In parallel, each thread of each rank has and works
-   on its own thread, since calculations on these arrays are based on dot products
-   (reduced across all threads and ranks) all threads do the same computations. ---*/
-
-  su2vector<ScalarType> g(m + 1), sn(m + 1), cs(m + 1), y(m);
-  g = ScalarType(0);
-  sn = ScalarType(0);
-  cs = ScalarType(0);
-  y = ScalarType(0);
-  su2matrix<ScalarType> H(m + 1, m);
-  H = ScalarType(0);
-
-  /*--- Calculate the norm of the rhs vector. ---*/
-
-  ScalarType norm0 = b.norm();
-
-  /*--- Calculate the initial residual (actually the negative residual) and compute its norm. ---*/
-
-  if (!xIsZero) {
-    mat_vec(x, V[0]);
-    V[0] -= b;
-  } else {
-    V[0] = -b;
-  }
-
-  ScalarType beta = xIsZero ? norm0 : V[0].norm();
-
-  /*--- FGMRES shares V and Z with FGCRODR, reset deflation because we're breaking
-   * relations between subspaces that FGCRODR relies on for recycling. ---*/
-
-  SU2_OMP_MASTER
-  ResetDeflation();
-  END_SU2_OMP_MASTER
-
-  /*--- Set the norm to the initial initial residual value ---*/
-
-  if (tol_type == LinearToleranceType::RELATIVE) norm0 = beta;
-
-  if (beta < tol * norm0 || beta < eps) {
-    /*--- System is already solved ---*/
-
-    if (masterRank) {
-      SU2_OMP_MASTER
-      cout << "CSysSolve::FGMRES(): system solved by initial guess." << endl;
-      END_SU2_OMP_MASTER
-    }
-    residual = beta;
-    return 0;
-  }
-
-  /*--- Normalize residual to get w_{0} (the negative sign is because w[0]
-        holds the negative residual, as mentioned above). ---*/
-
-  V[0] /= -beta;
-
-  /*--- Initialize the RHS of the reduced system ---*/
-
-  g[0] = beta;
-
-  /*--- Output header information including initial residual ---*/
-
-  unsigned long i = 0;
-  if (monitoring && masterRank) {
-    SU2_OMP_MASTER {
-      WriteHeader("FGMRES", tol, beta);
-      WriteHistory(i, beta / norm0);
-    }
-    END_SU2_OMP_MASTER
-  }
-
-  /*---  Loop over all search directions ---*/
-
-  for (i = 0; i < m; i++) {
-    /*---  Check if solution has converged ---*/
-
-    if (beta < tol * norm0) break;
-
-    if (flexible) {
-      /*---  Precondition the CSysVector v[i] and store result in z[i] ---*/
-
-      precond(V[i], Z[i]);
-
-      /*---  Add to Krylov subspace ---*/
-
-      mat_vec(Z[i], V[i + 1]);
-    } else {
-      mat_vec(V[i], V[i + 1]);
-    }
-
-    /*---  Modified Gram-Schmidt orthogonalization ---*/
-
-    bool orthog_ok;
-    if (nestedParallel) {
-      /*--- "omp parallel if" does not work well here ---*/
-      SU2_OMP_PARALLEL
-      orthog_ok = ModGramSchmidt(true, i, H, V);
-      END_SU2_OMP_PARALLEL
-    } else {
-      orthog_ok = ModGramSchmidt(false, i, H, V);
-    }
-
-    if (!orthog_ok) {
-      if (masterRank) {
-        SU2_OMP_MASTER
-        cout << "WARNING: FGMRES orthogonalization failed, linear solver diverged." << endl;
-        END_SU2_OMP_MASTER
-      }
-      break;
-    }
-
-    /*---  Apply old Givens rotations to new column of the Hessenberg matrix then generate the
-     new Givens rotation matrix and apply it to the last two elements of H[:][i] and g ---*/
-
-    for (unsigned long k = 0; k < i; k++) ApplyGivens(sn[k], cs[k], H(k, i), H(k + 1, i));
-    GenerateGivens(H(i, i), H(i + 1, i), sn[i], cs[i]);
-    ApplyGivens(sn[i], cs[i], g[i], g[i + 1]);
-
-    /*---  Set L2 norm of residual and check if solution has converged ---*/
-
-    beta = fabs(g[i + 1]);
-
-    /*---  Output the relative residual if necessary ---*/
-
-    if (monitoring && masterRank && ((i + 1) % monitorFreq == 0)) {
-      SU2_OMP_MASTER
-      WriteHistory(i + 1, beta / norm0);
-      END_SU2_OMP_MASTER
-    }
-  }
-
-  /*---  Solve the least-squares system and update solution ---*/
-
-  SolveReduced(i, H, g, y);
-
-  const auto& basis = flexible ? Z : V;
-
-  LinearCombination(nestedParallel, i, basis, y, x, true);
-
-  /*---  Recalculate final (neg.) residual (this should be optional) ---*/
-
-  if (monitoring && config->GetComm_Level() == COMM_FULL) {
-    if (masterRank) {
-      SU2_OMP_MASTER
-      WriteFinalResidual("FGMRES", i, beta / norm0);
-      END_SU2_OMP_MASTER
-    }
-
-    if (recomputeRes) {
-      mat_vec(x, V[0]);
-      V[0] -= b;
-      ScalarType res = V[0].norm();
-
-      if (fabs(res - beta) > tol * 10) {
-        if (masterRank) {
-          SU2_OMP_MASTER
-          WriteWarning(beta, res, tol);
-          END_SU2_OMP_MASTER
-        }
-      }
-    }
+  if (config->GetCUDA()) {
+#ifdef HAVE_CUDA
+    return FGMRES_LinSolver_GPU(b, x, mat_vec, precond, tol, m, residual, monitoring, config);
+#else
+    SU2_MPI::Error(
+        "\nError in launching FGMRES solver\nENABLE_CUDA is set to YES\nPlease compile with CUDA options enabled "
+        "in Meson to access GPU Functions",
+        CURRENT_FUNCTION);
+#endif
   }
 
-  residual = beta / norm0;
-  return i;
+  return FGMRES_LinSolverImpl(b, x, mat_vec, precond, tol, m, residual, monitoring, config,
+                              CHostFGMRESOps<ScalarType>());
 }
 
 template <class ScalarType>
diff --git a/Common/src/linear_algebra/CSysSolveGPU.cu b/Common/src/linear_algebra/CSysSolveGPU.cu
new file mode 100644
index 00000000000..d618cc223b5
--- /dev/null
+++ b/Common/src/linear_algebra/CSysSolveGPU.cu
@@ -0,0 +1,120 @@
+/*!
+ * \file CSysSolveGPU.cu
+ * \brief CUDA/GPU backend dispatch for Krylov solvers.
+ * \author Jesse Li
+ * \version 8.5.0 "Harrier"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2026, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "../../include/linear_algebra/CSysSolve.hpp"
+#include "../../include/linear_algebra/CMatrixVectorProduct.hpp"
+#include "../../include/linear_algebra/CPreconditioner.hpp"
+#include "../../include/linear_algebra/GPUComms.cuh"
+
+void SU2_GPU_BeginSolverBLASContext();
+void SU2_GPU_EndSolverBLASContext();
+
+namespace {
+
+class CGPUSolverBLASContextGuard {
+ public:
+  CGPUSolverBLASContextGuard() { SU2_GPU_BeginSolverBLASContext(); }
+
+  ~CGPUSolverBLASContextGuard() { SU2_GPU_EndSolverBLASContext(); }
+
+  CGPUSolverBLASContextGuard(const CGPUSolverBLASContextGuard&) = delete;
+  CGPUSolverBLASContextGuard& operator=(const CGPUSolverBLASContextGuard&) = delete;
+};
+
+template <class ScalarType>
+class CDeviceFGMRESOps {
+ public:
+  static constexpr bool UseSolverModGramSchmidt = false;
+
+  bool NestedParallel() const { return false; }
+
+  void PrepareInput(const CSysVector<ScalarType>& b, CSysVector<ScalarType>& x, bool x_is_zero) const {
+    b.HtDTransfer();
+    if (x_is_zero) {
+      x.device() = ScalarType(0);
+    } else {
+      x.HtDTransfer();
+    }
+  }
+
+  void FinalizeOutput(CSysVector<ScalarType>& x) const { x.DtHTransfer(); }
+
+  ScalarType Norm(const CSysVector<ScalarType>& v) const { return v.GPUNorm(); }
+
+  void AssignNegative(CSysVector<ScalarType>& dst, const CSysVector<ScalarType>& src) const {
+    dst.device() = -src.device();
+  }
+
+  void Subtract(CSysVector<ScalarType>& dst, const CSysVector<ScalarType>& src) const {
+    dst.device() -= src.device();
+  }
+
+  void Divide(CSysVector<ScalarType>& dst, ScalarType val) const { dst.device() /= val; }
+
+  ScalarType Dot(const CSysVector<ScalarType>& lhs, const CSysVector<ScalarType>& rhs) const {
+    return lhs.GPUDot(rhs);
+  }
+
+  void AddScaled(CSysVector<ScalarType>& dst, ScalarType alpha, const CSysVector<ScalarType>& src) const {
+    dst.device() += alpha * src.device();
+  }
+
+  void UpdateSolution(unsigned long n, const std::vector<CSysVector<ScalarType>>& basis,
+                      const su2vector<ScalarType>& y, CSysVector<ScalarType>& x, bool) const {
+    for (unsigned long k = 0; k < n; k++) {
+      AddScaled(x, y[k], basis[k]);
+    }
+  }
+};
+
+}  // namespace
+
+#include "../../include/linear_algebra/CSysSolveFGMRES.inl"
+
+template <class ScalarType>
+unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver_GPU(const VectorType& b, VectorType& x,
+                                                          const ProductType& mat_vec, const PrecondType& precond,
+                                                          ScalarType tol, unsigned long m, ScalarType& residual,
+                                                          bool monitoring, const CConfig* config) const {
+  CGPUSolverBLASContextGuard blas_context;
+  return FGMRES_LinSolverImpl(b, x, mat_vec, precond, tol, m, residual, monitoring, config,
+                              CDeviceFGMRESOps<ScalarType>());
+}
+
+/*--- Explicit instantiations for the GPU backend wrapper.
+ * Keep these aligned with the scalar types instantiated for CSysSolve. ---*/
+template unsigned long CSysSolve<su2mixedfloat>::FGMRES_LinSolver_GPU(
+    const CSysVector<su2mixedfloat>& b, CSysVector<su2mixedfloat>& x,
+    const CMatrixVectorProduct<su2mixedfloat>& mat_vec, const CPreconditioner<su2mixedfloat>& precond,
+    su2mixedfloat tol, unsigned long m, su2mixedfloat& residual, bool monitoring, const CConfig* config) const;
+
+#if defined(USE_MIXED_PRECISION) && !defined(USE_SINGLE_PRECISION)
+template unsigned long CSysSolve<passivedouble>::FGMRES_LinSolver_GPU(
+    const CSysVector<passivedouble>& b, CSysVector<passivedouble>& x,
+    const CMatrixVectorProduct<passivedouble>& mat_vec, const CPreconditioner<passivedouble>& precond,
+    passivedouble tol, unsigned long m, passivedouble& residual, bool monitoring, const CConfig* config) const;
+#endif
diff --git a/Common/src/linear_algebra/CSysVectorGPU.cu b/Common/src/linear_algebra/CSysVectorGPU.cu
index 94ec17bb88f..dca72c0d76b 100644
--- a/Common/src/linear_algebra/CSysVectorGPU.cu
+++ b/Common/src/linear_algebra/CSysVectorGPU.cu
@@ -27,23 +27,129 @@
 
 #include "../../include/linear_algebra/CSysVector.hpp"
 #include "../../include/linear_algebra/GPUComms.cuh"
+#include <cublas_v2.h>
+#include <type_traits>
 
-template<class ScalarType>
-void CSysVector<ScalarType>::HtDTransfer(bool trigger) const
-{
-   if(trigger) gpuErrChk(cudaMemcpy((void*)(d_vec_val), (void*)&vec_val[0], (sizeof(ScalarType)*nElm), cudaMemcpyHostToDevice));
+namespace {
+
+thread_local cublasHandle_t active_solver_blas_handle = nullptr;
+thread_local unsigned active_solver_blas_depth = 0;
+
+cublasHandle_t GetBlasHandle(const char* creation_error, bool& owns_handle) {
+  if (active_solver_blas_handle != nullptr) {
+    owns_handle = false;
+    return active_solver_blas_handle;
+  }
+
+  cublasHandle_t handle = nullptr;
+  owns_handle = true;
+  if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
+    SU2_MPI::Error(creation_error, CURRENT_FUNCTION);
+  }
+  return handle;
+}
+
+void ReleaseBlasHandle(cublasHandle_t handle, bool owns_handle, const char* destruction_error) {
+  if (owns_handle && handle != nullptr && cublasDestroy(handle) != CUBLAS_STATUS_SUCCESS) {
+    SU2_MPI::Error(destruction_error, CURRENT_FUNCTION);
+  }
+}
+
+}  // namespace
+
+void SU2_GPU_BeginSolverBLASContext() {
+  if (active_solver_blas_depth == 0) {
+    cublasHandle_t handle = nullptr;
+    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
+      SU2_MPI::Error("cuBLAS handle creation failed for the GPU linear solver context.", CURRENT_FUNCTION);
+    }
+    active_solver_blas_handle = handle;
+  }
+  ++active_solver_blas_depth;
+}
+
+void SU2_GPU_EndSolverBLASContext() {
+  if (active_solver_blas_depth == 0) {
+    SU2_MPI::Error("GPU linear solver BLAS context ended without a matching begin.", CURRENT_FUNCTION);
+  }
+
+  --active_solver_blas_depth;
+  if (active_solver_blas_depth == 0) {
+    auto status = cublasDestroy(active_solver_blas_handle);
+    active_solver_blas_handle = nullptr;
+    if (status != CUBLAS_STATUS_SUCCESS) {
+      SU2_MPI::Error("cuBLAS handle destruction failed for the GPU linear solver context.", CURRENT_FUNCTION);
+    }
+  }
 }
 
-template<class ScalarType>
-void CSysVector<ScalarType>::DtHTransfer(bool trigger) const
-{
-   if(trigger) gpuErrChk(cudaMemcpy((void*)(&vec_val[0]), (void*)d_vec_val, (sizeof(ScalarType)*nElm), cudaMemcpyDeviceToHost));
+template <class ScalarType>
+void CSysVector<ScalarType>::HtDTransfer(bool trigger) const {
+  if (trigger)
+    gpuErrChk(cudaMemcpy((void*)(d_vec_val), (void*)&vec_val[0], (sizeof(ScalarType) * nElm), cudaMemcpyHostToDevice));
 }
 
-template<class ScalarType>
-void CSysVector<ScalarType>::GPUSetVal(ScalarType val, bool trigger) const
-{
-   if(trigger) gpuErrChk(cudaMemset((void*)(d_vec_val), val, (sizeof(ScalarType)*nElm)));
+template <class ScalarType>
+void CSysVector<ScalarType>::DtHTransfer(bool trigger) const {
+  if (trigger)
+    gpuErrChk(cudaMemcpy((void*)(&vec_val[0]), (void*)d_vec_val, (sizeof(ScalarType) * nElm), cudaMemcpyDeviceToHost));
 }
 
-template class CSysVector<su2double>; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits.
+template <class ScalarType>
+ScalarType CSysVector<ScalarType>::GPUDot(const CSysVector& other) const {
+  bool owns_handle = false;
+  cublasHandle_t handle = GetBlasHandle("cuBLAS handle creation failed in CSysVector::GPUDot.", owns_handle);
+  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
+
+  ScalarType local_dot = ScalarType(0);
+
+  if constexpr (std::is_same_v<ScalarType, float>) {
+    status = cublasSdot(handle, static_cast<int>(nElmDomain), GetDevicePointer(), 1, other.GetDevicePointer(), 1,
+                        &local_dot);
+  } else if constexpr (std::is_same_v<ScalarType, double>) {
+    status = cublasDdot(handle, static_cast<int>(nElmDomain), GetDevicePointer(), 1, other.GetDevicePointer(), 1,
+                        &local_dot);
+  } else {
+    ReleaseBlasHandle(handle, owns_handle, "cuBLAS handle destruction failed in CSysVector::GPUDot.");
+    SU2_MPI::Error("Unsupported ScalarType in CSysVector::GPUDot.", CURRENT_FUNCTION);
+    return ScalarType(0);
+  }
+
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    ReleaseBlasHandle(handle, owns_handle, "cuBLAS handle destruction failed in CSysVector::GPUDot.");
+    SU2_MPI::Error("cuBLAS dot failed in CSysVector::GPUDot.", CURRENT_FUNCTION);
+    return ScalarType(0);
+  }
+
+  ReleaseBlasHandle(handle, owns_handle, "cuBLAS handle destruction failed in CSysVector::GPUDot.");
+
+  ScalarType global_dot = ScalarType(0);
+  const auto mpi_type = (sizeof(ScalarType) < sizeof(double)) ? MPI_FLOAT : MPI_DOUBLE;
+  SelectMPIWrapper<ScalarType>::W::Allreduce(&local_dot, &global_dot, 1, mpi_type, MPI_SUM, SU2_MPI::GetComm());
+
+  return global_dot;
+}
+
+template <class ScalarType>
+ScalarType CSysVector<ScalarType>::GPUNorm() const {
+  return sqrt(GPUDot(*this));
+}
+
+template void CSysVector<su2mixedfloat>::HtDTransfer(bool trigger) const;
+template void CSysVector<su2mixedfloat>::DtHTransfer(bool trigger) const;
+template su2mixedfloat CSysVector<su2mixedfloat>::GPUDot(const CSysVector<su2mixedfloat>& other) const;
+template su2mixedfloat CSysVector<su2mixedfloat>::GPUNorm() const;
+
+#if defined(USE_MIXED_PRECISION) && !defined(USE_SINGLE_PRECISION)
+template void CSysVector<passivedouble>::HtDTransfer(bool trigger) const;
+template void CSysVector<passivedouble>::DtHTransfer(bool trigger) const;
+template passivedouble CSysVector<passivedouble>::GPUDot(const CSysVector<passivedouble>& other) const;
+template passivedouble CSysVector<passivedouble>::GPUNorm() const;
+#endif
+
+#ifdef CODI_REVERSE_TYPE
+template void CSysVector<su2double>::HtDTransfer(bool trigger) const;
+template void CSysVector<su2double>::DtHTransfer(bool trigger) const;
+template su2double CSysVector<su2double>::GPUDot(const CSysVector<su2double>& other) const;
+template su2double CSysVector<su2double>::GPUNorm() const;
+#endif
diff --git a/Common/src/linear_algebra/meson.build b/Common/src/linear_algebra/meson.build
index 7b880b29c1e..29bb63cd680 100644
--- a/Common/src/linear_algebra/meson.build
+++ b/Common/src/linear_algebra/meson.build
@@ -6,5 +6,5 @@ common_src += files(['CSysSolve_b.cpp',
                      'blas_structure.cpp'])
 
   if get_option('enable-cuda')
-    common_src += files(['CSysMatrixGPU.cu', 'CSysVectorGPU.cu',])
+    common_src += files(['CSysMatrixGPU.cu', 'CSysVectorGPU.cu', 'CSysSolveGPU.cu', 'CSysPreconditionerGPU.cu',])
 endif
diff --git a/Common/src/meson.build b/Common/src/meson.build
index f385c4a32ed..710e764303f 100644
--- a/Common/src/meson.build
+++ b/Common/src/meson.build
@@ -6,6 +6,27 @@ common_src =files(['graph_coloring_structure.cpp',
            '../include/parallelization/mpi_structure.cpp',
            '../include/parallelization/omp_structure.cpp'])
 
+common_cuda_cpp_args = []
+foreach arg : su2_cpp_args
+  if arg.startswith('-D') or arg.startswith('-U')
+    common_cuda_cpp_args += arg
+  endif
+endforeach
+
+common_cuda_rev_args = common_cuda_cpp_args
+foreach arg : codi_rev_args
+  if arg.startswith('-D') or arg.startswith('-U')
+    common_cuda_rev_args += arg
+  endif
+endforeach
+
+common_cuda_for_args = common_cuda_cpp_args
+foreach arg : codi_for_args
+  if arg.startswith('-D') or arg.startswith('-U')
+    common_cuda_for_args += arg
+  endif
+endforeach
+
 subdir('linear_algebra')
 subdir('toolboxes')
 subdir('geometry')
@@ -24,7 +45,8 @@ if get_option('enable-normal')
                           common_src,
                           install : false,
                           dependencies : su2_deps,
-                          cpp_args: [default_warning_flags, su2_cpp_args])
+                          cpp_args: [default_warning_flags, su2_cpp_args],
+                          cuda_args: common_cuda_cpp_args)
 
   common_dep = declare_dependency(link_with: common,
                                   include_directories : common_include)
@@ -36,7 +58,8 @@ if get_option('enable-autodiff')
                             common_src,
                             install : false,
                             dependencies : [su2_deps, codi_dep],
-                            cpp_args: [default_warning_flags, su2_cpp_args, codi_rev_args])
+                            cpp_args: [default_warning_flags, su2_cpp_args, codi_rev_args],
+                            cuda_args: common_cuda_rev_args)
 
   commonAD_dep = declare_dependency(link_with: commonAD,
                                     include_directories : common_include)
@@ -49,7 +72,8 @@ if get_option('enable-directdiff')
                             common_src,
                             install : false,
                             dependencies : [su2_deps, codi_dep],
-                            cpp_args: [default_warning_flags, su2_cpp_args, codi_for_args])
+                            cpp_args: [default_warning_flags, su2_cpp_args, codi_for_args],
+                            cuda_args: common_cuda_for_args)
 
   commonDD_dep = declare_dependency(link_with: commonDD,
                                     include_directories : common_include)
diff --git a/meson.build b/meson.build
index d43e2b031d8..43786c100e9 100644
--- a/meson.build
+++ b/meson.build
@@ -20,13 +20,18 @@ python = pymod.find_installation()
 if get_option('enable-cuda')
   add_languages('cuda')
   add_global_arguments('-arch=sm_86', language : 'cuda')
-  cuda_deps = [meson.get_compiler('cuda').find_library('cusparse', required : true)]
+  cuda_deps = [
+    meson.get_compiler('cuda').find_library('cusparse', required : true),
+    meson.get_compiler('cuda').find_library('cublas', required : true),
+  ]
 else
   cuda_deps = []
 endif
 
 su2_cpp_args = []
 su2_deps     = [declare_dependency(include_directories: 'externals/CLI11')] + cuda_deps
+codi_rev_args = []
+codi_for_args = []
 
 default_warning_flags = []
 if build_machine.system() != 'windows'