From fd5e74e2ad3e86408f59faec68a5f04d2e78e25a Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Mon, 26 Sep 2016 14:58:06 +0200
Subject: [PATCH 001/101] Add GSVD solver based on QR, CS decompositions

(double precision real only)
---
 SRC/dggqrcs.f | 564 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 564 insertions(+)
 create mode 100644 SRC/dggqrcs.f
diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
new file mode 100644
index 0000000000..1eb39b4a1e
--- /dev/null
+++ b/SRC/dggqrcs.f
@@ -0,0 +1,564 @@
+*> \brief <b> DGGQRCS computes the singular value decomposition (SVD) for OTHER matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DGGQRCS + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dggqrcs.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dggqrcs.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dggqrcs.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
+*                           A, LDA, B, LDB,
+*                           THETA, U1, LDU1, U2, LDU2, QT, LDQT,
+*                           WORK, LWORK, IWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBU1, JOB2, JOBQT
+*       INTEGER            INFO, LDA, LDB, LDU1, LDU2, LDQT,
+*      $                   M, N, P, L, LWORK
+*       DOUBLE PRECISION   W
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IWORK( * )
+*       DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), THETA( * ),
+*      $                   U1( LDU1, * ), U2( LDU2, * ), QT( LDQ, * ),
+*      $                   WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DGGQRCS computes the generalized singular value decomposition (GSVD)
+*> of an M-by-N real matrix A and P-by-N real matrix B:
+*>
+*>       U1**T*A*Q = D1*( 0 R ),    U2**T*B*Q = D2*( 0 R )
+*>
+*> where U1, U2, and Q are orthogonal matrices. DGGQRCS uses the QR
+*> factorization with column pivoting and the 2-by-1 CS decomposition to
+*> compute the GSVD.
+*>
+*> Let L be the effective numerical rank of the matrix (A**T,B**T)**T,
+*> then R is a L-by-L nonsingular upper triangular matrix, D1 and
+*> D2 are M-by-L and P-by-L "diagonal" matrices and of the
+*> following structures, respectively:
+*>
+*>                        K   K1
+*>        D1 =     (  0   0   0 )
+*>              K  (  0   S   0 )
+*>              K1 (  0   0   I )
+*>
+*>                    K2  K
+*>        D2 =  K2 (  I   0   0 )
+*>              K  (  0   C   0 )
+*>                 (  0   0   0 )
+*>
+*>                 N-L  L
+*>   ( 0 R ) = L (  0   R )
+*>
+*> where
+*>
+*>   K  = MIN(M, P, L, M + P - L),
+*>   K1 = MAX(L - P, 0),
+*>   K2 = MAX(L - M, 0),
+*>   C  = diag( COS(THETA(1)), ..., COS(THETA(K)) ),
+*>   S  = diag( SIN(THETA(1)), ..., SIN(THETA(K)) ), and
+*>   C^2 + S^2 = I.
+*
+*> The routine computes C, S, R, and optionally the orthogonal
+*> transformation matrices U, V and Q. If L <= M, then R is stored in
+*> A(1:L, :) on exit. Otherwise, the first M rows of R are stored in A
+*> and the last L-M rows are stored in B(1:L-M, :).
+*>
+*> In particular, if B is an N-by-N nonsingular matrix, then the GSVD of
+*> A and B implicitly gives the SVD of A*inv(B):
+*>                      A*inv(B) = U1*(D1*inv(D2))*U2**T.
+*> If (A**T,B**T)**T  has orthonormal columns, then the GSVD of A and B
+*> is also equal to the CS decomposition of A and B. Furthermore, the
+*> GSVD can be used to derive the solution of the eigenvalue problem:
+*>                      A**T*A x = lambda* B**T*B x.
+*> In some literature, the GSVD of A and B is presented in the form
+*>                  U1**T*A*X = ( 0 D1 ),   U2**T*B*X = ( 0 D2 )
+*> where U1 and U2 are orthogonal and X is nonsingular, D1 and D2 are
+*> ``diagonal''.  The former GSVD form can be converted to the latter
+*> form by taking the nonsingular matrix X as
+*>
+*>                      X = Q*( I   0    )
+*>                            ( 0 inv(R) ).
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBU1
+*> \verbatim
+*>          JOBU1 is CHARACTER*1
+*>          = 'Y':  Orthogonal matrix U1 is computed;
+*>          = 'N':  U1 is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBU2
+*> \verbatim
+*>          JOBU2 is CHARACTER*1
+*>          = 'Y':  Orthogonal matrix U2 is computed;
+*>          = 'N':  U2 is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBQT
+*> \verbatim
+*>          JOBQT is CHARACTER*1
+*>          = 'Y':  Orthogonal matrix Q is computed;
+*>          = 'N':  Q is not computed.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrices A and B.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] P
+*> \verbatim
+*>          P is INTEGER
+*>          The number of rows of the matrix B.  P >= 0.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W in DOUBLE PRECISION
+*>
+*>          On exit, W is a radix power chosen such that the Frobenius
+*>          norm of A and W*B are within sqrt(radix) and 1/sqrt(radix)
+*>          of each other.
+*> \endverbatim
+*>
+*> \param[out] L
+*> \verbatim
+*>          L is INTEGER
+*>          On exit, the effective numerical rank of the matrix
+*>          (A**T, B**T)**T.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N)
+*>          On entry, the M-by-N matrix A.
+*>          On exit, A contains the triangular matrix R or the first M
+*>          rows of R, respectively. See Purpose for details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is DOUBLE PRECISION array, dimension (LDB,N)
+*>          On entry, the P-by-N matrix B.
+*>          On exit, if L > M, then B contains the last L - M rows of
+*>          the triangular matrix R. See Purpose for details.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B. LDB >= max(1,P).
+*> \endverbatim
+*>
+*> \param[out] THETA
+*> \verbatim
+*>          THETA is DOUBLE PRECISION array, dimension (N)
+*>
+*>          On exit, THETA contains K = MIN(M, P, L, M + P - L) values
+*>          in radians in ascending order.
+*> \endverbatim
+*>
+*> \param[out] U1
+*> \verbatim
+*>          U1 is DOUBLE PRECISION array, dimension (LDU1,M)
+*>          If JOBU1 = 'Y', U1 contains the M-by-M orthogonal matrix U1.
+*>          If JOBU1 = 'N', U1 is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDU1
+*> \verbatim
+*>          LDU1 is INTEGER
+*>          The leading dimension of the array U1. LDU1 >= max(1,M) if
+*>          JOBU1 = 'Y'; LDU1 >= 1 otherwise.
+*> \endverbatim
+*>
+*> \param[out] U2
+*> \verbatim
+*>          U2 is DOUBLE PRECISION array, dimension (LDU2,P)
+*>          If JOBU2 = 'Y', U2 contains the P-by-P orthogonal matrix U2.
+*>          If JOBU2 = 'N', U2 is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDU2
+*> \verbatim
+*>          LDU2 is INTEGER
+*>          The leading dimension of the array U2. LDU2 >= max(1,P) if
+*>          JOBU2 = 'Y'; LDU2 >= 1 otherwise.
+*> \endverbatim
+*>
+*> \param[out] QT
+*> \verbatim
+*>          QT is DOUBLE PRECISION array, dimension (LDQT,N)
+*>          If JOBQT = 'Y', QT contains the N-by-N orthogonal matrix
+*>          Q**T.
+*> \endverbatim
+*>
+*> \param[in] LDQT
+*> \verbatim
+*>          LDQT is INTEGER
+*>          The leading dimension of the array QT. LDQT >= max(1,N) if
+*>          JOBQT = 'Y'; LDQT >= 1 otherwise.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal size of the WORK array,
+*>          returns this value as the first entry of the WORK array, and
+*>          no error message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (M + N + P)
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>          > 0:  DBBCSD did not converge. For further details, see
+*>                subroutine DORCSDBY1.
+*> \endverbatim
+*
+*> \par Internal Parameters:
+*  =========================
+*>
+*> \verbatim
+*>  TOL     DOUBLE PRECISION
+*>          Let G = (A**T,B**T)**T. TOL is the threshold to determine
+*>          the effective rank of G. Generally, it is set to
+*>                   TOL = MAX(M,P,N) * norm(G) * MACHEPS,
+*>          where norm(G) is the Frobenius norm of G.
+*>          The size of TOL may affect the size of backward error of the
+*>          decomposition.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Christoph Conrads (https://christoph-conrads.name)
+*
+*> \date September 2016
+*
+*> \ingroup doubleGEsing
+*
+*> \par Contributors:
+*  ==================
+*>
+*>     Christoph Conrads (https://christoph-conrads.name)
+*>
+*
+*> \par Further Details:
+*  =====================
+*>
+*>  DGGQRCS should be significantly faster than DGGSVD and DGGSVD3 for
+*>  large matrices because the matrices A and B are reduced to a pair of
+*>  well-conditioned bidiagonal matrices instead of pairs of upper
+*>  triangular matrices. On the downside, DGGQRCS requires a much larger
+*>  workspace whose dimension must be queried at run-time.
+*>
+*  =====================================================================
+      SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
+     $                    A, LDA, B, LDB
+     $                    THETA, U1, LDU1, U2, LDU2, QT, LDQT,
+     $                    WORK, LWORK, IWORK, INFO )
+*
+*  -- LAPACK driver routine (version 3.7.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
+*     September 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBU1, JOBU2, JOBQT
+      INTEGER            INFO, LDA, LDB, LDU1, LDU2, LDQT,
+     $                   L, M, N, P, LWORK
+      DOUBLE PRECISION   W
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * )
+      DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), THETA( * ),
+     $                   U1( LDU1, * ), U2( LDU2, * ), QT( LDQT, * ),
+     $                   WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Local Scalars ..
+      LOGICAL            WANTU1, WANTU2, WANTQT, LQUERY
+      INTEGER            I, J, Z, R, LWKOPT
+      DOUBLE PRECISION   GNORM, FACTOR, TOL, ULP, UNFL
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      DOUBLE PRECISION   DLAMCH, DLANGE
+      EXTERNAL           LSAME, DLAMCH, DLANGE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DCOPY, DGEQP3, DORGQR, DGERQF, QORGRQ,
+     $                   DORCSD2BY1, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Decode and test the input parameters
+*
+      WANTU1 = LSAME( JOBU1, 'Y' )
+      WANTU2 = LSAME( JOBU2, 'Y' )
+      WANTQT = LSAME( JOBQT, 'Y' )
+      LQUERY = ( LWORK.EQ.-1 )
+      LWKOPT = 1
+*
+*     Initialize variables
+*
+      G = WORK
+      TAU = WORK
+      L = MIN( M + P, N )
+      LDG = M + P
+      Z = ( M + P ) * N
+*
+*     Test the input arguments
+*
+      INFO = 0
+      IF( .NOT.( WANTU1 .OR. LSAME( JOBU1, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( WANTU2 .OR. LSAME( JOBU2, 'N' ) ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( WANTQT .OR. LSAME( JOBQT, 'N' ) ) ) THEN
+         INFO = -3
+      ELSE IF( M.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -5
+      ELSE IF( P.LT.0 ) THEN
+         INFO = -6
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -10
+      ELSE IF( LDB.LT.MAX( 1, P ) ) THEN
+         INFO = -12
+      ELSE IF( LDU1.LT.1 .OR. ( WANTU1 .AND. LDU1.LT.M ) ) THEN
+         INFO = -15
+      ELSE IF( LDU2.LT.1 .OR. ( WANTU2 .AND. LDU2.LT.P ) ) THEN
+         INFO = -17
+      ELSE IF( LDQT.LT.1 .OR. ( WANTQT .AND. LDQT.LT.N ) ) THEN
+         INFO = -19
+      ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
+         INFO = -23
+      END IF
+*
+*     Compute workspace
+*
+      IF( INFO.EQ.0 ) THEN
+         CALL DGEQP3( M+P, N, G, LDG, IWORK,
+     $                THETA, WORK( Z + 1 ), LWORK - Z, -1 )
+         LWKOPT = INT( WORK( 1 ) )
+
+         CALL DORGQR( M + P, L, L, G, LDG, THETA, WORK( Z + 1 ), -1 )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
+
+         CALL DORCSD2BY1( JOBU1, JOBU2, JOBQT, M + P, M, N,
+     $                    G, LDG, G, LDG,
+     $                    THETA, U1, LDU1, U2, LDU2, QT, LDQT,
+     $                    WORK( Z + 1 ), LWORK - Z, IWORK, -1 )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
+         LWKOPT = (M + P)*N + LWKOPT
+
+*        DGERQF stores L scalar factors for the elementary reflectors
+         CALL DGERQF( L, N, QT, LDQT, TAU, WORK( L ), LWORK, -1 )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
+
+         CALL DORGRQ( N, N, L, QT, LDQT, TAU, WORK( L ), LWORK, -1 )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
+
+         WORK( 1 ) = DBLE( LWKOPT )
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DGGQRCS', -INFO )
+         RETURN
+      END IF
+      IF( LQUERY ) THEN
+         RETURN
+      ENDIF
+*
+*     Scale matrix B such that norm(A) \approx norm(B)
+*
+      NORMA = DLANGE( 'F', M, N, A, LDA, NULL )
+      NORMB = DLANGE( 'F', P, N, B, LDB, NULL )
+*
+      BASE = DLAMCH( 'B' )
+      FACTOR = -0.5D0 / LOG ( BASE )
+      W = BASE ** INT( FACTOR * LOG( NORMA / NORMB ) )
+*
+      DLASCL( 'G', -1, -1, ONE, W, P, N, B, LDB, INFO )
+      IF ( INFO.NE.0 )
+         RETURN
+      END IF
+*
+*     Copy matrices A, B into the (M+P) x n matrix G
+*
+      DLACPY( 'A', M, N, A, LDA, G( P + 1, 1 ), LDG )
+      DLACPY( 'A', P, N, B, LDB, G( 1, 1 ), LDG )
+*
+*     Compute the Frobenius norm of matrix G
+*
+      GNORM = DLANGE( 'F', M + P, N, G, LDG, NULL )
+*
+*     Get machine precision and set up threshold for determining
+*     the effective numerical rank of the matrix G.
+*
+      ULP = DLAMCH( 'Precision' )
+      UNFL = DLAMCH( 'Safe Minimum' )
+      TOL = MAX( M + P, N )*MAX( NORM, UNFL )*ULP
+*
+*     IWORK stores the column permutations computed by DGEQP3.
+*     Columns J where IWORK( J ) is non-zero are permuted to the front
+*     so we set the all entries to zero here.
+*
+      DO 10 J = 1, N
+         IWORK( J ) = 0
+   10 CONTINUE
+*
+*     Compute the QR factorization with column pivoting GΠ = Q1 R1
+*
+      CALL DGEQP3( M + P, N, G, LDG, IWORK, THETA, WORK2, LWORK2, INFO )
+      IF( INFO.NE.0 ) THEN
+         ERROR
+      END IF
+*
+*     Determine the rank of G
+*
+      R = 0
+      DO 20 I = 1, MIN( M + P, N )
+         IF( ABS( G( I, I ) ).LE.TOL ) THEN
+            EXIT
+         END IF
+         R = R + 1
+   20 CONTINUE
+*
+      L = R
+*
+*     Copy R1 into A
+*
+      IF( R.LE.M ) THEN
+          DLACPY( 'U', R, N, G, LDG, A, LDA )
+      ELSE
+          DLACPY( 'U', M, N, G, LDG, A, LDA )
+          DLACPY( 'A', R - M, N, G( M + 1, 1 ), LDG, B, LDB )
+      END IF
+*
+*     Explicitly form Q1 so that we can compute the CS decomposition
+*
+      DORGQR( M + P, R, R, G, LDG, THETA, WORK2, LWORK2, INFO )
+*
+*     Compute the CS decomposition of Q1( :, 1:R )
+*
+      DORCSD2BY1( JOBU1, JOBU2, JOBQT, M + P, M, R,
+     $            G( 1, 1 ), LDG, G( M + 1, 1 ), LDG, THETA,
+     $            U2, LDU2, U1, LDU1, QT, LDQT,
+     $            WORK2, LWORK2, IWORK( N + 1 ), INFO )
+      IF( INFO.NE.0 )
+         RETURN
+      END IF
+*
+*     Copy V^T from QT to G
+*
+      DLACPY( 'A', R, R, QT, LDQT, G, R )
+*
+*     Compute V^T R1( 1:R, : )
+*
+      IF ( R.LE.M ) THEN
+         DGEMM( 'N', 'N', R, N, R, 1.0D0, G, R,
+     $          A, LDA, 0.0D0, QT, LDQT )
+      ELSE
+         DGEMM( 'N', 'N', R, N, M, 1.0D0, G( 1, 1 ), R,
+     $          A, LDA, 0.0D0, QT, LDQT )
+         DGEMM( 'N', 'N', R, N, R - M, 1.0D0, G( 1, M+1: ), R,
+     $          B, LDB, 1.0D0, QT, LDQT )
+      END IF
+*
+*     Copy V^T R1( 1:R, : ) from G to QT
+*
+      DLACPY
+*
+*     Compute the RQ decomposition of V^T R1( 1:R, : )
+*
+      DGERQF( R, N, QT, LDQT, TAU, WORK2, LWORK2, INFO )
+      IF ( INFO.NE.0 ) THEN
+         RETURN
+      END IF
+*
+*     Copy matrix R from QT( 1:R, N-R+1: ) to A, B
+*
+      IF ( R.LE.M ) THEN
+         DLACPY( 'U', R, R, QT( 1, N-R+1: ), LDQT, A, LDA )
+      ELSE
+         DLACPY( 'U', M,     R, QT(     1, N-R+1: ), LDQT, A, LDA )
+         DLACPY( 'U', R - M, R, QT( M + 1, N-R+1: ), LDQT, B, LDB )
+      END IF
+*
+*     Explicitly form Q^T
+*
+      DORGRQ( N, N, R, QT, LDQT, TAU, WORK2, LWORK2, INFO )
+      IF ( INFO.NE.0 ) THEN
+         RETURN
+      END IF
+*
+*     Revert column permutation Π by permuting the rows of Q^T
+*
+      DLAPMT( .FALSE., N, N, QT, LDQT, IWORK )
+*
+      WORK( 1 ) = DBLE( LWKOPT )
+      RETURN
+*
+*     End of DGGQRCS
+*
+      END

From 63ce49028287fd8ab123e3955514874039e8bda3 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Mon, 26 Sep 2016 16:55:48 +0200
Subject: [PATCH 002/101] Make DGGQRCS compile

- add file to CMakeLists.txt
- fix syntax errors etc.
---
 SRC/CMakeLists.txt |  2 +-
 SRC/dggqrcs.f      | 79 +++++++++++++++++++++++-----------------------
 2 files changed, 41 insertions(+), 40 deletions(-)

diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index abd05731d7..98a35253d8 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -284,7 +284,7 @@ set(DLASRC
    dgetrf.f dgetrf2.f dgetri.f
    dgetrs.f dggbak.f dggbal.f
    dgges.f  dgges3.f dggesx.f dggev.f  dggev3.f dggevx.f
-   dggglm.f dgghrd.f dgghd3.f dgglse.f dggqrf.f
+   dggglm.f dgghrd.f dgghd3.f dgglse.f dggqrf.f dggqrcs.f
    dggrqf.f dggsvd3.f dggsvp3.f dgtcon.f dgtrfs.f dgtsv.f
    dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f
    dlaqz0.f dlaqz1.f dlaqz2.f dlaqz3.f dlaqz4.f
diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 1eb39b4a1e..65fa2ea860 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -77,7 +77,7 @@
 *>   C  = diag( COS(THETA(1)), ..., COS(THETA(K)) ),
 *>   S  = diag( SIN(THETA(1)), ..., SIN(THETA(K)) ), and
 *>   C^2 + S^2 = I.
-*
+*>
 *> The routine computes C, S, R, and optionally the orthogonal
 *> transformation matrices U, V and Q. If L <= M, then R is stored in
 *> A(1:L, :) on exit. Otherwise, the first M rows of R are stored in A
@@ -89,7 +89,7 @@
 *> If (A**T,B**T)**T  has orthonormal columns, then the GSVD of A and B
 *> is also equal to the CS decomposition of A and B. Furthermore, the
 *> GSVD can be used to derive the solution of the eigenvalue problem:
-*>                      A**T*A x = lambda* B**T*B x.
+*>                      A**T*A x = lambda * B**T*B x.
 *> In some literature, the GSVD of A and B is presented in the form
 *>                  U1**T*A*X = ( 0 D1 ),   U2**T*B*X = ( 0 D2 )
 *> where U1 and U2 are orthogonal and X is nonsingular, D1 and D2 are
@@ -306,7 +306,7 @@
 *>
 *  =====================================================================
       SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
-     $                    A, LDA, B, LDB
+     $                    A, LDA, B, LDB,
      $                    THETA, U1, LDU1, U2, LDU2, QT, LDQT,
      $                    WORK, LWORK, IWORK, INFO )
 *
@@ -333,7 +333,9 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTQT, LQUERY
       INTEGER            I, J, Z, R, LWKOPT
-      DOUBLE PRECISION   GNORM, FACTOR, TOL, ULP, UNFL
+      DOUBLE PRECISION   GNORM, FACTOR, TOL, ULP, UNFL, NORMA, NORMB
+*     .. Local Arrays ..
+      DOUBLE PRECISION   G( M + P, N )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -341,7 +343,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       EXTERNAL           LSAME, DLAMCH, DLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DCOPY, DGEQP3, DORGQR, DGERQF, QORGRQ,
+      EXTERNAL           DLACPY, DLASCL, DGEQP3, DORGQR, DGERQF, QORGRQ,
      $                   DORCSD2BY1, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
@@ -359,11 +361,10 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Initialize variables
 *
-      G = WORK
-      TAU = WORK
       L = MIN( M + P, N )
-      LDG = M + P
       Z = ( M + P ) * N
+      G = RESHAPE( WORK(1:Z), (/ M + P, N /) )
+      LDG = M + P
 *
 *     Test the input arguments
 *
@@ -409,13 +410,13 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
      $                    THETA, U1, LDU1, U2, LDU2, QT, LDQT,
      $                    WORK( Z + 1 ), LWORK - Z, IWORK, -1 )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
-         LWKOPT = (M + P)*N + LWKOPT
+         LWKOPT = Z + LWKOPT
 
 *        DGERQF stores L scalar factors for the elementary reflectors
-         CALL DGERQF( L, N, QT, LDQT, TAU, WORK( L ), LWORK, -1 )
+         CALL DGERQF( L, N, QT, LDQT, WORK, WORK, LWORK - L, -1 )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
 
-         CALL DORGRQ( N, N, L, QT, LDQT, TAU, WORK( L ), LWORK, -1 )
+         CALL DORGRQ( N, N, L, QT, LDQT, WORK, WORK, LWORK - L, -1 )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
 
          WORK( 1 ) = DBLE( LWKOPT )
@@ -438,15 +439,15 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       FACTOR = -0.5D0 / LOG ( BASE )
       W = BASE ** INT( FACTOR * LOG( NORMA / NORMB ) )
 *
-      DLASCL( 'G', -1, -1, ONE, W, P, N, B, LDB, INFO )
-      IF ( INFO.NE.0 )
+      CALL DLASCL( 'G', -1, -1, ONE, W, P, N, B, LDB, INFO )
+      IF ( INFO.NE.0 ) THEN
          RETURN
       END IF
 *
 *     Copy matrices A, B into the (M+P) x n matrix G
 *
-      DLACPY( 'A', M, N, A, LDA, G( P + 1, 1 ), LDG )
-      DLACPY( 'A', P, N, B, LDB, G( 1, 1 ), LDG )
+      CALL DLACPY( 'A', M, N, A, LDA, G( P + 1, 1 ), LDG )
+      CALL DLACPY( 'A', P, N, B, LDB, G( 1, 1 ), LDG )
 *
 *     Compute the Frobenius norm of matrix G
 *
@@ -457,7 +458,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
       ULP = DLAMCH( 'Precision' )
       UNFL = DLAMCH( 'Safe Minimum' )
-      TOL = MAX( M + P, N )*MAX( NORM, UNFL )*ULP
+      TOL = MAX( M + P, N ) * MAX( GNORM, UNFL ) * ULP
 *
 *     IWORK stores the column permutations computed by DGEQP3.
 *     Columns J where IWORK( J ) is non-zero are permuted to the front
@@ -469,9 +470,10 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Compute the QR factorization with column pivoting GΠ = Q1 R1
 *
-      CALL DGEQP3( M + P, N, G, LDG, IWORK, THETA, WORK2, LWORK2, INFO )
+      CALL DGEQP3( M + P, N, G, LDG, IWORK, THETA,
+     $             WORK( Z + 1 ), LWORK - Z, INFO )
       IF( INFO.NE.0 ) THEN
-         ERROR
+         RETURN
       END IF
 *
 *     Determine the rank of G
@@ -489,49 +491,47 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     Copy R1 into A
 *
       IF( R.LE.M ) THEN
-          DLACPY( 'U', R, N, G, LDG, A, LDA )
+          CALL DLACPY( 'U', R, N, G, LDG, A, LDA )
       ELSE
-          DLACPY( 'U', M, N, G, LDG, A, LDA )
-          DLACPY( 'A', R - M, N, G( M + 1, 1 ), LDG, B, LDB )
+          CALL DLACPY( 'U', M, N, G, LDG, A, LDA )
+          CALL DLACPY( 'A', R - M, N, G( M + 1, 1 ), LDG, B, LDB )
       END IF
 *
 *     Explicitly form Q1 so that we can compute the CS decomposition
 *
-      DORGQR( M + P, R, R, G, LDG, THETA, WORK2, LWORK2, INFO )
+      CALL DORGQR( M + P, R, R, G, LDG, THETA,
+     $             WORK( Z + 1 ), LWORK - Z, INFO )
 *
 *     Compute the CS decomposition of Q1( :, 1:R )
 *
-      DORCSD2BY1( JOBU1, JOBU2, JOBQT, M + P, M, R,
+      CALL DORCSD2BY1( JOBU1, JOBU2, JOBQT, M + P, M, R,
      $            G( 1, 1 ), LDG, G( M + 1, 1 ), LDG, THETA,
      $            U2, LDU2, U1, LDU1, QT, LDQT,
-     $            WORK2, LWORK2, IWORK( N + 1 ), INFO )
-      IF( INFO.NE.0 )
+     $            WORK( Z + 1 ), LWORK - Z, IWORK( N + 1 ), INFO )
+      IF( INFO.NE.0 ) THEN
          RETURN
       END IF
 *
 *     Copy V^T from QT to G
 *
-      DLACPY( 'A', R, R, QT, LDQT, G, R )
+      CALL DLACPY( 'A', R, R, QT, LDQT, G, R )
 *
 *     Compute V^T R1( 1:R, : )
 *
       IF ( R.LE.M ) THEN
-         DGEMM( 'N', 'N', R, N, R, 1.0D0, G, R,
+         CALL DGEMM( 'N', 'N', R, N, R, 1.0D0, G, R,
      $          A, LDA, 0.0D0, QT, LDQT )
       ELSE
-         DGEMM( 'N', 'N', R, N, M, 1.0D0, G( 1, 1 ), R,
+         CALL DGEMM( 'N', 'N', R, N, M, 1.0D0, G( 1, 1 ), R,
      $          A, LDA, 0.0D0, QT, LDQT )
-         DGEMM( 'N', 'N', R, N, R - M, 1.0D0, G( 1, M+1: ), R,
+         CALL DGEMM( 'N', 'N', R, N, R - M, 1.0D0, G( 1, M + 1 ), R,
      $          B, LDB, 1.0D0, QT, LDQT )
       END IF
 *
-*     Copy V^T R1( 1:R, : ) from G to QT
-*
-      DLACPY
-*
 *     Compute the RQ decomposition of V^T R1( 1:R, : )
 *
-      DGERQF( R, N, QT, LDQT, TAU, WORK2, LWORK2, INFO )
+      CALL DGERQF( R, N, QT, LDQT, WORK,
+     $             WORK( L + 1 ), LWORK - L, INFO )
       IF ( INFO.NE.0 ) THEN
          RETURN
       END IF
@@ -539,22 +539,23 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     Copy matrix R from QT( 1:R, N-R+1: ) to A, B
 *
       IF ( R.LE.M ) THEN
-         DLACPY( 'U', R, R, QT( 1, N-R+1: ), LDQT, A, LDA )
+         CALL DLACPY( 'U', R, R, QT( 1, N-R+1 ), LDQT, A, LDA )
       ELSE
-         DLACPY( 'U', M,     R, QT(     1, N-R+1: ), LDQT, A, LDA )
-         DLACPY( 'U', R - M, R, QT( M + 1, N-R+1: ), LDQT, B, LDB )
+         CALL DLACPY( 'U', M,     R, QT(     1, N-R+1 ), LDQT, A, LDA )
+         CALL DLACPY( 'U', R - M, R, QT( M + 1, N-R+1 ), LDQT, B, LDB )
       END IF
 *
 *     Explicitly form Q^T
 *
-      DORGRQ( N, N, R, QT, LDQT, TAU, WORK2, LWORK2, INFO )
+      CALL DORGRQ( N, N, R, QT, LDQT, WORK,
+     $             WORK( L + 1 ), LWORK - L, INFO )
       IF ( INFO.NE.0 ) THEN
          RETURN
       END IF
 *
 *     Revert column permutation Π by permuting the rows of Q^T
 *
-      DLAPMT( .FALSE., N, N, QT, LDQT, IWORK )
+      CALL DLAPMT( .FALSE., N, N, QT, LDQT, IWORK )
 *
       WORK( 1 ) = DBLE( LWKOPT )
       RETURN

From 96dccdf7f769c0e943ff9c7e1f071dc35a4b6451 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Tue, 27 Sep 2016 17:49:14 +0200
Subject: [PATCH 003/101] Fix DGGQRCS

- workspace queries are signalled by LWORK=-1, not INFO=-1
- fix undeclared, uninitialized variable errors
---
 SRC/dggqrcs.f | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 65fa2ea860..597ecd5ec8 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -315,6 +315,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *     September 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          JOBU1, JOBU2, JOBQT
       INTEGER            INFO, LDA, LDB, LDU1, LDU2, LDQT,
@@ -332,8 +333,9 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTQT, LQUERY
-      INTEGER            I, J, Z, R, LWKOPT
-      DOUBLE PRECISION   GNORM, FACTOR, TOL, ULP, UNFL, NORMA, NORMB
+      INTEGER            I, J, Z, R, LDG, LWKOPT
+      DOUBLE PRECISION   GNORM, FACTOR, TOL, ULP, UNFL, NORMA, NORMB,
+     $                   BASE
 *     .. Local Arrays ..
       DOUBLE PRECISION   G( M + P, N )
 *     ..
@@ -399,24 +401,25 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
       IF( INFO.EQ.0 ) THEN
          CALL DGEQP3( M+P, N, G, LDG, IWORK,
-     $                THETA, WORK( Z + 1 ), LWORK - Z, -1 )
+     $                THETA, WORK( Z + 1 ), -1, INFO )
          LWKOPT = INT( WORK( 1 ) )
 
-         CALL DORGQR( M + P, L, L, G, LDG, THETA, WORK( Z + 1 ), -1 )
+         CALL DORGQR( M + P, L, L, G, LDG, THETA,
+     $                WORK( Z + 1 ), -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 
          CALL DORCSD2BY1( JOBU1, JOBU2, JOBQT, M + P, M, N,
      $                    G, LDG, G, LDG,
      $                    THETA, U1, LDU1, U2, LDU2, QT, LDQT,
-     $                    WORK( Z + 1 ), LWORK - Z, IWORK, -1 )
+     $                    WORK( Z + 1 ), -1, IWORK, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
          LWKOPT = Z + LWKOPT
 
 *        DGERQF stores L scalar factors for the elementary reflectors
-         CALL DGERQF( L, N, QT, LDQT, WORK, WORK, LWORK - L, -1 )
+         CALL DGERQF( L, N, QT, LDQT, WORK, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
 
-         CALL DORGRQ( N, N, L, QT, LDQT, WORK, WORK, LWORK - L, -1 )
+         CALL DORGRQ( N, N, L, QT, LDQT, WORK, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
 
          WORK( 1 ) = DBLE( LWKOPT )
@@ -432,14 +435,14 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Scale matrix B such that norm(A) \approx norm(B)
 *
-      NORMA = DLANGE( 'F', M, N, A, LDA, NULL )
-      NORMB = DLANGE( 'F', P, N, B, LDB, NULL )
+      NORMA = DLANGE( 'F', M, N, A, LDA, WORK )
+      NORMB = DLANGE( 'F', P, N, B, LDB, WORK )
 *
       BASE = DLAMCH( 'B' )
       FACTOR = -0.5D0 / LOG ( BASE )
       W = BASE ** INT( FACTOR * LOG( NORMA / NORMB ) )
 *
-      CALL DLASCL( 'G', -1, -1, ONE, W, P, N, B, LDB, INFO )
+      CALL DLASCL( 'G', -1, -1, 1.0D0, W, P, N, B, LDB, INFO )
       IF ( INFO.NE.0 ) THEN
          RETURN
       END IF
@@ -451,7 +454,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Compute the Frobenius norm of matrix G
 *
-      GNORM = DLANGE( 'F', M + P, N, G, LDG, NULL )
+      GNORM = DLANGE( 'F', M + P, N, G, LDG, WORK( Z ) )
 *
 *     Get machine precision and set up threshold for determining
 *     the effective numerical rank of the matrix G.

From 1e815eeb39be51a1c657b08446387550744648f1 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Wed, 28 Sep 2016 17:33:33 +0200
Subject: [PATCH 004/101] Fix many DGGQRCS bugs

- fix workspace queries
- fix DORCSD2BY1 parameter containing the number of rows of X11
- avoid scaling the matrix B if B is zero
- handle the case where both matrices A, B are zero
- check INFO after calling DORGQR
---
 SRC/dggqrcs.f | 54 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 37 insertions(+), 17 deletions(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 597ecd5ec8..d23a61d82c 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -400,18 +400,16 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     Compute workspace
 *
       IF( INFO.EQ.0 ) THEN
-         CALL DGEQP3( M+P, N, G, LDG, IWORK,
-     $                THETA, WORK( Z + 1 ), -1, INFO )
+         CALL DGEQP3( M+P, N, G, LDG, IWORK, THETA, WORK, -1, INFO )
          LWKOPT = INT( WORK( 1 ) )
 
-         CALL DORGQR( M + P, L, L, G, LDG, THETA,
-     $                WORK( Z + 1 ), -1, INFO )
+         CALL DORGQR( M + P, L, L, G, LDG, THETA, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 
-         CALL DORCSD2BY1( JOBU1, JOBU2, JOBQT, M + P, M, N,
+         CALL DORCSD2BY1( JOBU1, JOBU2, JOBQT, M + P, P, N,
      $                    G, LDG, G, LDG,
-     $                    THETA, U1, LDU1, U2, LDU2, QT, LDQT,
-     $                    WORK( Z + 1 ), -1, IWORK, INFO )
+     $                    THETA, U2, LDU2, U1, LDU1, QT, LDQT,
+     $                    WORK, -1, IWORK, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
          LWKOPT = Z + LWKOPT
 
@@ -438,13 +436,17 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       NORMA = DLANGE( 'F', M, N, A, LDA, WORK )
       NORMB = DLANGE( 'F', P, N, B, LDB, WORK )
 *
-      BASE = DLAMCH( 'B' )
-      FACTOR = -0.5D0 / LOG ( BASE )
-      W = BASE ** INT( FACTOR * LOG( NORMA / NORMB ) )
+      IF ( NORMB.EQ.0 ) THEN
+         W = 1.0D0
+      ELSE
+         BASE = DLAMCH( 'B' )
+         FACTOR = -0.5D0 / LOG ( BASE )
+         W = BASE ** INT( FACTOR * LOG( NORMA / NORMB ) )
 *
-      CALL DLASCL( 'G', -1, -1, 1.0D0, W, P, N, B, LDB, INFO )
-      IF ( INFO.NE.0 ) THEN
-         RETURN
+         CALL DLASCL( 'G', -1, -1, 1.0D0, W, P, N, B, LDB, INFO )
+         IF ( INFO.NE.0 ) THEN
+            RETURN
+         END IF
       END IF
 *
 *     Copy matrices A, B into the (M+P) x n matrix G
@@ -491,6 +493,21 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
       L = R
 *
+*     Handle rank=0 case
+*
+      IF( L.EQ.0 ) THEN
+         IF( WANTU1 ) THEN
+            CALL DLASET( 'A', M, M, 0.0D0, 1.0D0, U1, LDU1 )
+         END IF
+         IF( WANTU2 ) THEN
+            CALL DLASET( 'A', P, P, 0.0D0, 1.0D0, U2, LDU2 )
+         END IF
+         IF( WANTQT ) THEN
+            CALL DLASET( 'A', N, N, 0.0D0, 1.0D0, QT, LDQT )
+         END IF
+         RETURN
+      END IF
+*
 *     Copy R1 into A
 *
       IF( R.LE.M ) THEN
@@ -504,13 +521,16 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
       CALL DORGQR( M + P, R, R, G, LDG, THETA,
      $             WORK( Z + 1 ), LWORK - Z, INFO )
+      IF ( INFO.NE.0 ) THEN
+         RETURN
+      END IF
 *
 *     Compute the CS decomposition of Q1( :, 1:R )
 *
-      CALL DORCSD2BY1( JOBU1, JOBU2, JOBQT, M + P, M, R,
-     $            G( 1, 1 ), LDG, G( M + 1, 1 ), LDG, THETA,
-     $            U2, LDU2, U1, LDU1, QT, LDQT,
-     $            WORK( Z + 1 ), LWORK - Z, IWORK( N + 1 ), INFO )
+      CALL DORCSD2BY1( JOBU1, JOBU2, JOBQT, M + P, P, R,
+     $                 G( 1, 1 ), LDG, G( P + 1, 1 ), LDG, THETA,
+     $                 U2, LDU2, U1, LDU1, QT, LDQT,
+     $                 WORK( Z + 1 ), LWORK - Z, IWORK( N + 1 ), INFO )
       IF( INFO.NE.0 ) THEN
          RETURN
       END IF

From ace0611ff89f52361cfa84d9becb2a194b463fa5 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Wed, 28 Sep 2016 17:44:53 +0200
Subject: [PATCH 005/101] Fix matrix computation in DGGQRCS

Fix the computation of an n-by-n orthogonal matrix Q by DORGRQ by
ensuring that the R elementary reflectors from DGERQF can be found in
the last R rows of Q.
---
 SRC/dggqrcs.f | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index d23a61d82c..62b71373b4 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -539,21 +539,21 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
       CALL DLACPY( 'A', R, R, QT, LDQT, G, R )
 *
-*     Compute V^T R1( 1:R, : )
+*     Compute V^T R1( 1:R, : ) in the last R rows of QT
 *
       IF ( R.LE.M ) THEN
          CALL DGEMM( 'N', 'N', R, N, R, 1.0D0, G, R,
-     $          A, LDA, 0.0D0, QT, LDQT )
+     $          A, LDA, 0.0D0, QT( N-R+1, 1 ), LDQT )
       ELSE
          CALL DGEMM( 'N', 'N', R, N, M, 1.0D0, G( 1, 1 ), R,
-     $          A, LDA, 0.0D0, QT, LDQT )
+     $          A, LDA, 0.0D0, QT( N-R+1, 1 ), LDQT )
          CALL DGEMM( 'N', 'N', R, N, R - M, 1.0D0, G( 1, M + 1 ), R,
-     $          B, LDB, 1.0D0, QT, LDQT )
+     $          B, LDB, 1.0D0, QT( N-R+1, 1 ), LDQT )
       END IF
 *
 *     Compute the RQ decomposition of V^T R1( 1:R, : )
 *
-      CALL DGERQF( R, N, QT, LDQT, WORK,
+      CALL DGERQF( R, N, QT( N-R+1, 1 ), LDQT, WORK,
      $             WORK( L + 1 ), LWORK - L, INFO )
       IF ( INFO.NE.0 ) THEN
          RETURN
@@ -562,10 +562,10 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     Copy matrix R from QT( 1:R, N-R+1: ) to A, B
 *
       IF ( R.LE.M ) THEN
-         CALL DLACPY( 'U', R, R, QT( 1, N-R+1 ), LDQT, A, LDA )
+         CALL DLACPY( 'U', R, R, QT( N-R+1, N-R+1 ), LDQT, A, LDA )
       ELSE
-         CALL DLACPY( 'U', M,     R, QT(     1, N-R+1 ), LDQT, A, LDA )
-         CALL DLACPY( 'U', R - M, R, QT( M + 1, N-R+1 ), LDQT, B, LDB )
+         CALL DLACPY( 'U', M,     R, QT( N-R+1, N-R+1 ), LDQT, A, LDA )
+         CALL DLACPY( 'U', R - M, R, QT( N-R+M+1, N-R+1 ), LDQT, B, LDB)
       END IF
 *
 *     Explicitly form Q^T

From 4bb8054ceb872a58c1a321004680ad2f43bfe2b8 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Wed, 28 Sep 2016 20:09:26 +0200
Subject: [PATCH 006/101] Avoid DGGQRCS workspace query memory overflows

---
 SRC/dggqrcs.f | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 62b71373b4..b8c3b20ad3 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -365,7 +365,11 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
       L = MIN( M + P, N )
       Z = ( M + P ) * N
-      G = RESHAPE( WORK(1:Z), (/ M + P, N /) )
+      IF ( LQUERY ) THEN
+         G = 0
+      ELSE
+         G = RESHAPE( WORK(1:Z), (/ M + P, N /) )
+      END IF
       LDG = M + P
 *
 *     Test the input arguments

From 32622a69c3c4795909a63a7b4333ceeb791f5e04 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Wed, 28 Sep 2016 20:10:32 +0200
Subject: [PATCH 007/101] DGGQRCS: ensure #columns <= #rows with DORCSD2BY1

Ensure the number of columns indicated to DORCSD2BY1 is less than or
equal to the number of rows during workspace queries.
---
 SRC/dggqrcs.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index b8c3b20ad3..931da4b1a4 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -410,7 +410,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          CALL DORGQR( M + P, L, L, G, LDG, THETA, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 
-         CALL DORCSD2BY1( JOBU1, JOBU2, JOBQT, M + P, P, N,
+         CALL DORCSD2BY1( JOBU1, JOBU2, JOBQT, M + P, P, L,
      $                    G, LDG, G, LDG,
      $                    THETA, U2, LDU2, U1, LDU1, QT, LDQT,
      $                    WORK, -1, IWORK, INFO )

From 1750e47c879a276427910a5e7f9286b2ef6d8493 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Wed, 28 Sep 2016 21:33:57 +0200
Subject: [PATCH 008/101] Fix completely broken DGGQRCS matrix scaling

The variable FACTOR was copy-pasted from xGEEQUB.
---
 SRC/dggqrcs.f | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 931da4b1a4..4c93b7be41 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -334,8 +334,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTQT, LQUERY
       INTEGER            I, J, Z, R, LDG, LWKOPT
-      DOUBLE PRECISION   GNORM, FACTOR, TOL, ULP, UNFL, NORMA, NORMB,
-     $                   BASE
+      DOUBLE PRECISION   GNORM, TOL, ULP, UNFL, NORMA, NORMB, BASE
 *     .. Local Arrays ..
       DOUBLE PRECISION   G( M + P, N )
 *     ..
@@ -444,8 +443,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          W = 1.0D0
       ELSE
          BASE = DLAMCH( 'B' )
-         FACTOR = -0.5D0 / LOG ( BASE )
-         W = BASE ** INT( FACTOR * LOG( NORMA / NORMB ) )
+         W = BASE ** INT( LOG( NORMA / NORMB ) / LOG( BASE ) )
 *
          CALL DLASCL( 'G', -1, -1, 1.0D0, W, P, N, B, LDB, INFO )
          IF ( INFO.NE.0 ) THEN

From fbf90a0aec7b6bab3355cac40ea56e683e1caca0 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Wed, 28 Sep 2016 21:47:36 +0200
Subject: [PATCH 009/101] Fix DGGQRCS bugs

- compute Q only when needed
- fix argument order to DORCSD2BY1
---
 SRC/dggqrcs.f | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 4c93b7be41..637c66d3d0 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -409,7 +409,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          CALL DORGQR( M + P, L, L, G, LDG, THETA, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 
-         CALL DORCSD2BY1( JOBU1, JOBU2, JOBQT, M + P, P, L,
+         CALL DORCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, L,
      $                    G, LDG, G, LDG,
      $                    THETA, U2, LDU2, U1, LDU1, QT, LDQT,
      $                    WORK, -1, IWORK, INFO )
@@ -529,7 +529,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Compute the CS decomposition of Q1( :, 1:R )
 *
-      CALL DORCSD2BY1( JOBU1, JOBU2, JOBQT, M + P, P, R,
+      CALL DORCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, R,
      $                 G( 1, 1 ), LDG, G( P + 1, 1 ), LDG, THETA,
      $                 U2, LDU2, U1, LDU1, QT, LDQT,
      $                 WORK( Z + 1 ), LWORK - Z, IWORK( N + 1 ), INFO )
@@ -572,15 +572,17 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Explicitly form Q^T
 *
-      CALL DORGRQ( N, N, R, QT, LDQT, WORK,
-     $             WORK( L + 1 ), LWORK - L, INFO )
-      IF ( INFO.NE.0 ) THEN
-         RETURN
-      END IF
+      IF( WANTQT ) THEN
+         CALL DORGRQ( N, N, R, QT, LDQT, WORK,
+     $                WORK( L + 1 ), LWORK - L, INFO )
+         IF ( INFO.NE.0 ) THEN
+            RETURN
+         END IF
 *
 *     Revert column permutation Π by permuting the rows of Q^T
 *
-      CALL DLAPMT( .FALSE., N, N, QT, LDQT, IWORK )
+         CALL DLAPMT( .FALSE., N, N, QT, LDQT, IWORK )
+      END IF
 *
       WORK( 1 ) = DBLE( LWKOPT )
       RETURN

From fd8c48777415a3e5796e98a1531205aba0f81b97 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Thu, 29 Sep 2016 18:19:29 +0200
Subject: [PATCH 010/101] Use 1-based indexing in DGGQRCS

DLANGE does not use WORK when computing the Frobenius norm.
Nevertheless, this change may avoid future bugs.
---
 SRC/dggqrcs.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 637c66d3d0..f041adedb5 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -458,7 +458,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Compute the Frobenius norm of matrix G
 *
-      GNORM = DLANGE( 'F', M + P, N, G, LDG, WORK( Z ) )
+      GNORM = DLANGE( 'F', M + P, N, G, LDG, WORK( Z + 1 ) )
 *
 *     Get machine precision and set up threshold for determining
 *     the effective numerical rank of the matrix G.

From 7c89211370267f0fdb73d3ba9a997825c62eac23 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Thu, 29 Sep 2016 18:29:40 +0200
Subject: [PATCH 011/101] DGGQRCS: always return optimal workspace

Return the optimal workspace size even if both matrices A, B are zero.
---
 SRC/dggqrcs.f | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index f041adedb5..7d397aa2ff 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -507,6 +507,8 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          IF( WANTQT ) THEN
             CALL DLASET( 'A', N, N, 0.0D0, 1.0D0, QT, LDQT )
          END IF
+*
+         WORK( 1 ) = DBLE( LWKOPT )
          RETURN
       END IF
 *

From d58326f07d285bf16ca43b22d6ba5a59883763bd Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Thu, 29 Sep 2016 18:31:44 +0200
Subject: [PATCH 012/101] DGGQRCS: fix indentation in a multiline statement

---
 SRC/dggqrcs.f | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 7d397aa2ff..66917cd7ca 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -547,12 +547,12 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
       IF ( R.LE.M ) THEN
          CALL DGEMM( 'N', 'N', R, N, R, 1.0D0, G, R,
-     $          A, LDA, 0.0D0, QT( N-R+1, 1 ), LDQT )
+     $               A, LDA, 0.0D0, QT( N-R+1, 1 ), LDQT )
       ELSE
          CALL DGEMM( 'N', 'N', R, N, M, 1.0D0, G( 1, 1 ), R,
-     $          A, LDA, 0.0D0, QT( N-R+1, 1 ), LDQT )
+     $               A, LDA, 0.0D0, QT( N-R+1, 1 ), LDQT )
          CALL DGEMM( 'N', 'N', R, N, R - M, 1.0D0, G( 1, M + 1 ), R,
-     $          B, LDB, 1.0D0, QT( N-R+1, 1 ), LDQT )
+     $               B, LDB, 1.0D0, QT( N-R+1, 1 ), LDQT )
       END IF
 *
 *     Compute the RQ decomposition of V^T R1( 1:R, : )

From b35fd6c070830d1dfa520fd35da58c7799060600 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Thu, 29 Sep 2016 19:42:52 +0200
Subject: [PATCH 013/101] DGGQRCS: compute V^T R1( 1:RANK, : ) correctly

The multiplication uses DGEMM so the lower triangular part of
R1( 1:RANK, : ) must be set to zero.
---
 SRC/dggqrcs.f | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 66917cd7ca..679d7d196f 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -512,13 +512,17 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          RETURN
       END IF
 *
-*     Copy R1 into A
+*     Copy R1 into A and set lower triangular part of A to zero
 *
       IF( R.LE.M ) THEN
           CALL DLACPY( 'U', R, N, G, LDG, A, LDA )
+          CALL DLASET( 'L', R - 1, N, 0.0D0, 0.0D0, A( 2, 1 ), LDA )
       ELSE
           CALL DLACPY( 'U', M, N, G, LDG, A, LDA )
-          CALL DLACPY( 'A', R - M, N, G( M + 1, 1 ), LDG, B, LDB )
+          CALL DLACPY( 'U', R - M, N, G( M + 1, 1 ), LDG, B, LDB )
+*
+          CALL DLASET( 'L', M - 1, N, 0.0D0, 0.0D0, A( 2, 1 ), LDA )
+          CALL DLASET( 'L', R-M-1, N, 0.0D0, 0.0D0, B( 2, 1 ), LDB )
       END IF
 *
 *     Explicitly form Q1 so that we can compute the CS decomposition

From 2d8da3aee9a9d527ab59e44ffe0023404e292d61 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Thu, 29 Sep 2016 20:22:53 +0200
Subject: [PATCH 014/101] DGGQRCS: change location of matrix R

---
 SRC/dggqrcs.f | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 679d7d196f..401db7702f 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -80,8 +80,9 @@
 *>
 *> The routine computes C, S, R, and optionally the orthogonal
 *> transformation matrices U, V and Q. If L <= M, then R is stored in
-*> A(1:L, :) on exit. Otherwise, the first M rows of R are stored in A
-*> and the last L-M rows are stored in B(1:L-M, :).
+*> A(1:L, 1:L) on exit. Otherwise, the first M rows of R are stored in
+*> A(:, 1:L) and R( M+1:, M+1: ) is stored in B(1:L-M, 1:L-M). In both
+*> cases, only the upper triangular part is stored.
 *>
 *> In particular, if B is an N-by-N nonsingular matrix, then the GSVD of
 *> A and B implicitly gives the SVD of A*inv(B):
@@ -570,10 +571,11 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     Copy matrix R from QT( 1:R, N-R+1: ) to A, B
 *
       IF ( R.LE.M ) THEN
-         CALL DLACPY( 'U', R, R, QT( N-R+1, N-R+1 ), LDQT, A, LDA )
+         CALL DLACPY( 'U', R, R, QT( 1, N-R+1 ), LDQT, A, LDA )
       ELSE
-         CALL DLACPY( 'U', M,     R, QT( N-R+1, N-R+1 ), LDQT, A, LDA )
-         CALL DLACPY( 'U', R - M, R, QT( N-R+M+1, N-R+1 ), LDQT, B, LDB)
+         CALL DLACPY( 'U', M,     R, QT( 1, N-R+1 ), LDQT, A, LDA )
+         CALL DLACPY( 'U', R - M, R - M, QT( M + 1, N-R+M+1 ), LDQT,
+     $                B, LDB )
       END IF
 *
 *     Explicitly form Q^T

From 2c83b3762752ea7fe9d2e0209865fc15100bffaf Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Thu, 29 Sep 2016 21:06:12 +0200
Subject: [PATCH 015/101] DGGQRCS: overwrite unused memory with NaNs

Ease debugging
---
 SRC/dggqrcs.f | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 401db7702f..b18732b937 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -335,7 +335,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTQT, LQUERY
       INTEGER            I, J, Z, R, LDG, LWKOPT
-      DOUBLE PRECISION   GNORM, TOL, ULP, UNFL, NORMA, NORMB, BASE
+      DOUBLE PRECISION   GNORM, TOL, ULP, UNFL, NORMA, NORMB, BASE, NAN
 *     .. Local Arrays ..
       DOUBLE PRECISION   G( M + P, N )
 *     ..
@@ -371,6 +371,9 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          G = RESHAPE( WORK(1:Z), (/ M + P, N /) )
       END IF
       LDG = M + P
+*     Computing 0.0 / 0.0 directly causes compiler errors
+      NAN = 1.0D0
+      NAN = 0.0 / (NAN - 1.0D0)
 *
 *     Test the input arguments
 *
@@ -457,6 +460,11 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       CALL DLACPY( 'A', M, N, A, LDA, G( P + 1, 1 ), LDG )
       CALL DLACPY( 'A', P, N, B, LDB, G( 1, 1 ), LDG )
 *
+*     DEBUG
+*
+      CALL DLASET( 'A', M, N, NAN, NAN, A, LDA )
+      CALL DLASET( 'A', P, N, NAN, NAN, B, LDB )
+*
 *     Compute the Frobenius norm of matrix G
 *
       GNORM = DLANGE( 'F', M + P, N, G, LDG, WORK( Z + 1 ) )
@@ -534,6 +542,10 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          RETURN
       END IF
 *
+*     DEBUG
+*
+      THETA(1:N) = NAN
+*
 *     Compute the CS decomposition of Q1( :, 1:R )
 *
       CALL DORCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, R,
@@ -544,10 +556,18 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          RETURN
       END IF
 *
+*     DEBUG
+*
+      WORK(1:LWORK) = NAN
+*
 *     Copy V^T from QT to G
 *
       CALL DLACPY( 'A', R, R, QT, LDQT, G, R )
 *
+*     DEBUG
+*
+      CALL DLASET( 'A', N, N, NAN, NAN, QT, LDQT )
+*
 *     Compute V^T R1( 1:R, : ) in the last R rows of QT
 *
       IF ( R.LE.M ) THEN
@@ -560,6 +580,12 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
      $               B, LDB, 1.0D0, QT( N-R+1, 1 ), LDQT )
       END IF
 *
+*     DEBUG
+*
+      CALL DLASET( 'A', M, N, NAN, NAN, A, LDA )
+      CALL DLASET( 'A', P, N, NAN, NAN, B, LDB )
+      WORK(1:LWORK) = NAN
+*
 *     Compute the RQ decomposition of V^T R1( 1:R, : )
 *
       CALL DGERQF( R, N, QT( N-R+1, 1 ), LDQT, WORK,
@@ -578,6 +604,11 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
      $                B, LDB )
       END IF
 *
+*     DEBUG
+*
+      CALL DLASET( 'U', R, R, NAN, NAN, QT( 1, N-R+1 ), LDQT )
+      WORK( L+1:LWORK ) = NAN
+*
 *     Explicitly form Q^T
 *
       IF( WANTQT ) THEN

From 436931e4be6688bbf41d06c170353b99d352e4e3 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Thu, 29 Sep 2016 21:16:13 +0200
Subject: [PATCH 016/101] DGGQRCS: fix argument to DLACPY

This bug was introduced in commit 446ac9a2.
---
 SRC/dggqrcs.f | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index b18732b937..41bfeb8822 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -597,10 +597,10 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     Copy matrix R from QT( 1:R, N-R+1: ) to A, B
 *
       IF ( R.LE.M ) THEN
-         CALL DLACPY( 'U', R, R, QT( 1, N-R+1 ), LDQT, A, LDA )
+         CALL DLACPY( 'U', R, R, QT( N-R+1, N-R+1 ), LDQT, A, LDA )
       ELSE
-         CALL DLACPY( 'U', M,     R, QT( 1, N-R+1 ), LDQT, A, LDA )
-         CALL DLACPY( 'U', R - M, R - M, QT( M + 1, N-R+M+1 ), LDQT,
+         CALL DLACPY( 'U', M,     R, QT( N-R+1, N-R+1 ), LDQT, A, LDA )
+         CALL DLACPY( 'U', R - M, R - M, QT( N-R+M+1, N-R+M+1 ), LDQT,
      $                B, LDB )
       END IF
 *

From 094c302e09d93354772a962b3f926517a6ff05f8 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Thu, 29 Sep 2016 22:38:18 +0200
Subject: [PATCH 017/101] DGGQRCS: use correct leading dimension

---
 SRC/dggqrcs.f | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 41bfeb8822..18aa4cd6e9 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -562,7 +562,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Copy V^T from QT to G
 *
-      CALL DLACPY( 'A', R, R, QT, LDQT, G, R )
+      CALL DLACPY( 'A', R, R, QT, LDQT, G, LDG )
 *
 *     DEBUG
 *
@@ -571,12 +571,12 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     Compute V^T R1( 1:R, : ) in the last R rows of QT
 *
       IF ( R.LE.M ) THEN
-         CALL DGEMM( 'N', 'N', R, N, R, 1.0D0, G, R,
+         CALL DGEMM( 'N', 'N', R, N, R, 1.0D0, G, LDG,
      $               A, LDA, 0.0D0, QT( N-R+1, 1 ), LDQT )
       ELSE
-         CALL DGEMM( 'N', 'N', R, N, M, 1.0D0, G( 1, 1 ), R,
+         CALL DGEMM( 'N', 'N', R, N, M, 1.0D0, G( 1, 1 ), LDG,
      $               A, LDA, 0.0D0, QT( N-R+1, 1 ), LDQT )
-         CALL DGEMM( 'N', 'N', R, N, R - M, 1.0D0, G( 1, M + 1 ), R,
+         CALL DGEMM( 'N', 'N', R, N, R - M, 1.0D0, G( 1, M + 1 ), LDG,
      $               B, LDB, 1.0D0, QT( N-R+1, 1 ), LDQT )
       END IF
 *

From a25860a86062af1f9b03a12dc824020230cee292 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Fri, 30 Sep 2016 13:31:29 +0200
Subject: [PATCH 018/101] DGGQRCS: fix triangular matrices copies again

This bug was discovered by the random test from commit dcd04cd7.
Minimal triggering example with m=1, n=2, p=1 with non-random entries:
  A = [1, 1],
  B = [1, 0].
---
 SRC/dggqrcs.f | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 18aa4cd6e9..0b2cb098bf 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -528,7 +528,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
           CALL DLASET( 'L', R - 1, N, 0.0D0, 0.0D0, A( 2, 1 ), LDA )
       ELSE
           CALL DLACPY( 'U', M, N, G, LDG, A, LDA )
-          CALL DLACPY( 'U', R - M, N, G( M + 1, 1 ), LDG, B, LDB )
+          CALL DLACPY( 'U', R - M, N - M, G( M+1, M+1 ), LDG, B, LDB )
 *
           CALL DLASET( 'L', M - 1, N, 0.0D0, 0.0D0, A( 2, 1 ), LDA )
           CALL DLASET( 'L', R-M-1, N, 0.0D0, 0.0D0, B( 2, 1 ), LDB )
@@ -576,8 +576,9 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       ELSE
          CALL DGEMM( 'N', 'N', R, N, M, 1.0D0, G( 1, 1 ), LDG,
      $               A, LDA, 0.0D0, QT( N-R+1, 1 ), LDQT )
-         CALL DGEMM( 'N', 'N', R, N, R - M, 1.0D0, G( 1, M + 1 ), LDG,
-     $               B, LDB, 1.0D0, QT( N-R+1, 1 ), LDQT )
+         CALL DGEMM( 'N', 'N', R, N - M, R - M, 1.0D0,
+     $               G( 1, M + 1 ), LDG, B, LDB,
+     $               1.0D0, QT( N-R+1, M+1 ), LDQT )
       END IF
 *
 *     DEBUG

From 1e7e677e9b90bb2d4a11101f07303646f0c8f8d2 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Fri, 30 Sep 2016 17:53:00 +0200
Subject: [PATCH 019/101] DGGQRCS: more accurate comments

---
 SRC/dggqrcs.f | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 0b2cb098bf..95a24915df 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -521,7 +521,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          RETURN
       END IF
 *
-*     Copy R1 into A and set lower triangular part of A to zero
+*     Copy R1( 1:R, : ) into A, B and set lower triangular part to zero
 *
       IF( R.LE.M ) THEN
           CALL DLACPY( 'U', R, N, G, LDG, A, LDA )
@@ -595,7 +595,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          RETURN
       END IF
 *
-*     Copy matrix R from QT( 1:R, N-R+1: ) to A, B
+*     Copy matrix R from QT( N-R+1:N, N-R+1:N ) to A, B
 *
       IF ( R.LE.M ) THEN
          CALL DLACPY( 'U', R, R, QT( N-R+1, N-R+1 ), LDQT, A, LDA )

From 81a282b17b5fc88a9b40af571672c677db5261e5 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Sun, 20 Oct 2019 17:05:00 +0200
Subject: [PATCH 020/101] Add single-precision GSVD via QR+CSD

---
 SRC/CMakeLists.txt |   2 +-
 SRC/sggqrcs.f      | 632 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 633 insertions(+), 1 deletion(-)
 create mode 100644 SRC/sggqrcs.f

diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index 98a35253d8..96d50a136c 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -89,7 +89,7 @@ set(SLASRC
    sgetrf2.f sgetri.f
    sggbak.f sggbal.f
    sgges.f  sgges3.f sggesx.f sggev.f  sggev3.f sggevx.f
-   sggglm.f sgghrd.f sgghd3.f sgglse.f sggqrf.f
+   sggglm.f sgghrd.f sgghd3.f sgglse.f sggqrf.f sggqrcs.f
    sggrqf.f sggsvd3.f sggsvp3.f sgtcon.f sgtrfs.f sgtsv.f
    sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f
    slaqz0.f slaqz1.f slaqz2.f slaqz3.f slaqz4.f
diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
new file mode 100644
index 0000000000..2e9ab3fc90
--- /dev/null
+++ b/SRC/sggqrcs.f
@@ -0,0 +1,632 @@
+*> \brief <b> SGGQRCS computes the singular value decomposition (SVD) for OTHER matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SGGQRCS + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/sggqrcs.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/sggqrcs.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/sggqrcs.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
+*                           A, LDA, B, LDB,
+*                           THETA, U1, LDU1, U2, LDU2, QT, LDQT,
+*                           WORK, LWORK, IWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBU1, JOB2, JOBQT
+*       INTEGER            INFO, LDA, LDB, LDU1, LDU2, LDQT,
+*      $                   M, N, P, L, LWORK
+*       REAL               W
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IWORK( * )
+*       REAL               A( LDA, * ), B( LDB, * ), THETA( * ),
+*      $                   U1( LDU1, * ), U2( LDU2, * ), QT( LDQ, * ),
+*      $                   WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SGGQRCS computes the generalized singular value decomposition (GSVD)
+*> of an M-by-N real matrix A and P-by-N real matrix B:
+*>
+*>       U1**T*A*Q = D1*( 0 R ),    U2**T*B*Q = D2*( 0 R )
+*>
+*> where U1, U2, and Q are orthogonal matrices. SGGQRCS uses the QR
+*> factorization with column pivoting and the 2-by-1 CS decomposition to
+*> compute the GSVD.
+*>
+*> Let L be the effective numerical rank of the matrix (A**T,B**T)**T,
+*> then R is a L-by-L nonsingular upper triangular matrix, D1 and
+*> D2 are M-by-L and P-by-L "diagonal" matrices and of the
+*> following structures, respectively:
+*>
+*>                        K   K1
+*>        D1 =     (  0   0   0 )
+*>              K  (  0   S   0 )
+*>              K1 (  0   0   I )
+*>
+*>                    K2  K
+*>        D2 =  K2 (  I   0   0 )
+*>              K  (  0   C   0 )
+*>                 (  0   0   0 )
+*>
+*>                 N-L  L
+*>   ( 0 R ) = L (  0   R )
+*>
+*> where
+*>
+*>   K  = MIN(M, P, L, M + P - L),
+*>   K1 = MAX(L - P, 0),
+*>   K2 = MAX(L - M, 0),
+*>   C  = diag( COS(THETA(1)), ..., COS(THETA(K)) ),
+*>   S  = diag( SIN(THETA(1)), ..., SIN(THETA(K)) ), and
+*>   C^2 + S^2 = I.
+*>
+*> The routine computes C, S, R, and optionally the orthogonal
+*> transformation matrices U, V and Q. If L <= M, then R is stored in
+*> A(1:L, 1:L) on exit. Otherwise, the first M rows of R are stored in
+*> A(:, 1:L) and R( M+1:, M+1: ) is stored in B(1:L-M, 1:L-M). In both
+*> cases, only the upper triangular part is stored.
+*>
+*> In particular, if B is an N-by-N nonsingular matrix, then the GSVD of
+*> A and B implicitly gives the SVD of A*inv(B):
+*>                      A*inv(B) = U1*(D1*inv(D2))*U2**T.
+*> If (A**T,B**T)**T  has orthonormal columns, then the GSVD of A and B
+*> is also equal to the CS decomposition of A and B. Furthermore, the
+*> GSVD can be used to derive the solution of the eigenvalue problem:
+*>                      A**T*A x = lambda * B**T*B x.
+*> In some literature, the GSVD of A and B is presented in the form
+*>                  U1**T*A*X = ( 0 D1 ),   U2**T*B*X = ( 0 D2 )
+*> where U1 and U2 are orthogonal and X is nonsingular, D1 and D2 are
+*> ``diagonal''.  The former GSVD form can be converted to the latter
+*> form by taking the nonsingular matrix X as
+*>
+*>                      X = Q*( I   0    )
+*>                            ( 0 inv(R) ).
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBU1
+*> \verbatim
+*>          JOBU1 is CHARACTER*1
+*>          = 'Y':  Orthogonal matrix U1 is computed;
+*>          = 'N':  U1 is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBU2
+*> \verbatim
+*>          JOBU2 is CHARACTER*1
+*>          = 'Y':  Orthogonal matrix U2 is computed;
+*>          = 'N':  U2 is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBQT
+*> \verbatim
+*>          JOBQT is CHARACTER*1
+*>          = 'Y':  Orthogonal matrix Q is computed;
+*>          = 'N':  Q is not computed.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrices A and B.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] P
+*> \verbatim
+*>          P is INTEGER
+*>          The number of rows of the matrix B.  P >= 0.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W in REAL
+*>
+*>          On exit, W is a radix power chosen such that the Frobenius
+*>          norm of A and W*B are within sqrt(radix) and 1/sqrt(radix)
+*>          of each other.
+*> \endverbatim
+*>
+*> \param[out] L
+*> \verbatim
+*>          L is INTEGER
+*>          On exit, the effective numerical rank of the matrix
+*>          (A**T, B**T)**T.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N)
+*>          On entry, the M-by-N matrix A.
+*>          On exit, A contains the triangular matrix R or the first M
+*>          rows of R, respectively. See Purpose for details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is REAL array, dimension (LDB,N)
+*>          On entry, the P-by-N matrix B.
+*>          On exit, if L > M, then B contains the last L - M rows of
+*>          the triangular matrix R. See Purpose for details.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B. LDB >= max(1,P).
+*> \endverbatim
+*>
+*> \param[out] THETA
+*> \verbatim
+*>          THETA is REAL array, dimension (N)
+*>
+*>          On exit, THETA contains K = MIN(M, P, L, M + P - L) values
+*>          in radians in ascending order.
+*> \endverbatim
+*>
+*> \param[out] U1
+*> \verbatim
+*>          U1 is REAL array, dimension (LDU1,M)
+*>          If JOBU1 = 'Y', U1 contains the M-by-M orthogonal matrix U1.
+*>          If JOBU1 = 'N', U1 is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDU1
+*> \verbatim
+*>          LDU1 is INTEGER
+*>          The leading dimension of the array U1. LDU1 >= max(1,M) if
+*>          JOBU1 = 'Y'; LDU1 >= 1 otherwise.
+*> \endverbatim
+*>
+*> \param[out] U2
+*> \verbatim
+*>          U2 is REAL array, dimension (LDU2,P)
+*>          If JOBU2 = 'Y', U2 contains the P-by-P orthogonal matrix U2.
+*>          If JOBU2 = 'N', U2 is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDU2
+*> \verbatim
+*>          LDU2 is INTEGER
+*>          The leading dimension of the array U2. LDU2 >= max(1,P) if
+*>          JOBU2 = 'Y'; LDU2 >= 1 otherwise.
+*> \endverbatim
+*>
+*> \param[out] QT
+*> \verbatim
+*>          QT is REAL array, dimension (LDQT,N)
+*>          If JOBQT = 'Y', QT contains the N-by-N orthogonal matrix
+*>          Q**T.
+*> \endverbatim
+*>
+*> \param[in] LDQT
+*> \verbatim
+*>          LDQT is INTEGER
+*>          The leading dimension of the array QT. LDQT >= max(1,N) if
+*>          JOBQT = 'Y'; LDQT >= 1 otherwise.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal size of the WORK array,
+*>          returns this value as the first entry of the WORK array, and
+*>          no error message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (M + N + P)
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>          > 0:  DBBCSD did not converge. For further details, see
+*>                subroutine DORCSDBY1.
+*> \endverbatim
+*
+*> \par Internal Parameters:
+*  =========================
+*>
+*> \verbatim
+*>  TOL     REAL
+*>          Let G = (A**T,B**T)**T. TOL is the threshold to determine
+*>          the effective rank of G. Generally, it is set to
+*>                   TOL = MAX(M,P,N) * norm(G) * MACHEPS,
+*>          where norm(G) is the Frobenius norm of G.
+*>          The size of TOL may affect the size of backward error of the
+*>          decomposition.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Christoph Conrads (https://christoph-conrads.name)
+*
+*> \date October 2019
+*
+*> \ingroup realGEsing
+*
+*> \par Contributors:
+*  ==================
+*>
+*>     Christoph Conrads (https://christoph-conrads.name)
+*>
+*
+*> \par Further Details:
+*  =====================
+*>
+*>  SGGQRCS should be significantly faster than DGGSVD and DGGSVD3 for
+*>  large matrices because the matrices A and B are reduced to a pair of
+*>  well-conditioned bidiagonal matrices instead of pairs of upper
+*>  triangular matrices. On the downside, SGGQRCS requires a much larger
+*>  workspace whose dimension must be queried at run-time.
+*>
+*  =====================================================================
+      SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
+     $                    A, LDA, B, LDB,
+     $                    THETA, U1, LDU1, U2, LDU2, QT, LDQT,
+     $                    WORK, LWORK, IWORK, INFO )
+*
+*  -- LAPACK driver routine (version 3.7.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
+*     September 2016
+*
+      IMPLICIT NONE
+*     .. Scalar Arguments ..
+      CHARACTER          JOBU1, JOBU2, JOBQT
+      INTEGER            INFO, LDA, LDB, LDU1, LDU2, LDQT,
+     $                   L, M, N, P, LWORK
+      REAL               W
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * )
+      REAL               A( LDA, * ), B( LDB, * ), THETA( * ),
+     $                   U1( LDU1, * ), U2( LDU2, * ), QT( LDQT, * ),
+     $                   WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Local Scalars ..
+      LOGICAL            WANTU1, WANTU2, WANTQT, LQUERY
+      INTEGER            I, J, Z, R, LDG, LWKOPT
+      REAL               GNORM, TOL, ULP, UNFL, NORMA, NORMB, BASE, NAN
+*     .. Local Arrays ..
+      REAL               G( M + P, N )
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      REAL               SLAMCH, SLANGE
+      EXTERNAL           LSAME, SLAMCH, SLANGE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SLACPY, SLASCL, SGEQP3, SORGQR, SGERQF, SORGRQ,
+     $                   SORCSD2BY1, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Decode and test the input parameters
+*
+      WANTU1 = LSAME( JOBU1, 'Y' )
+      WANTU2 = LSAME( JOBU2, 'Y' )
+      WANTQT = LSAME( JOBQT, 'Y' )
+      LQUERY = ( LWORK.EQ.-1 )
+      LWKOPT = 1
+*
+*     Initialize variables
+*
+      L = MIN( M + P, N )
+      Z = ( M + P ) * N
+      IF ( LQUERY ) THEN
+         G = 0
+      ELSE
+         G = RESHAPE( WORK(1:Z), (/ M + P, N /) )
+      END IF
+      LDG = M + P
+*     Computing 0.0 / 0.0 directly causes compiler errors
+      NAN = 1.0E0
+      NAN = 0.0 / (NAN - 1.0E0)
+*
+*     Test the input arguments
+*
+      INFO = 0
+      IF( .NOT.( WANTU1 .OR. LSAME( JOBU1, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( WANTU2 .OR. LSAME( JOBU2, 'N' ) ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( WANTQT .OR. LSAME( JOBQT, 'N' ) ) ) THEN
+         INFO = -3
+      ELSE IF( M.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -5
+      ELSE IF( P.LT.0 ) THEN
+         INFO = -6
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -10
+      ELSE IF( LDB.LT.MAX( 1, P ) ) THEN
+         INFO = -12
+      ELSE IF( LDU1.LT.1 .OR. ( WANTU1 .AND. LDU1.LT.M ) ) THEN
+         INFO = -15
+      ELSE IF( LDU2.LT.1 .OR. ( WANTU2 .AND. LDU2.LT.P ) ) THEN
+         INFO = -17
+      ELSE IF( LDQT.LT.1 .OR. ( WANTQT .AND. LDQT.LT.N ) ) THEN
+         INFO = -19
+      ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
+         INFO = -23
+      END IF
+*
+*     Compute workspace
+*
+      IF( INFO.EQ.0 ) THEN
+         CALL SGEQP3( M+P, N, G, LDG, IWORK, THETA, WORK, -1, INFO )
+         LWKOPT = INT( WORK( 1 ) )
+
+         CALL SORGQR( M + P, L, L, G, LDG, THETA, WORK, -1, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
+
+         CALL SORCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, L,
+     $                    G, LDG, G, LDG,
+     $                    THETA, U2, LDU2, U1, LDU1, QT, LDQT,
+     $                    WORK, -1, IWORK, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
+         LWKOPT = Z + LWKOPT
+
+*        DGERQF stores L scalar factors for the elementary reflectors
+         CALL SGERQF( L, N, QT, LDQT, WORK, WORK, -1, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
+
+         CALL SORGRQ( N, N, L, QT, LDQT, WORK, WORK, -1, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
+
+         WORK( 1 ) = REAL( LWKOPT )
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SGGQRCS', -INFO )
+         RETURN
+      END IF
+      IF( LQUERY ) THEN
+         RETURN
+      ENDIF
+*
+*     Scale matrix B such that norm(A) \approx norm(B)
+*
+      NORMA = SLANGE( 'F', M, N, A, LDA, WORK )
+      NORMB = SLANGE( 'F', P, N, B, LDB, WORK )
+*
+      IF ( NORMB.EQ.0 ) THEN
+         W = 1.0E0
+      ELSE
+         BASE = SLAMCH( 'B' )
+         W = BASE ** INT( LOG( NORMA / NORMB ) / LOG( BASE ) )
+*
+         CALL SLASCL( 'G', -1, -1, 1.0E0, W, P, N, B, LDB, INFO )
+         IF ( INFO.NE.0 ) THEN
+            RETURN
+         END IF
+      END IF
+*
+*     Copy matrices A, B into the (M+P) x n matrix G
+*
+      CALL SLACPY( 'A', M, N, A, LDA, G( P + 1, 1 ), LDG )
+      CALL SLACPY( 'A', P, N, B, LDB, G( 1, 1 ), LDG )
+*
+*     DEBUG
+*
+      CALL SLASET( 'A', M, N, NAN, NAN, A, LDA )
+      CALL SLASET( 'A', P, N, NAN, NAN, B, LDB )
+*
+*     Compute the Frobenius norm of matrix G
+*
+      GNORM = SLANGE( 'F', M + P, N, G, LDG, WORK( Z + 1 ) )
+*
+*     Get machine precision and set up threshold for determining
+*     the effective numerical rank of the matrix G.
+*
+      ULP = SLAMCH( 'Precision' )
+      UNFL = SLAMCH( 'Safe Minimum' )
+      TOL = MAX( M + P, N ) * MAX( GNORM, UNFL ) * ULP
+*
+*     IWORK stores the column permutations computed by DGEQP3.
+*     Columns J where IWORK( J ) is non-zero are permuted to the front
+*     so we set the all entries to zero here.
+*
+      DO 10 J = 1, N
+         IWORK( J ) = 0
+   10 CONTINUE
+*
+*     Compute the QR factorization with column pivoting GΠ = Q1 R1
+*
+      CALL SGEQP3( M + P, N, G, LDG, IWORK, THETA,
+     $             WORK( Z + 1 ), LWORK - Z, INFO )
+      IF( INFO.NE.0 ) THEN
+         RETURN
+      END IF
+*
+*     Determine the rank of G
+*
+      R = 0
+      DO 20 I = 1, MIN( M + P, N )
+         IF( ABS( G( I, I ) ).LE.TOL ) THEN
+            EXIT
+         END IF
+         R = R + 1
+   20 CONTINUE
+*
+      L = R
+*
+*     Handle rank=0 case
+*
+      IF( L.EQ.0 ) THEN
+         IF( WANTU1 ) THEN
+            CALL SLASET( 'A', M, M, 0.0E0, 1.0E0, U1, LDU1 )
+         END IF
+         IF( WANTU2 ) THEN
+            CALL SLASET( 'A', P, P, 0.0E0, 1.0E0, U2, LDU2 )
+         END IF
+         IF( WANTQT ) THEN
+            CALL SLASET( 'A', N, N, 0.0E0, 1.0E0, QT, LDQT )
+         END IF
+*
+         WORK( 1 ) = REAL( LWKOPT )
+         RETURN
+      END IF
+*
+*     Copy R1( 1:R, : ) into A, B and set lower triangular part to zero
+*
+      IF( R.LE.M ) THEN
+          CALL SLACPY( 'U', R, N, G, LDG, A, LDA )
+          CALL SLASET( 'L', R - 1, N, 0.0E0, 0.0E0, A( 2, 1 ), LDA )
+      ELSE
+          CALL SLACPY( 'U', M, N, G, LDG, A, LDA )
+          CALL SLACPY( 'U', R - M, N - M, G( M+1, M+1 ), LDG, B, LDB )
+*
+          CALL SLASET( 'L', M - 1, N, 0.0E0, 0.0E0, A( 2, 1 ), LDA )
+          CALL SLASET( 'L', R-M-1, N, 0.0E0, 0.0E0, B( 2, 1 ), LDB )
+      END IF
+*
+*     Explicitly form Q1 so that we can compute the CS decomposition
+*
+      CALL SORGQR( M + P, R, R, G, LDG, THETA,
+     $             WORK( Z + 1 ), LWORK - Z, INFO )
+      IF ( INFO.NE.0 ) THEN
+         RETURN
+      END IF
+*
+*     DEBUG
+*
+      THETA(1:N) = NAN
+*
+*     Compute the CS decomposition of Q1( :, 1:R )
+*
+      CALL SORCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, R,
+     $                 G( 1, 1 ), LDG, G( P + 1, 1 ), LDG, THETA,
+     $                 U2, LDU2, U1, LDU1, QT, LDQT,
+     $                 WORK( Z + 1 ), LWORK - Z, IWORK( N + 1 ), INFO )
+      IF( INFO.NE.0 ) THEN
+         RETURN
+      END IF
+*
+*     DEBUG
+*
+      WORK(1:LWORK) = NAN
+*
+*     Copy V^T from QT to G
+*
+      CALL SLACPY( 'A', R, R, QT, LDQT, G, LDG )
+*
+*     DEBUG
+*
+      CALL SLASET( 'A', N, N, NAN, NAN, QT, LDQT )
+*
+*     Compute V^T R1( 1:R, : ) in the last R rows of QT
+*
+      IF ( R.LE.M ) THEN
+         CALL SGEMM( 'N', 'N', R, N, R, 1.0E0, G, LDG,
+     $               A, LDA, 0.0E0, QT( N-R+1, 1 ), LDQT )
+      ELSE
+         CALL SGEMM( 'N', 'N', R, N, M, 1.0E0, G( 1, 1 ), LDG,
+     $               A, LDA, 0.0E0, QT( N-R+1, 1 ), LDQT )
+         CALL SGEMM( 'N', 'N', R, N - M, R - M, 1.0E0,
+     $               G( 1, M + 1 ), LDG, B, LDB,
+     $               1.0E0, QT( N-R+1, M+1 ), LDQT )
+      END IF
+*
+*     DEBUG
+*
+      CALL SLASET( 'A', M, N, NAN, NAN, A, LDA )
+      CALL SLASET( 'A', P, N, NAN, NAN, B, LDB )
+      WORK(1:LWORK) = NAN
+*
+*     Compute the RQ decomposition of V^T R1( 1:R, : )
+*
+      CALL SGERQF( R, N, QT( N-R+1, 1 ), LDQT, WORK,
+     $             WORK( L + 1 ), LWORK - L, INFO )
+      IF ( INFO.NE.0 ) THEN
+         RETURN
+      END IF
+*
+*     Copy matrix R from QT( N-R+1:N, N-R+1:N ) to A, B
+*
+      IF ( R.LE.M ) THEN
+         CALL SLACPY( 'U', R, R, QT( N-R+1, N-R+1 ), LDQT, A, LDA )
+      ELSE
+         CALL SLACPY( 'U', M,     R, QT( N-R+1, N-R+1 ), LDQT, A, LDA )
+         CALL SLACPY( 'U', R - M, R - M, QT( N-R+M+1, N-R+M+1 ), LDQT,
+     $                B, LDB )
+      END IF
+*
+*     DEBUG
+*
+      CALL SLASET( 'U', R, R, NAN, NAN, QT( 1, N-R+1 ), LDQT )
+      WORK( L+1:LWORK ) = NAN
+*
+*     Explicitly form Q^T
+*
+      IF( WANTQT ) THEN
+         CALL SORGRQ( N, N, R, QT, LDQT, WORK,
+     $                WORK( L + 1 ), LWORK - L, INFO )
+         IF ( INFO.NE.0 ) THEN
+            RETURN
+         END IF
+*
+*     Revert column permutation Π by permuting the rows of Q^T
+*
+         CALL SLAPMT( .FALSE., N, N, QT, LDQT, IWORK )
+      END IF
+*
+      WORK( 1 ) = REAL( LWKOPT )
+      RETURN
+*
+*     End of SGGQRCS
+*
+      END

From 957e2175d5b4a2cfe1bd57c0f4890435575322b4 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Sun, 20 Oct 2019 17:05:03 +0200
Subject: [PATCH 021/101] DGGQRCS: fix typos

---
 SRC/dggqrcs.f | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 95a24915df..59e9875bf5 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -145,7 +145,7 @@
 *>
 *> \param[out] W
 *> \verbatim
-*>          W in DOUBLE PRECISION
+*>          W is DOUBLE PRECISION
 *>
 *>          On exit, W is a radix power chosen such that the Frobenius
 *>          norm of A and W*B are within sqrt(radix) and 1/sqrt(radix)
@@ -544,7 +544,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     DEBUG
 *
-      THETA(1:N) = NAN
+      THETA(1:L) = NAN
 *
 *     Compute the CS decomposition of Q1( :, 1:R )
 *

From f683c7904110c7464d77a14a014ed00158628556 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Sun, 20 Oct 2019 17:05:07 +0200
Subject: [PATCH 022/101] CGGQRCS: draft complex (2x32bit) GSVD via QR, CSD

---
 SRC/CMakeLists.txt |   2 +-
 SRC/cggqrcs.f      | 665 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 666 insertions(+), 1 deletion(-)
 create mode 100644 SRC/cggqrcs.f

diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index 96d50a136c..627954d906 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -180,7 +180,7 @@ set(CLASRC
    cgetri.f
    cggbak.f cggbal.f
    cgges.f  cgges3.f cggesx.f cggev.f  cggev3.f cggevx.f
-   cggglm.f cgghrd.f cgghd3.f cgglse.f cggqrf.f cggrqf.f
+   cggglm.f cgghrd.f cgghd3.f cgglse.f cggqrf.f cggrqf.f cggqrcs.f
    cggsvd3.f cggsvp3.f
    cgtcon.f cgtrfs.f cgtsv.f  cgtsvx.f cgttrf.f cgttrs.f cgtts2.f chbev.f
    chbevd.f chbevx.f chbgst.f chbgv.f  chbgvd.f chbgvx.f chbtrd.f
diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
new file mode 100644
index 0000000000..18a79aae35
--- /dev/null
+++ b/SRC/cggqrcs.f
@@ -0,0 +1,665 @@
+*> \brief <b> CGGQRCS computes the singular value decomposition (SVD) for OTHER matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CGGQRCS + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/sggqrcs.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/sggqrcs.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/sggqrcs.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
+*                           A, LDA, B, LDB,
+*                           THETA, U1, LDU1, U2, LDU2, QT, LDQT,
+*                           WORK, LWORK, RWORK, LRWORK, IWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBU1, JOB2, JOBQT
+*       INTEGER            INFO, LDA, LDB, LDU1, LDU2, LDQT,
+*      $                   M, N, P, L, LWORK, LRWORK
+*       REAL               W
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IWORK( * ), RWORK( * )
+*       COMPLEX            A( LDA, * ), B( LDB, * ), THETA( * ),
+*      $                   U1( LDU1, * ), U2( LDU2, * ), QT( LDQ, * ),
+*      $                   WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CGGQRCS computes the generalized singular value decomposition (GSVD)
+*> of an M-by-N real matrix A and P-by-N real matrix B:
+*>
+*>       U1**T*A*Q = D1*( 0 R ),    U2**T*B*Q = D2*( 0 R )
+*>
+*> where U1, U2, and Q are orthogonal matrices. CGGQRCS uses the QR
+*> factorization with column pivoting and the 2-by-1 CS decomposition to
+*> compute the GSVD.
+*>
+*> Let L be the effective numerical rank of the matrix (A**T,B**T)**T,
+*> then R is a L-by-L nonsingular upper triangular matrix, D1 and
+*> D2 are M-by-L and P-by-L "diagonal" matrices and of the
+*> following structures, respectively:
+*>
+*>                        K   K1
+*>        D1 =     (  0   0   0 )
+*>              K  (  0   S   0 )
+*>              K1 (  0   0   I )
+*>
+*>                    K2  K
+*>        D2 =  K2 (  I   0   0 )
+*>              K  (  0   C   0 )
+*>                 (  0   0   0 )
+*>
+*>                 N-L  L
+*>   ( 0 R ) = L (  0   R )
+*>
+*> where
+*>
+*>   K  = MIN(M, P, L, M + P - L),
+*>   K1 = MAX(L - P, 0),
+*>   K2 = MAX(L - M, 0),
+*>   C  = diag( COS(THETA(1)), ..., COS(THETA(K)) ),
+*>   S  = diag( SIN(THETA(1)), ..., SIN(THETA(K)) ), and
+*>   C^2 + S^2 = I.
+*>
+*> The routine computes C, S, R, and optionally the orthogonal
+*> transformation matrices U, V and Q. If L <= M, then R is stored in
+*> A(1:L, 1:L) on exit. Otherwise, the first M rows of R are stored in
+*> A(:, 1:L) and R( M+1:, M+1: ) is stored in B(1:L-M, 1:L-M). In both
+*> cases, only the upper triangular part is stored.
+*>
+*> In particular, if B is an N-by-N nonsingular matrix, then the GSVD of
+*> A and B implicitly gives the SVD of A*inv(B):
+*>                      A*inv(B) = U1*(D1*inv(D2))*U2**T.
+*> If (A**T,B**T)**T  has orthonormal columns, then the GSVD of A and B
+*> is also equal to the CS decomposition of A and B. Furthermore, the
+*> GSVD can be used to derive the solution of the eigenvalue problem:
+*>                      A**T*A x = lambda * B**T*B x.
+*> In some literature, the GSVD of A and B is presented in the form
+*>                  U1**T*A*X = ( 0 D1 ),   U2**T*B*X = ( 0 D2 )
+*> where U1 and U2 are orthogonal and X is nonsingular, D1 and D2 are
+*> ``diagonal''.  The former GSVD form can be converted to the latter
+*> form by taking the nonsingular matrix X as
+*>
+*>                      X = Q*( I   0    )
+*>                            ( 0 inv(R) ).
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBU1
+*> \verbatim
+*>          JOBU1 is CHARACTER*1
+*>          = 'Y':  Orthogonal matrix U1 is computed;
+*>          = 'N':  U1 is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBU2
+*> \verbatim
+*>          JOBU2 is CHARACTER*1
+*>          = 'Y':  Orthogonal matrix U2 is computed;
+*>          = 'N':  U2 is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBQT
+*> \verbatim
+*>          JOBQT is CHARACTER*1
+*>          = 'Y':  Orthogonal matrix Q is computed;
+*>          = 'N':  Q is not computed.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrices A and B.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] P
+*> \verbatim
+*>          P is INTEGER
+*>          The number of rows of the matrix B.  P >= 0.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL
+*>
+*>          On exit, W is a radix power chosen such that the Frobenius
+*>          norm of A and W*B are within sqrt(radix) and 1/sqrt(radix)
+*>          of each other.
+*> \endverbatim
+*>
+*> \param[out] L
+*> \verbatim
+*>          L is INTEGER
+*>          On exit, the effective numerical rank of the matrix
+*>          (A**T, B**T)**T.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>          On entry, the M-by-N matrix A.
+*>          On exit, A contains the triangular matrix R or the first M
+*>          rows of R, respectively. See Purpose for details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is COMPLEX array, dimension (LDB,N)
+*>          On entry, the P-by-N matrix B.
+*>          On exit, if L > M, then B contains the last L - M rows of
+*>          the triangular matrix R. See Purpose for details.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B. LDB >= max(1,P).
+*> \endverbatim
+*>
+*> \param[out] THETA
+*> \verbatim
+*>          THETA is REAL array, dimension (N)
+*>
+*>          On exit, THETA contains K = MIN(M, P, L, M + P - L) values
+*>          in radians in ascending order.
+*> \endverbatim
+*>
+*> \param[out] U1
+*> \verbatim
+*>          U1 is COMPLEX array, dimension (LDU1,M)
+*>          If JOBU1 = 'Y', U1 contains the M-by-M orthogonal matrix U1.
+*>          If JOBU1 = 'N', U1 is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDU1
+*> \verbatim
+*>          LDU1 is INTEGER
+*>          The leading dimension of the array U1. LDU1 >= max(1,M) if
+*>          JOBU1 = 'Y'; LDU1 >= 1 otherwise.
+*> \endverbatim
+*>
+*> \param[out] U2
+*> \verbatim
+*>          U2 is COMPLEX array, dimension (LDU2,P)
+*>          If JOBU2 = 'Y', U2 contains the P-by-P orthogonal matrix U2.
+*>          If JOBU2 = 'N', U2 is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDU2
+*> \verbatim
+*>          LDU2 is INTEGER
+*>          The leading dimension of the array U2. LDU2 >= max(1,P) if
+*>          JOBU2 = 'Y'; LDU2 >= 1 otherwise.
+*> \endverbatim
+*>
+*> \param[out] QT
+*> \verbatim
+*>          QT is COMPLEX array, dimension (LDQT,N)
+*>          If JOBQT = 'Y', QT contains the N-by-N orthogonal matrix
+*>          Q**T.
+*> \endverbatim
+*>
+*> \param[in] LDQT
+*> \verbatim
+*>          LDQT is INTEGER
+*>          The leading dimension of the array QT. LDQT >= max(1,N) if
+*>          JOBQT = 'Y'; LDQT >= 1 otherwise.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal size of the WORK array,
+*>          returns this value as the first entry of the WORK array, and
+*>          no error message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (MAX(1,LRWORK))
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*>          LRWORK is INTEGER
+*>          The dimension of the array RWORK.
+*>
+*>          If LRWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the RWORK array, returns
+*>          this value as the first entry of the work array, and no error
+*>          message related to LRWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (M + N + P)
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>          > 0:  CBBCSD did not converge. For further details, see
+*>                subroutine CUNCSDBY1.
+*> \endverbatim
+*
+*> \par Internal Parameters:
+*  =========================
+*>
+*> \verbatim
+*>  TOL     REAL
+*>          Let G = (A**T,B**T)**T. TOL is the threshold to determine
+*>          the effective rank of G. Generally, it is set to
+*>                   TOL = MAX(M,P,N) * norm(G) * MACHEPS,
+*>          where norm(G) is the Frobenius norm of G.
+*>          The size of TOL may affect the size of backward error of the
+*>          decomposition.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Christoph Conrads (https://christoph-conrads.name)
+*
+*> \date October 2019
+*
+*> \ingroup realGEsing
+*
+*> \par Contributors:
+*  ==================
+*>
+*>     Christoph Conrads (https://christoph-conrads.name)
+*>
+*
+*> \par Further Details:
+*  =====================
+*>
+*>  CGGQRCS should be significantly faster than CGGSVD and CGGSVD3 for
+*>  large matrices because the matrices A and B are reduced to a pair of
+*>  well-conditioned bidiagonal matrices instead of pairs of upper
+*>  triangular matrices. On the downside, CGGQRCS requires a much larger
+*>  workspace whose dimension must be queried at run-time.
+*>
+*  =====================================================================
+      SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
+     $                    A, LDA, B, LDB,
+     $                    THETA, U1, LDU1, U2, LDU2, QT, LDQT,
+     $                    WORK, LWORK, RWORK, LRWORK, IWORK, INFO )
+*
+*  -- LAPACK driver routine (version 3.X.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
+*     <DATE>
+*
+      IMPLICIT NONE
+*     .. Scalar Arguments ..
+      CHARACTER          JOBU1, JOBU2, JOBQT
+      INTEGER            INFO, LDA, LDB, LDU1, LDU2, LDQT,
+     $                   L, M, N, P, LWORK, LRWORK
+      COMPLEX            W
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * )
+      REAL               RWORK( * )
+      COMPLEX            A( LDA, * ), B( LDB, * ), THETA( * ),
+     $                   U1( LDU1, * ), U2( LDU2, * ), QT( LDQT, * ),
+     $                   WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Local Scalars ..
+      LOGICAL            WANTU1, WANTU2, WANTQT, LQUERY
+      INTEGER            I, J, Z, R, LDG, LWKOPT, LRWKOPT
+      REAL               GNORM, TOL, ULP, UNFL, NORMA, NORMB, BASE, NAN
+      COMPLEX            ZERO, ONE, CNAN
+*     .. Local Arrays ..
+      COMPLEX            G( M + P, N ), TAU( MIN( M + P, N ) )
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      REAL               SLAMCH, CLANGE
+      EXTERNAL           LSAME, SLAMCH, SLANGE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CLACPY, CLASCL, CGEQP3, CUNGQR, CGERQF, CUNGRQ,
+     $                   CUNCSD2BY1, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Decode and test the input parameters
+*
+      WANTU1 = LSAME( JOBU1, 'Y' )
+      WANTU2 = LSAME( JOBU2, 'Y' )
+      WANTQT = LSAME( JOBQT, 'Y' )
+      LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 )
+      LWKOPT = 1
+      LRWKOPT = 2*N
+*
+*     Initialize variables
+*
+      L = MIN( M + P, N )
+      Z = ( M + P ) * N
+      IF ( LQUERY ) THEN
+         G = 0
+      ELSE
+         G = RESHAPE( WORK(1:Z), (/ M + P, N /) )
+      END IF
+      TAU = WORK( Z + 1 )
+      LDG = M + P
+      ZERO = (0.0E0, 0.0E0)
+      ONE = (1.0E0, 0.0E0)
+*     Computing 0.0 / 0.0 directly causes compiler errors
+      NAN = 1.0E0
+      NAN = 0.0 / (NAN - 1.0E0)
+      CNAN = CMPLX(NAN,NAN)
+*
+*     Test the input arguments
+*
+      INFO = 0
+      IF( .NOT.( WANTU1 .OR. LSAME( JOBU1, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( WANTU2 .OR. LSAME( JOBU2, 'N' ) ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( WANTQT .OR. LSAME( JOBQT, 'N' ) ) ) THEN
+         INFO = -3
+      ELSE IF( M.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -5
+      ELSE IF( P.LT.0 ) THEN
+         INFO = -6
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -10
+      ELSE IF( LDB.LT.MAX( 1, P ) ) THEN
+         INFO = -12
+      ELSE IF( LDU1.LT.1 .OR. ( WANTU1 .AND. LDU1.LT.M ) ) THEN
+         INFO = -15
+      ELSE IF( LDU2.LT.1 .OR. ( WANTU2 .AND. LDU2.LT.P ) ) THEN
+         INFO = -17
+      ELSE IF( LDQT.LT.1 .OR. ( WANTQT .AND. LDQT.LT.N ) ) THEN
+         INFO = -19
+      ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
+         INFO = -23
+      ELSE IF( LRWORK.LT.2*N .AND. .NOT.LQUERY ) THEN
+         INFO = -25
+      END IF
+*
+*     Compute workspace
+*
+      IF( INFO.EQ.0 ) THEN
+         CALL CGEQP3( M+P, N, G, LDG, IWORK, TAU,
+     $                WORK, -1, RWORK, INFO )
+         LWKOPT = INT( WORK( 1 ) )
+
+         CALL CUNGQR( M + P, L, L, G, LDG, TAU, WORK, -1, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
+
+         CALL CUNCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, L,
+     $                    G, LDG, G, LDG,
+     $                    THETA, U2, LDU2, U1, LDU1, QT, LDQT,
+     $                    WORK, -1, RWORK, LRWORK, IWORK, INFO )
+*        Add L to LWKOPT for xGEQP3, xUNGQR because of array TAU
+         LWKOPT = MAX( LWKOPT + L, INT( WORK( 1 ) ) )
+         LWKOPT = Z + LWKOPT
+         LRWKOPT = MAX( 2*N, INT( RWORK( 1 ) ) )
+
+*        DGERQF stores L scalar factors for the elementary reflectors
+         CALL CGERQF( L, N, QT, LDQT, TAU, WORK, -1, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
+
+         CALL CUNGRQ( N, N, L, QT, LDQT, TAU, WORK, -1, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
+
+         WORK( 1 ) = CMPLX( REAL( LWKOPT ), 0.0E0 )
+         RWORK( 1 ) = REAL( LRWKOPT )
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CGGQRCS', -INFO )
+         RETURN
+      END IF
+      IF( LQUERY ) THEN
+         RETURN
+      ENDIF
+*
+*     Scale matrix B such that norm(A) \approx norm(B)
+*
+      NORMA = CLANGE( 'F', M, N, A, LDA, RWORK )
+      NORMB = CLANGE( 'F', P, N, B, LDB, RWORK )
+*
+      IF ( NORMB.EQ.0 ) THEN
+         W = 1.0E0
+      ELSE
+         BASE = SLAMCH( 'B' )
+         W = BASE ** INT( LOG( NORMA / NORMB ) / LOG( BASE ) )
+*
+         CALL CLASCL( 'G', -1, -1, 1.0E0, W, P, N, B, LDB, INFO )
+         IF ( INFO.NE.0 ) THEN
+            RETURN
+         END IF
+      END IF
+*
+*     Copy matrices A, B into the (M+P) x n matrix G
+*
+      CALL CLACPY( 'A', M, N, A, LDA, G( P + 1, 1 ), LDG )
+      CALL CLACPY( 'A', P, N, B, LDB, G( 1, 1 ), LDG )
+*
+*     DEBUG
+*
+      CALL CLASET( 'A', M, N, CNAN, CNAN, A, LDA )
+      CALL CLASET( 'A', P, N, CNAN, CNAN, B, LDB )
+*
+*     Compute the Frobenius norm of matrix G
+*
+      GNORM = CLANGE( 'F', M + P, N, G, LDG, WORK( Z + 1 ) )
+*
+*     Get machine precision and set up threshold for determining
+*     the effective numerical rank of the matrix G.
+*
+      ULP = SLAMCH( 'Precision' )
+      UNFL = SLAMCH( 'Safe Minimum' )
+      TOL = MAX( M + P, N ) * MAX( GNORM, UNFL ) * ULP
+*
+*     IWORK stores the column permutations computed by CGEQP3.
+*     Columns J where IWORK( J ) is non-zero are permuted to the front
+*     so we set the all entries to zero here.
+*
+      DO 10 J = 1, N
+         IWORK( J ) = 0
+   10 CONTINUE
+*
+*     Compute the QR factorization with column pivoting GΠ = Q1 R1
+*
+      CALL CGEQP3( M + P, N, G, LDG, IWORK, TAU,
+     $             WORK( Z + 1 ), LWORK - Z, RWORK, INFO )
+      IF( INFO.NE.0 ) THEN
+         RETURN
+      END IF
+*
+*     Determine the rank of G
+*
+      R = 0
+      DO 20 I = 1, MIN( M + P, N )
+         IF( ABS( G( I, I ) ).LE.TOL ) THEN
+            EXIT
+         END IF
+         R = R + 1
+   20 CONTINUE
+*
+*     Handle rank=0 case
+*
+      IF( R.EQ.0 ) THEN
+         IF( WANTU1 ) THEN
+            CALL SLASET( 'A', M, M, ZERO, ONE, U1, LDU1 )
+         END IF
+         IF( WANTU2 ) THEN
+            CALL SLASET( 'A', P, P, ZERO, ONE, U2, LDU2 )
+         END IF
+         IF( WANTQT ) THEN
+            CALL SLASET( 'A', N, N, ZERO, ONE, QT, LDQT )
+         END IF
+*
+         WORK( 1 ) = CMPLX( REAL(LWKOPT), 0.0E0 )
+         RWORK( 1 ) = REAL(LRWKOPT)
+         RETURN
+      END IF
+*
+*     Copy R1( 1:R, : ) into A, B and set lower triangular part to zero
+*
+      IF( R.LE.M ) THEN
+          CALL CLACPY( 'U', R, N, G, LDG, A, LDA )
+          CALL CLASET( 'L', R - 1, N, ZERO, ZERO, A( 2, 1 ), LDA )
+      ELSE
+          CALL CLACPY( 'U', M, N, G, LDG, A, LDA )
+          CALL CLACPY( 'U', R - M, N - M, G( M+1, M+1 ), LDG, B, LDB )
+*
+          CALL CLASET( 'L', M - 1, N, ZERO, ZERO, A( 2, 1 ), LDA )
+          CALL CLASET( 'L', R-M-1, N, ZERO, ZERO, B( 2, 1 ), LDB )
+      END IF
+*
+*     Explicitly form Q1 so that we can compute the CS decomposition
+*
+      CALL CUNGQR( M + P, R, R, G, LDG, TAU,
+     $             WORK( Z + 1 ), LWORK - Z, INFO )
+      IF ( INFO.NE.0 ) THEN
+         RETURN
+      END IF
+*
+*     DEBUG
+*
+      TAU(:) = CNAN
+*
+*     Compute the CS decomposition of Q1( :, 1:R )
+*
+      CALL CUNCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, R,
+     $                 G( 1, 1 ), LDG, G( P + 1, 1 ), LDG, THETA,
+     $                 U2, LDU2, U1, LDU1, QT, LDQT,
+     $                 WORK( Z + 1 ), LWORK - Z,
+     $                 RWORK, LRWORK, IWORK( N + 1 ), INFO )
+      IF( INFO.NE.0 ) THEN
+         RETURN
+      END IF
+*
+*     DEBUG
+*
+      WORK(1:LWORK) = CNAN
+      RWORK(1:LRWORK) = NAN
+*
+*     Copy V^T from QT to G
+*
+      CALL CLACPY( 'A', R, R, QT, LDQT, G, LDG )
+*
+*     DEBUG
+*
+      CALL CLASET( 'A', N, N, CNAN, CNAN, QT, LDQT )
+*
+*     Compute V^T R1( 1:R, : ) in the last R rows of QT
+*
+      IF ( R.LE.M ) THEN
+         CALL CGEMM( 'N', 'N', R, N, R, ONE, G, LDG,
+     $               A, LDA, ZERO, QT( N-R+1, 1 ), LDQT )
+      ELSE
+         CALL CGEMM( 'N', 'N', R, N, M, ONE, G( 1, 1 ), LDG,
+     $               A, LDA, ZERO, QT( N-R+1, 1 ), LDQT )
+         CALL CGEMM( 'N', 'N', R, N - M, R - M, ONE,
+     $               G( 1, M + 1 ), LDG, B, LDB,
+     $               ONE, QT( N-R+1, M+1 ), LDQT )
+      END IF
+*
+*     DEBUG
+*
+      CALL CLASET( 'A', M, N, CNAN, CNAN, A, LDA )
+      CALL CLASET( 'A', P, N, CNAN, CNAN, B, LDB )
+      WORK(1:LWORK) = CNAN
+*
+*     Compute the RQ decomposition of V^T R1( 1:R, : )
+*
+      TAU = WORK(1:L)
+      CALL CGERQF( R, N, QT( N-R+1, 1 ), LDQT, TAU,
+     $             WORK( L + 1 ), LWORK - L, INFO )
+      IF ( INFO.NE.0 ) THEN
+         RETURN
+      END IF
+*
+*     Copy matrix R from QT( N-R+1:N, N-R+1:N ) to A, B
+*
+      IF ( R.LE.M ) THEN
+         CALL CLACPY( 'U', R, R, QT( N-R+1, N-R+1 ), LDQT, A, LDA )
+      ELSE
+         CALL CLACPY( 'U', M,     R, QT( N-R+1, N-R+1 ), LDQT, A, LDA )
+         CALL CLACPY( 'U', R - M, R - M, QT( N-R+M+1, N-R+M+1 ), LDQT,
+     $                B, LDB )
+      END IF
+*
+*     DEBUG
+*
+      CALL CLASET( 'U', R, R, CNAN, CNAN, QT( 1, N-R+1 ), LDQT )
+      WORK( L+1:LWORK ) = CNAN
+*
+*     Explicitly form Q^T
+*
+      IF( WANTQT ) THEN
+         CALL CUNGRQ( N, N, R, QT, LDQT, TAU,
+     $                WORK( L + 1 ), LWORK - L, INFO )
+         IF ( INFO.NE.0 ) THEN
+            RETURN
+         END IF
+*
+*     Revert column permutation Π by permuting the rows of Q^T
+*
+         CALL CLAPMT( .FALSE., N, N, QT, LDQT, IWORK )
+      END IF
+*
+      WORK( 1 ) = CMPLX( REAL(LWKOPT), 0.0E0 )
+      RWORK( 1 ) = REAL(LRWKOPT)
+
+      RETURN
+*
+*     End of CGGQRCS
+*
+      END

From 8e30cd1d38ab0d5be03ec33700e29276f2bdef3a Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Sun, 15 Dec 2019 13:42:26 +0100
Subject: [PATCH 023/101] Fix harmless out-of-bounds accesses for ASAN

Fix harmless out-of-bounds accesses to avoid spurious address sanitizer
(ASAN) failures.
---
 SRC/sggqrcs.f    | 4 +++-
 SRC/sorcsd2by1.f | 6 ++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 2e9ab3fc90..97df193f14 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -525,7 +525,9 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
       IF( R.LE.M ) THEN
           CALL SLACPY( 'U', R, N, G, LDG, A, LDA )
-          CALL SLASET( 'L', R - 1, N, 0.0E0, 0.0E0, A( 2, 1 ), LDA )
+          IF( M.GT.2 ) THEN
+             CALL SLASET( 'L', R - 1, N, 0.0E0, 0.0E0, A( 2, 1 ), LDA )
+          END IF
       ELSE
           CALL SLACPY( 'U', M, N, G, LDG, A, LDA )
           CALL SLACPY( 'U', R - M, N - M, G( M+1, M+1 ), LDG, B, LDB )
diff --git a/SRC/sorcsd2by1.f b/SRC/sorcsd2by1.f
index 25c317f6f6..1521347f96 100644
--- a/SRC/sorcsd2by1.f
+++ b/SRC/sorcsd2by1.f
@@ -517,10 +517,12 @@ SUBROUTINE SORCSD2BY1( JOBU1, JOBU2, JOBV1T, M, P, Q, X11, LDX11,
                V1T(1,J) = ZERO
                V1T(J,1) = ZERO
             END DO
-            CALL SLACPY( 'U', Q-1, Q-1, X21(1,2), LDX21, V1T(2,2),
+            IF( Q.GT.1 ) THEN
+               CALL SLACPY( 'U', Q-1, Q-1, X21(1,2), LDX21, V1T(2,2),
      $                   LDV1T )
-            CALL SORGLQ( Q-1, Q-1, Q-1, V1T(2,2), LDV1T, WORK(ITAUQ1),
+               CALL SORGLQ( Q-1, Q-1, Q-1, V1T(2,2), LDV1T,WORK(ITAUQ1),
      $                   WORK(IORGLQ), LORGLQ, CHILDINFO )
+            ENDIF
          END IF
 *
 *        Simultaneously diagonalize X11 and X21.

From fffae5ff2c07f439dcffa1767d9bae7cca142b05 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Sun, 15 Dec 2019 21:02:34 +0100
Subject: [PATCH 024/101] SGGQRCS: fix branch condition causing NaNs

---
 SRC/sggqrcs.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 97df193f14..10ed0da214 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -525,7 +525,7 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
       IF( R.LE.M ) THEN
           CALL SLACPY( 'U', R, N, G, LDG, A, LDA )
-          IF( M.GT.2 ) THEN
+          IF( M.GT.1 ) THEN
              CALL SLASET( 'L', R - 1, N, 0.0E0, 0.0E0, A( 2, 1 ), LDA )
           END IF
       ELSE

From 4f0080a5050159bf4f82cb50019e02aeb9a405c2 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Fri, 10 Apr 2020 13:34:54 +0200
Subject: [PATCH 025/101] CGGQRCS: fix an EXTERNAL statement

---
 SRC/cggqrcs.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index 18a79aae35..8cd0ef5663 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -360,7 +360,7 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     .. External Functions ..
       LOGICAL            LSAME
       REAL               SLAMCH, CLANGE
-      EXTERNAL           LSAME, SLAMCH, SLANGE
+      EXTERNAL           LSAME, SLAMCH, CLANGE
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLACPY, CLASCL, CGEQP3, CUNGQR, CGERQF, CUNGRQ,

From 706d9520b7e12c8103446c09897c2170c913ee19 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Fri, 10 Apr 2020 20:03:42 +0200
Subject: [PATCH 026/101] CGGQRCS: fix argument type

---
 SRC/cggqrcs.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index 8cd0ef5663..488c581310 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -337,7 +337,7 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       CHARACTER          JOBU1, JOBU2, JOBQT
       INTEGER            INFO, LDA, LDB, LDU1, LDU2, LDQT,
      $                   L, M, N, P, LWORK, LRWORK
-      COMPLEX            W
+      REAL               W
 *     ..
 *     .. Array Arguments ..
       INTEGER            IWORK( * )

From 5110d9213297b335ac7c1a4e67d73d985e3f66e4 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Sat, 11 Apr 2020 21:16:34 +0200
Subject: [PATCH 027/101] SGGQRCS: improve comments

* fix typos
* explain optimal workspace size computation
---
 SRC/sggqrcs.f | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 10ed0da214..9ad19c862c 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -299,7 +299,7 @@
 *> \par Further Details:
 *  =====================
 *>
-*>  SGGQRCS should be significantly faster than DGGSVD and DGGSVD3 for
+*>  SGGQRCS should be significantly faster than SGGSVD and SGGSVD3 for
 *>  large matrices because the matrices A and B are reduced to a pair of
 *>  well-conditioned bidiagonal matrices instead of pairs of upper
 *>  triangular matrices. On the downside, SGGQRCS requires a much larger
@@ -418,9 +418,10 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
      $                    THETA, U2, LDU2, U1, LDU1, QT, LDQT,
      $                    WORK, -1, IWORK, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
+*        The matrix (A, B) must be stored sequentially for SORCSD2BY1
          LWKOPT = Z + LWKOPT
 
-*        DGERQF stores L scalar factors for the elementary reflectors
+*        SGERQF stores L scalar factors for the elementary reflectors
          CALL SGERQF( L, N, QT, LDQT, WORK, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
 
@@ -476,7 +477,7 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       UNFL = SLAMCH( 'Safe Minimum' )
       TOL = MAX( M + P, N ) * MAX( GNORM, UNFL ) * ULP
 *
-*     IWORK stores the column permutations computed by DGEQP3.
+*     IWORK stores the column permutations computed by SGEQP3.
 *     Columns J where IWORK( J ) is non-zero are permuted to the front
 *     so we set the all entries to zero here.
 *

From 9944790ffbcb847aa1cf3ef1dc83c070c2135448 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Sat, 11 Apr 2020 21:17:41 +0200
Subject: [PATCH 028/101] CGGQRCS: multiple fixes

* fix an argument type
* fix name of called function
---
 SRC/cggqrcs.f | 69 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 38 insertions(+), 31 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index 488c581310..2f5192e5e7 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -30,8 +30,9 @@
 *       REAL               W
 *       ..
 *       .. Array Arguments ..
-*       INTEGER            IWORK( * ), RWORK( * )
-*       COMPLEX            A( LDA, * ), B( LDB, * ), THETA( * ),
+*       INTEGER            IWORK( * )
+*       REAL               THETA( * ), RWORK( * )
+*       COMPLEX            A( LDA, * ), B( LDB, * ),
 *      $                   U1( LDU1, * ), U2( LDU2, * ), QT( LDQ, * ),
 *      $                   WORK( * )
 *       ..
@@ -304,7 +305,7 @@
 *
 *> \date October 2019
 *
-*> \ingroup realGEsing
+*> \ingroup complexOTHERcomputational
 *
 *> \par Contributors:
 *  ==================
@@ -341,8 +342,8 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     ..
 *     .. Array Arguments ..
       INTEGER            IWORK( * )
-      REAL               RWORK( * )
-      COMPLEX            A( LDA, * ), B( LDB, * ), THETA( * ),
+      REAL               THETA( * ), RWORK( * )
+      COMPLEX            A( LDA, * ), B( LDB, * ),
      $                   U1( LDU1, * ), U2( LDU2, * ), QT( LDQT, * ),
      $                   WORK( * )
 *     ..
@@ -355,7 +356,7 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       REAL               GNORM, TOL, ULP, UNFL, NORMA, NORMB, BASE, NAN
       COMPLEX            ZERO, ONE, CNAN
 *     .. Local Arrays ..
-      COMPLEX            G( M + P, N ), TAU( MIN( M + P, N ) )
+      COMPLEX            G( M + P, N )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -389,7 +390,6 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       ELSE
          G = RESHAPE( WORK(1:Z), (/ M + P, N /) )
       END IF
-      TAU = WORK( Z + 1 )
       LDG = M + P
       ZERO = (0.0E0, 0.0E0)
       ONE = (1.0E0, 0.0E0)
@@ -429,30 +429,31 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          INFO = -25
       END IF
 *
-*     Compute workspace
+*     Compute optimal workspace size
 *
       IF( INFO.EQ.0 ) THEN
-         CALL CGEQP3( M+P, N, G, LDG, IWORK, TAU,
+*        CGEQP3, CUNGQR read/store L scalar factors
+         CALL CGEQP3( M+P, N, G, LDG, IWORK, WORK,
      $                WORK, -1, RWORK, INFO )
-         LWKOPT = INT( WORK( 1 ) )
+         LWKOPT = INT( WORK( 1 ) ) + L
 
-         CALL CUNGQR( M + P, L, L, G, LDG, TAU, WORK, -1, INFO )
-         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
+         CALL CUNGQR( M + P, L, L, G, LDG, WORK, WORK, -1, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
 
          CALL CUNCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, L,
      $                    G, LDG, G, LDG,
      $                    THETA, U2, LDU2, U1, LDU1, QT, LDQT,
      $                    WORK, -1, RWORK, LRWORK, IWORK, INFO )
-*        Add L to LWKOPT for xGEQP3, xUNGQR because of array TAU
-         LWKOPT = MAX( LWKOPT + L, INT( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
+*        The matrix (A, B) must be stored sequentially for xUNCSD2BY1
          LWKOPT = Z + LWKOPT
          LRWKOPT = MAX( 2*N, INT( RWORK( 1 ) ) )
 
-*        DGERQF stores L scalar factors for the elementary reflectors
-         CALL CGERQF( L, N, QT, LDQT, TAU, WORK, -1, INFO )
+*        CGERQF, CUNGRQ read/store L scalar factors
+         CALL CGERQF( L, N, QT, LDQT, WORK, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
 
-         CALL CUNGRQ( N, N, L, QT, LDQT, TAU, WORK, -1, INFO )
+         CALL CUNGRQ( N, N, L, QT, LDQT, WORK, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
 
          WORK( 1 ) = CMPLX( REAL( LWKOPT ), 0.0E0 )
@@ -467,6 +468,10 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          RETURN
       ENDIF
 *
+*     DEBUG
+*
+      IWORK( 1:M+N+P ) = -1
+*
 *     Scale matrix B such that norm(A) \approx norm(B)
 *
       NORMA = CLANGE( 'F', M, N, A, LDA, RWORK )
@@ -515,8 +520,8 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Compute the QR factorization with column pivoting GΠ = Q1 R1
 *
-      CALL CGEQP3( M + P, N, G, LDG, IWORK, TAU,
-     $             WORK( Z + 1 ), LWORK - Z, RWORK, INFO )
+      CALL CGEQP3( M + P, N, G, LDG, IWORK, WORK( Z + 1 ),
+     $             WORK( Z + L + 1 ), LWORK - Z - L, RWORK, INFO )
       IF( INFO.NE.0 ) THEN
          RETURN
       END IF
@@ -524,24 +529,26 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     Determine the rank of G
 *
       R = 0
-      DO 20 I = 1, MIN( M + P, N )
+      DO 20 I = 1, L
          IF( ABS( G( I, I ) ).LE.TOL ) THEN
             EXIT
          END IF
          R = R + 1
    20 CONTINUE
+*
+      L = R
 *
 *     Handle rank=0 case
 *
       IF( R.EQ.0 ) THEN
          IF( WANTU1 ) THEN
-            CALL SLASET( 'A', M, M, ZERO, ONE, U1, LDU1 )
+            CALL CLASET( 'A', M, M, ZERO, ONE, U1, LDU1 )
          END IF
          IF( WANTU2 ) THEN
-            CALL SLASET( 'A', P, P, ZERO, ONE, U2, LDU2 )
+            CALL CLASET( 'A', P, P, ZERO, ONE, U2, LDU2 )
          END IF
          IF( WANTQT ) THEN
-            CALL SLASET( 'A', N, N, ZERO, ONE, QT, LDQT )
+            CALL CLASET( 'A', N, N, ZERO, ONE, QT, LDQT )
          END IF
 *
          WORK( 1 ) = CMPLX( REAL(LWKOPT), 0.0E0 )
@@ -564,15 +571,16 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Explicitly form Q1 so that we can compute the CS decomposition
 *
-      CALL CUNGQR( M + P, R, R, G, LDG, TAU,
-     $             WORK( Z + 1 ), LWORK - Z, INFO )
+      CALL CUNGQR( M + P, R, R, G, LDG, WORK( Z + 1 ),
+     $             WORK( Z + L + 1 ), LWORK - Z - L, INFO )
       IF ( INFO.NE.0 ) THEN
          RETURN
       END IF
 *
 *     DEBUG
 *
-      TAU(:) = CNAN
+      RWORK( 1:LRWORK ) = NAN
+      WORK( Z+1:LWORK ) = CNAN
 *
 *     Compute the CS decomposition of Q1( :, 1:R )
 *
@@ -587,8 +595,8 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     DEBUG
 *
-      WORK(1:LWORK) = CNAN
-      RWORK(1:LRWORK) = NAN
+      WORK( 1:LWORK ) = CNAN
+      RWORK( 1:LRWORK ) = NAN
 *
 *     Copy V^T from QT to G
 *
@@ -619,8 +627,7 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Compute the RQ decomposition of V^T R1( 1:R, : )
 *
-      TAU = WORK(1:L)
-      CALL CGERQF( R, N, QT( N-R+1, 1 ), LDQT, TAU,
+      CALL CGERQF( R, N, QT( N-R+1, 1 ), LDQT, WORK( 1 ),
      $             WORK( L + 1 ), LWORK - L, INFO )
       IF ( INFO.NE.0 ) THEN
          RETURN
@@ -644,7 +651,7 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     Explicitly form Q^T
 *
       IF( WANTQT ) THEN
-         CALL CUNGRQ( N, N, R, QT, LDQT, TAU,
+         CALL CUNGRQ( N, N, R, QT, LDQT, WORK,
      $                WORK( L + 1 ), LWORK - L, INFO )
          IF ( INFO.NE.0 ) THEN
             RETURN

From 7caf824557774510a42de2f69aea509d9f064acf Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Wed, 15 Apr 2020 18:48:06 +0200
Subject: [PATCH 029/101] xGGQRCS: replace a loop with scalar assignment

---
 SRC/cggqrcs.f | 4 +---
 SRC/dggqrcs.f | 4 +---
 SRC/sggqrcs.f | 4 +---
 3 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index 2f5192e5e7..bbbf5f0b43 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -514,9 +514,7 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     Columns J where IWORK( J ) is non-zero are permuted to the front
 *     so we set the all entries to zero here.
 *
-      DO 10 J = 1, N
-         IWORK( J ) = 0
-   10 CONTINUE
+      IWORK( 1:N ) = 0
 *
 *     Compute the QR factorization with column pivoting GΠ = Q1 R1
 *
diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 59e9875bf5..ac0ec76fbb 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -480,9 +480,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     Columns J where IWORK( J ) is non-zero are permuted to the front
 *     so we set the all entries to zero here.
 *
-      DO 10 J = 1, N
-         IWORK( J ) = 0
-   10 CONTINUE
+      IWORK( 1:N ) = 0
 *
 *     Compute the QR factorization with column pivoting GΠ = Q1 R1
 *
diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 9ad19c862c..e94a9170bd 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -481,9 +481,7 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     Columns J where IWORK( J ) is non-zero are permuted to the front
 *     so we set the all entries to zero here.
 *
-      DO 10 J = 1, N
-         IWORK( J ) = 0
-   10 CONTINUE
+      IWORK( 1:N ) = 0
 *
 *     Compute the QR factorization with column pivoting GΠ = Q1 R1
 *

From fe9ee05a6d768680a59f764d337e5bb5093120af Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Wed, 15 Apr 2020 19:09:40 +0200
Subject: [PATCH 030/101] xGGQRCS: remove integer variable `R`

Remove integer variable `R` indicating the rank because
* integer `L` was already introduced for this purpose
* the purpose of `R` was never documented, and
* it can be confused with the triangular matrices occuring throghout the
  computation.
---
 SRC/cggqrcs.f | 90 +++++++++++++++++++++++++--------------------------
 SRC/dggqrcs.f | 78 ++++++++++++++++++++++----------------------
 SRC/sggqrcs.f | 84 ++++++++++++++++++++++++-----------------------
 3 files changed, 125 insertions(+), 127 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index bbbf5f0b43..462d051da7 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -352,7 +352,7 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTQT, LQUERY
-      INTEGER            I, J, Z, R, LDG, LWKOPT, LRWKOPT
+      INTEGER            I, J, LMAX, Z, LDG, LWKOPT, LRWKOPT
       REAL               GNORM, TOL, ULP, UNFL, NORMA, NORMB, BASE, NAN
       COMPLEX            ZERO, ONE, CNAN
 *     .. Local Arrays ..
@@ -383,7 +383,8 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Initialize variables
 *
-      L = MIN( M + P, N )
+      L = 0
+      LMAX = MIN( M + P, N )
       Z = ( M + P ) * N
       IF ( LQUERY ) THEN
          G = 0
@@ -432,15 +433,15 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     Compute optimal workspace size
 *
       IF( INFO.EQ.0 ) THEN
-*        CGEQP3, CUNGQR read/store L scalar factors
+*        CGEQP3, CUNGQR read/store LMAX scalar factors
          CALL CGEQP3( M+P, N, G, LDG, IWORK, WORK,
      $                WORK, -1, RWORK, INFO )
-         LWKOPT = INT( WORK( 1 ) ) + L
+         LWKOPT = INT( WORK( 1 ) ) + LMAX
 
-         CALL CUNGQR( M + P, L, L, G, LDG, WORK, WORK, -1, INFO )
-         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
+         CALL CUNGQR( M + P, LMAX, LMAX, G, LDG, WORK, WORK, -1, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
 
-         CALL CUNCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, L,
+         CALL CUNCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, LMAX,
      $                    G, LDG, G, LDG,
      $                    THETA, U2, LDU2, U1, LDU1, QT, LDQT,
      $                    WORK, -1, RWORK, LRWORK, IWORK, INFO )
@@ -449,12 +450,12 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          LWKOPT = Z + LWKOPT
          LRWKOPT = MAX( 2*N, INT( RWORK( 1 ) ) )
 
-*        CGERQF, CUNGRQ read/store L scalar factors
-         CALL CGERQF( L, N, QT, LDQT, WORK, WORK, -1, INFO )
-         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
+*        CGERQF, CUNGRQ read/store up to LMAX scalar factors
+         CALL CGERQF( LMAX, N, QT, LDQT, WORK, WORK, -1, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
 
-         CALL CUNGRQ( N, N, L, QT, LDQT, WORK, WORK, -1, INFO )
-         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
+         CALL CUNGRQ( N, N, LMAX, QT, LDQT, WORK, WORK, -1, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
 
          WORK( 1 ) = CMPLX( REAL( LWKOPT ), 0.0E0 )
          RWORK( 1 ) = REAL( LRWKOPT )
@@ -519,26 +520,23 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     Compute the QR factorization with column pivoting GΠ = Q1 R1
 *
       CALL CGEQP3( M + P, N, G, LDG, IWORK, WORK( Z + 1 ),
-     $             WORK( Z + L + 1 ), LWORK - Z - L, RWORK, INFO )
+     $             WORK( Z + LMAX + 1 ), LWORK - Z - LMAX, RWORK, INFO )
       IF( INFO.NE.0 ) THEN
          RETURN
       END IF
 *
 *     Determine the rank of G
 *
-      R = 0
-      DO 20 I = 1, L
+      DO 20 I = 1, LMAX
          IF( ABS( G( I, I ) ).LE.TOL ) THEN
             EXIT
          END IF
-         R = R + 1
+         L = L + 1
    20 CONTINUE
-*
-      L = R
 *
 *     Handle rank=0 case
 *
-      IF( R.EQ.0 ) THEN
+      IF( L.EQ.0 ) THEN
          IF( WANTU1 ) THEN
             CALL CLASET( 'A', M, M, ZERO, ONE, U1, LDU1 )
          END IF
@@ -554,22 +552,22 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          RETURN
       END IF
 *
-*     Copy R1( 1:R, : ) into A, B and set lower triangular part to zero
+*     Copy R1( 1:L, : ) into A, B and set lower triangular part to zero
 *
-      IF( R.LE.M ) THEN
-          CALL CLACPY( 'U', R, N, G, LDG, A, LDA )
-          CALL CLASET( 'L', R - 1, N, ZERO, ZERO, A( 2, 1 ), LDA )
+      IF( L.LE.M ) THEN
+          CALL CLACPY( 'U', L, N, G, LDG, A, LDA )
+          CALL CLASET( 'L', L - 1, N, ZERO, ZERO, A( 2, 1 ), LDA )
       ELSE
           CALL CLACPY( 'U', M, N, G, LDG, A, LDA )
-          CALL CLACPY( 'U', R - M, N - M, G( M+1, M+1 ), LDG, B, LDB )
+          CALL CLACPY( 'U', L - M, N - M, G( M+1, M+1 ), LDG, B, LDB )
 *
           CALL CLASET( 'L', M - 1, N, ZERO, ZERO, A( 2, 1 ), LDA )
-          CALL CLASET( 'L', R-M-1, N, ZERO, ZERO, B( 2, 1 ), LDB )
+          CALL CLASET( 'L', L-M-1, N, ZERO, ZERO, B( 2, 1 ), LDB )
       END IF
 *
 *     Explicitly form Q1 so that we can compute the CS decomposition
 *
-      CALL CUNGQR( M + P, R, R, G, LDG, WORK( Z + 1 ),
+      CALL CUNGQR( M + P, L, L, G, LDG, WORK( Z + 1 ),
      $             WORK( Z + L + 1 ), LWORK - Z - L, INFO )
       IF ( INFO.NE.0 ) THEN
          RETURN
@@ -580,9 +578,9 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       RWORK( 1:LRWORK ) = NAN
       WORK( Z+1:LWORK ) = CNAN
 *
-*     Compute the CS decomposition of Q1( :, 1:R )
+*     Compute the CS decomposition of Q1( :, 1:L )
 *
-      CALL CUNCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, R,
+      CALL CUNCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, L,
      $                 G( 1, 1 ), LDG, G( P + 1, 1 ), LDG, THETA,
      $                 U2, LDU2, U1, LDU1, QT, LDQT,
      $                 WORK( Z + 1 ), LWORK - Z,
@@ -598,23 +596,23 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Copy V^T from QT to G
 *
-      CALL CLACPY( 'A', R, R, QT, LDQT, G, LDG )
+      CALL CLACPY( 'A', L, L, QT, LDQT, G, LDG )
 *
 *     DEBUG
 *
       CALL CLASET( 'A', N, N, CNAN, CNAN, QT, LDQT )
 *
-*     Compute V^T R1( 1:R, : ) in the last R rows of QT
+*     Compute V^T R1( 1:L, : ) in the last L rows of QT
 *
-      IF ( R.LE.M ) THEN
-         CALL CGEMM( 'N', 'N', R, N, R, ONE, G, LDG,
-     $               A, LDA, ZERO, QT( N-R+1, 1 ), LDQT )
+      IF ( L.LE.M ) THEN
+         CALL CGEMM( 'N', 'N', L, N, L, ONE, G, LDG,
+     $               A, LDA, ZERO, QT( N-L+1, 1 ), LDQT )
       ELSE
-         CALL CGEMM( 'N', 'N', R, N, M, ONE, G( 1, 1 ), LDG,
-     $               A, LDA, ZERO, QT( N-R+1, 1 ), LDQT )
-         CALL CGEMM( 'N', 'N', R, N - M, R - M, ONE,
+         CALL CGEMM( 'N', 'N', L, N, M, ONE, G( 1, 1 ), LDG,
+     $               A, LDA, ZERO, QT( N-L+1, 1 ), LDQT )
+         CALL CGEMM( 'N', 'N', L, N - M, L - M, ONE,
      $               G( 1, M + 1 ), LDG, B, LDB,
-     $               ONE, QT( N-R+1, M+1 ), LDQT )
+     $               ONE, QT( N-L+1, M+1 ), LDQT )
       END IF
 *
 *     DEBUG
@@ -623,33 +621,33 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       CALL CLASET( 'A', P, N, CNAN, CNAN, B, LDB )
       WORK(1:LWORK) = CNAN
 *
-*     Compute the RQ decomposition of V^T R1( 1:R, : )
+*     Compute the RQ decomposition of V^T R1( 1:L, : )
 *
-      CALL CGERQF( R, N, QT( N-R+1, 1 ), LDQT, WORK( 1 ),
+      CALL CGERQF( L, N, QT( N-L+1, 1 ), LDQT, WORK( 1 ),
      $             WORK( L + 1 ), LWORK - L, INFO )
       IF ( INFO.NE.0 ) THEN
          RETURN
       END IF
 *
-*     Copy matrix R from QT( N-R+1:N, N-R+1:N ) to A, B
+*     Copy matrix L from QT( N-L+1:N, N-L+1:N ) to A, B
 *
-      IF ( R.LE.M ) THEN
-         CALL CLACPY( 'U', R, R, QT( N-R+1, N-R+1 ), LDQT, A, LDA )
+      IF ( L.LE.M ) THEN
+         CALL CLACPY( 'U', L, L, QT( N-L+1, N-L+1 ), LDQT, A, LDA )
       ELSE
-         CALL CLACPY( 'U', M,     R, QT( N-R+1, N-R+1 ), LDQT, A, LDA )
-         CALL CLACPY( 'U', R - M, R - M, QT( N-R+M+1, N-R+M+1 ), LDQT,
+         CALL CLACPY( 'U', M,     L, QT( N-L+1, N-L+1 ), LDQT, A, LDA )
+         CALL CLACPY( 'U', L - M, L - M, QT( N-L+M+1, N-L+M+1 ), LDQT,
      $                B, LDB )
       END IF
 *
 *     DEBUG
 *
-      CALL CLASET( 'U', R, R, CNAN, CNAN, QT( 1, N-R+1 ), LDQT )
+      CALL CLASET( 'U', L, L, CNAN, CNAN, QT( 1, N-L+1 ), LDQT )
       WORK( L+1:LWORK ) = CNAN
 *
 *     Explicitly form Q^T
 *
       IF( WANTQT ) THEN
-         CALL CUNGRQ( N, N, R, QT, LDQT, WORK,
+         CALL CUNGRQ( N, N, L, QT, LDQT, WORK,
      $                WORK( L + 1 ), LWORK - L, INFO )
          IF ( INFO.NE.0 ) THEN
             RETURN
diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index ac0ec76fbb..627d32d51a 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -334,7 +334,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTQT, LQUERY
-      INTEGER            I, J, Z, R, LDG, LWKOPT
+      INTEGER            I, J, LMAX, Z, LDG, LWKOPT
       DOUBLE PRECISION   GNORM, TOL, ULP, UNFL, NORMA, NORMB, BASE, NAN
 *     .. Local Arrays ..
       DOUBLE PRECISION   G( M + P, N )
@@ -363,7 +363,8 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Initialize variables
 *
-      L = MIN( M + P, N )
+      L = 0
+      LMAX = MIN( M + P, N )
       Z = ( M + P ) * N
       IF ( LQUERY ) THEN
          G = 0
@@ -410,22 +411,22 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          CALL DGEQP3( M+P, N, G, LDG, IWORK, THETA, WORK, -1, INFO )
          LWKOPT = INT( WORK( 1 ) )
 
-         CALL DORGQR( M + P, L, L, G, LDG, THETA, WORK, -1, INFO )
+         CALL DORGQR( M + P, LMAX, LMAX, G, LDG, THETA, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 
-         CALL DORCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, L,
+         CALL DORCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, LMAX,
      $                    G, LDG, G, LDG,
      $                    THETA, U2, LDU2, U1, LDU1, QT, LDQT,
      $                    WORK, -1, IWORK, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
          LWKOPT = Z + LWKOPT
 
-*        DGERQF stores L scalar factors for the elementary reflectors
-         CALL DGERQF( L, N, QT, LDQT, WORK, WORK, -1, INFO )
-         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
+*        DGERQF stores LMAX scalar factors for the elementary reflectors
+         CALL DGERQF( LMAX, N, QT, LDQT, WORK, WORK, -1, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
 
-         CALL DORGRQ( N, N, L, QT, LDQT, WORK, WORK, -1, INFO )
-         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
+         CALL DORGRQ( N, N, LMAX, QT, LDQT, WORK, WORK, -1, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
 
          WORK( 1 ) = DBLE( LWKOPT )
       END IF
@@ -492,15 +493,12 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Determine the rank of G
 *
-      R = 0
       DO 20 I = 1, MIN( M + P, N )
          IF( ABS( G( I, I ) ).LE.TOL ) THEN
             EXIT
          END IF
-         R = R + 1
+         L = L + 1
    20 CONTINUE
-*
-      L = R
 *
 *     Handle rank=0 case
 *
@@ -519,22 +517,22 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          RETURN
       END IF
 *
-*     Copy R1( 1:R, : ) into A, B and set lower triangular part to zero
+*     Copy R1( 1:L, : ) into A, B and set lower triangular part to zero
 *
-      IF( R.LE.M ) THEN
-          CALL DLACPY( 'U', R, N, G, LDG, A, LDA )
-          CALL DLASET( 'L', R - 1, N, 0.0D0, 0.0D0, A( 2, 1 ), LDA )
+      IF( L.LE.M ) THEN
+          CALL DLACPY( 'U', L, N, G, LDG, A, LDA )
+          CALL DLASET( 'L', L - 1, N, 0.0D0, 0.0D0, A( 2, 1 ), LDA )
       ELSE
           CALL DLACPY( 'U', M, N, G, LDG, A, LDA )
-          CALL DLACPY( 'U', R - M, N - M, G( M+1, M+1 ), LDG, B, LDB )
+          CALL DLACPY( 'U', L - M, N - M, G( M+1, M+1 ), LDG, B, LDB )
 *
           CALL DLASET( 'L', M - 1, N, 0.0D0, 0.0D0, A( 2, 1 ), LDA )
-          CALL DLASET( 'L', R-M-1, N, 0.0D0, 0.0D0, B( 2, 1 ), LDB )
+          CALL DLASET( 'L', L-M-1, N, 0.0D0, 0.0D0, B( 2, 1 ), LDB )
       END IF
 *
 *     Explicitly form Q1 so that we can compute the CS decomposition
 *
-      CALL DORGQR( M + P, R, R, G, LDG, THETA,
+      CALL DORGQR( M + P, L, L, G, LDG, THETA,
      $             WORK( Z + 1 ), LWORK - Z, INFO )
       IF ( INFO.NE.0 ) THEN
          RETURN
@@ -544,9 +542,9 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
       THETA(1:L) = NAN
 *
-*     Compute the CS decomposition of Q1( :, 1:R )
+*     Compute the CS decomposition of Q1( :, 1:L )
 *
-      CALL DORCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, R,
+      CALL DORCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, L,
      $                 G( 1, 1 ), LDG, G( P + 1, 1 ), LDG, THETA,
      $                 U2, LDU2, U1, LDU1, QT, LDQT,
      $                 WORK( Z + 1 ), LWORK - Z, IWORK( N + 1 ), INFO )
@@ -560,23 +558,23 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Copy V^T from QT to G
 *
-      CALL DLACPY( 'A', R, R, QT, LDQT, G, LDG )
+      CALL DLACPY( 'A', L, L, QT, LDQT, G, LDG )
 *
 *     DEBUG
 *
       CALL DLASET( 'A', N, N, NAN, NAN, QT, LDQT )
 *
-*     Compute V^T R1( 1:R, : ) in the last R rows of QT
+*     Compute V^T R1( 1:L, : ) in the last L rows of QT
 *
-      IF ( R.LE.M ) THEN
-         CALL DGEMM( 'N', 'N', R, N, R, 1.0D0, G, LDG,
-     $               A, LDA, 0.0D0, QT( N-R+1, 1 ), LDQT )
+      IF ( L.LE.M ) THEN
+         CALL DGEMM( 'N', 'N', L, N, L, 1.0D0, G, LDG,
+     $               A, LDA, 0.0D0, QT( N-L+1, 1 ), LDQT )
       ELSE
-         CALL DGEMM( 'N', 'N', R, N, M, 1.0D0, G( 1, 1 ), LDG,
-     $               A, LDA, 0.0D0, QT( N-R+1, 1 ), LDQT )
-         CALL DGEMM( 'N', 'N', R, N - M, R - M, 1.0D0,
+         CALL DGEMM( 'N', 'N', L, N, M, 1.0D0, G( 1, 1 ), LDG,
+     $               A, LDA, 0.0D0, QT( N-L+1, 1 ), LDQT )
+         CALL DGEMM( 'N', 'N', L, N - M, L - M, 1.0D0,
      $               G( 1, M + 1 ), LDG, B, LDB,
-     $               1.0D0, QT( N-R+1, M+1 ), LDQT )
+     $               1.0D0, QT( N-L+1, M+1 ), LDQT )
       END IF
 *
 *     DEBUG
@@ -585,33 +583,33 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       CALL DLASET( 'A', P, N, NAN, NAN, B, LDB )
       WORK(1:LWORK) = NAN
 *
-*     Compute the RQ decomposition of V^T R1( 1:R, : )
+*     Compute the RQ decomposition of V^T R1( 1:L, : )
 *
-      CALL DGERQF( R, N, QT( N-R+1, 1 ), LDQT, WORK,
+      CALL DGERQF( L, N, QT( N-L+1, 1 ), LDQT, WORK,
      $             WORK( L + 1 ), LWORK - L, INFO )
       IF ( INFO.NE.0 ) THEN
          RETURN
       END IF
 *
-*     Copy matrix R from QT( N-R+1:N, N-R+1:N ) to A, B
+*     Copy matrix L from QT( N-L+1:N, N-L+1:N ) to A, B
 *
-      IF ( R.LE.M ) THEN
-         CALL DLACPY( 'U', R, R, QT( N-R+1, N-R+1 ), LDQT, A, LDA )
+      IF ( L.LE.M ) THEN
+         CALL DLACPY( 'U', L, L, QT( N-L+1, N-L+1 ), LDQT, A, LDA )
       ELSE
-         CALL DLACPY( 'U', M,     R, QT( N-R+1, N-R+1 ), LDQT, A, LDA )
-         CALL DLACPY( 'U', R - M, R - M, QT( N-R+M+1, N-R+M+1 ), LDQT,
+         CALL DLACPY( 'U', M,     L, QT( N-L+1, N-L+1 ), LDQT, A, LDA )
+         CALL DLACPY( 'U', L - M, L - M, QT( N-L+M+1, N-L+M+1 ), LDQT,
      $                B, LDB )
       END IF
 *
 *     DEBUG
 *
-      CALL DLASET( 'U', R, R, NAN, NAN, QT( 1, N-R+1 ), LDQT )
+      CALL DLASET( 'U', L, L, NAN, NAN, QT( 1, N-L+1 ), LDQT )
       WORK( L+1:LWORK ) = NAN
 *
 *     Explicitly form Q^T
 *
       IF( WANTQT ) THEN
-         CALL DORGRQ( N, N, R, QT, LDQT, WORK,
+         CALL DORGRQ( N, N, L, QT, LDQT, WORK,
      $                WORK( L + 1 ), LWORK - L, INFO )
          IF ( INFO.NE.0 ) THEN
             RETURN
diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index e94a9170bd..c7a3ac42bc 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -334,7 +334,7 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTQT, LQUERY
-      INTEGER            I, J, Z, R, LDG, LWKOPT
+      INTEGER            I, J, LMAX, Z, LDG, LWKOPT
       REAL               GNORM, TOL, ULP, UNFL, NORMA, NORMB, BASE, NAN
 *     .. Local Arrays ..
       REAL               G( M + P, N )
@@ -363,7 +363,8 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Initialize variables
 *
-      L = MIN( M + P, N )
+      L = 0
+      LMAX = MIN( M + P, N )
       Z = ( M + P ) * N
       IF ( LQUERY ) THEN
          G = 0
@@ -410,10 +411,10 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          CALL SGEQP3( M+P, N, G, LDG, IWORK, THETA, WORK, -1, INFO )
          LWKOPT = INT( WORK( 1 ) )
 
-         CALL SORGQR( M + P, L, L, G, LDG, THETA, WORK, -1, INFO )
+         CALL SORGQR( M + P, LMAX, LMAX, G, LDG, THETA, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 
-         CALL SORCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, L,
+         CALL SORCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, LMAX,
      $                    G, LDG, G, LDG,
      $                    THETA, U2, LDU2, U1, LDU1, QT, LDQT,
      $                    WORK, -1, IWORK, INFO )
@@ -421,12 +422,12 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *        The matrix (A, B) must be stored sequentially for SORCSD2BY1
          LWKOPT = Z + LWKOPT
 
-*        SGERQF stores L scalar factors for the elementary reflectors
-         CALL SGERQF( L, N, QT, LDQT, WORK, WORK, -1, INFO )
-         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
+*        SGERQF stores LMAX scalar factors for the elementary reflectors
+         CALL SGERQF( LMAX, N, QT, LDQT, WORK, WORK, -1, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
 
-         CALL SORGRQ( N, N, L, QT, LDQT, WORK, WORK, -1, INFO )
-         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + L )
+         CALL SORGRQ( N, N, LMAX, QT, LDQT, WORK, WORK, -1, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
 
          WORK( 1 ) = REAL( LWKOPT )
       END IF
@@ -493,15 +494,12 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Determine the rank of G
 *
-      R = 0
       DO 20 I = 1, MIN( M + P, N )
          IF( ABS( G( I, I ) ).LE.TOL ) THEN
             EXIT
          END IF
-         R = R + 1
+         L = L + 1
    20 CONTINUE
-*
-      L = R
 *
 *     Handle rank=0 case
 *
@@ -520,24 +518,28 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          RETURN
       END IF
 *
-*     Copy R1( 1:R, : ) into A, B and set lower triangular part to zero
+*     Copy R1( 1:L, : ) into A, B and set lower triangular part to zero
 *
-      IF( R.LE.M ) THEN
-          CALL SLACPY( 'U', R, N, G, LDG, A, LDA )
+      IF( L.LE.M ) THEN
+          CALL SLACPY( 'U', L, N, G, LDG, A, LDA )
           IF( M.GT.1 ) THEN
-             CALL SLASET( 'L', R - 1, N, 0.0E0, 0.0E0, A( 2, 1 ), LDA )
+             CALL SLASET( 'L', L - 1, N, 0.0E0, 0.0E0, A( 2, 1 ), LDA )
           END IF
       ELSE
           CALL SLACPY( 'U', M, N, G, LDG, A, LDA )
-          CALL SLACPY( 'U', R - M, N - M, G( M+1, M+1 ), LDG, B, LDB )
+          CALL SLACPY( 'U', L - M, N - M, G( M+1, M+1 ), LDG, B, LDB )
 *
-          CALL SLASET( 'L', M - 1, N, 0.0E0, 0.0E0, A( 2, 1 ), LDA )
-          CALL SLASET( 'L', R-M-1, N, 0.0E0, 0.0E0, B( 2, 1 ), LDB )
+          IF( M.GT.1 ) THEN
+              CALL SLASET( 'L', M - 1, N, 0.0E0, 0.0E0, A( 2, 1 ), LDA )
+          END IF
+          IF( P.GT.1 ) THEN
+              CALL SLASET( 'L', L-M-1, N, 0.0E0, 0.0E0, B( 2, 1 ), LDB )
+          END IF
       END IF
 *
 *     Explicitly form Q1 so that we can compute the CS decomposition
 *
-      CALL SORGQR( M + P, R, R, G, LDG, THETA,
+      CALL SORGQR( M + P, L, L, G, LDG, THETA,
      $             WORK( Z + 1 ), LWORK - Z, INFO )
       IF ( INFO.NE.0 ) THEN
          RETURN
@@ -547,9 +549,9 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
       THETA(1:N) = NAN
 *
-*     Compute the CS decomposition of Q1( :, 1:R )
+*     Compute the CS decomposition of Q1( :, 1:L )
 *
-      CALL SORCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, R,
+      CALL SORCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, L,
      $                 G( 1, 1 ), LDG, G( P + 1, 1 ), LDG, THETA,
      $                 U2, LDU2, U1, LDU1, QT, LDQT,
      $                 WORK( Z + 1 ), LWORK - Z, IWORK( N + 1 ), INFO )
@@ -563,23 +565,23 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Copy V^T from QT to G
 *
-      CALL SLACPY( 'A', R, R, QT, LDQT, G, LDG )
+      CALL SLACPY( 'A', L, L, QT, LDQT, G, LDG )
 *
 *     DEBUG
 *
       CALL SLASET( 'A', N, N, NAN, NAN, QT, LDQT )
 *
-*     Compute V^T R1( 1:R, : ) in the last R rows of QT
+*     Compute V^T R1( 1:L, : ) in the last L rows of QT
 *
-      IF ( R.LE.M ) THEN
-         CALL SGEMM( 'N', 'N', R, N, R, 1.0E0, G, LDG,
-     $               A, LDA, 0.0E0, QT( N-R+1, 1 ), LDQT )
+      IF ( L.LE.M ) THEN
+         CALL SGEMM( 'N', 'N', L, N, L, 1.0E0, G, LDG,
+     $               A, LDA, 0.0E0, QT( N-L+1, 1 ), LDQT )
       ELSE
-         CALL SGEMM( 'N', 'N', R, N, M, 1.0E0, G( 1, 1 ), LDG,
-     $               A, LDA, 0.0E0, QT( N-R+1, 1 ), LDQT )
-         CALL SGEMM( 'N', 'N', R, N - M, R - M, 1.0E0,
+         CALL SGEMM( 'N', 'N', L, N, M, 1.0E0, G( 1, 1 ), LDG,
+     $               A, LDA, 0.0E0, QT( N-L+1, 1 ), LDQT )
+         CALL SGEMM( 'N', 'N', L, N - M, L - M, 1.0E0,
      $               G( 1, M + 1 ), LDG, B, LDB,
-     $               1.0E0, QT( N-R+1, M+1 ), LDQT )
+     $               1.0E0, QT( N-L+1, M+1 ), LDQT )
       END IF
 *
 *     DEBUG
@@ -588,33 +590,33 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       CALL SLASET( 'A', P, N, NAN, NAN, B, LDB )
       WORK(1:LWORK) = NAN
 *
-*     Compute the RQ decomposition of V^T R1( 1:R, : )
+*     Compute the RQ decomposition of V^T R1( 1:L, : )
 *
-      CALL SGERQF( R, N, QT( N-R+1, 1 ), LDQT, WORK,
+      CALL SGERQF( L, N, QT( N-L+1, 1 ), LDQT, WORK,
      $             WORK( L + 1 ), LWORK - L, INFO )
       IF ( INFO.NE.0 ) THEN
          RETURN
       END IF
 *
-*     Copy matrix R from QT( N-R+1:N, N-R+1:N ) to A, B
+*     Copy matrix L from QT( N-L+1:N, N-L+1:N ) to A, B
 *
-      IF ( R.LE.M ) THEN
-         CALL SLACPY( 'U', R, R, QT( N-R+1, N-R+1 ), LDQT, A, LDA )
+      IF ( L.LE.M ) THEN
+         CALL SLACPY( 'U', L, L, QT( N-L+1, N-L+1 ), LDQT, A, LDA )
       ELSE
-         CALL SLACPY( 'U', M,     R, QT( N-R+1, N-R+1 ), LDQT, A, LDA )
-         CALL SLACPY( 'U', R - M, R - M, QT( N-R+M+1, N-R+M+1 ), LDQT,
+         CALL SLACPY( 'U', M,     L, QT( N-L+1, N-L+1 ), LDQT, A, LDA )
+         CALL SLACPY( 'U', L - M, L - M, QT( N-L+M+1, N-L+M+1 ), LDQT,
      $                B, LDB )
       END IF
 *
 *     DEBUG
 *
-      CALL SLASET( 'U', R, R, NAN, NAN, QT( 1, N-R+1 ), LDQT )
+      CALL SLASET( 'U', L, L, NAN, NAN, QT( 1, N-L+1 ), LDQT )
       WORK( L+1:LWORK ) = NAN
 *
 *     Explicitly form Q^T
 *
       IF( WANTQT ) THEN
-         CALL SORGRQ( N, N, R, QT, LDQT, WORK,
+         CALL SORGRQ( N, N, L, QT, LDQT, WORK,
      $                WORK( L + 1 ), LWORK - L, INFO )
          IF ( INFO.NE.0 ) THEN
             RETURN

From c9e51670c67a90ecc7a3b2bda8cca4007813d8a5 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Wed, 15 Apr 2020 20:15:56 +0200
Subject: [PATCH 031/101] CGGQRCS: fix LRWORK computation

For CUNCSD2BY1, memory consumption is not at maximum with maximum matrix
dimensions.
---
 SRC/cggqrcs.f | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index 462d051da7..95277d92af 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -352,7 +352,8 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTQT, LQUERY
-      INTEGER            I, J, LMAX, Z, LDG, LWKOPT, LRWKOPT
+      INTEGER            I, J, LMAX, Z, LDG, LWKOPT, LRWKOPT,
+     $                   LRWORK2BY1, K2BY1
       REAL               GNORM, TOL, ULP, UNFL, NORMA, NORMB, BASE, NAN
       COMPLEX            ZERO, ONE, CNAN
 *     .. Local Arrays ..
@@ -448,7 +449,11 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *        The matrix (A, B) must be stored sequentially for xUNCSD2BY1
          LWKOPT = Z + LWKOPT
-         LRWKOPT = MAX( 2*N, INT( RWORK( 1 ) ) )
+*        Adjust CUNCSD2BY1 LRWORK for case with maximum memory
+*        consumption
+         K2BY1 = MIN( m, p, n, MAX( 0, M+P-N) )
+         LRWORK2BY1 = INT( RWORK(1) ) - 16 * K2BY1 + 16 * MIN( M,P,N )
+         LRWKOPT = MAX( 2*N, LRWORK2BY1 )
 
 *        CGERQF, CUNGRQ read/store up to LMAX scalar factors
          CALL CGERQF( LMAX, N, QT, LDQT, WORK, WORK, -1, INFO )

From 2a55ab93c23ec073649ca34e4370c1d20c142729 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Sun, 19 Apr 2020 15:20:43 +0200
Subject: [PATCH 032/101] DGGQRCS: set all THETA entries to NaN

---
 SRC/dggqrcs.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 627d32d51a..0f0db91a8c 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -540,7 +540,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     DEBUG
 *
-      THETA(1:L) = NAN
+      THETA(1:N) = NAN
 *
 *     Compute the CS decomposition of Q1( :, 1:L )
 *

From 3bb2c514cd992c5d759452a3448fd52d46004106 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Sun, 19 Apr 2020 15:53:38 +0200
Subject: [PATCH 033/101] xGGQRCS: disallow zero dimensions

---
 SRC/cggqrcs.f | 14 +++++++-------
 SRC/dggqrcs.f | 14 +++++++-------
 SRC/sggqrcs.f | 14 +++++++-------
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index 95277d92af..4a4a4e7e46 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -129,19 +129,19 @@
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
-*>          The number of rows of the matrix A.  M >= 0.
+*>          The number of rows of the matrix A.  M >= 1.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
-*>          The number of columns of the matrices A and B.  N >= 0.
+*>          The number of columns of the matrices A and B.  N >= 1.
 *> \endverbatim
 *>
 *> \param[in] P
 *> \verbatim
 *>          P is INTEGER
-*>          The number of rows of the matrix B.  P >= 0.
+*>          The number of rows of the matrix B.  P >= 1.
 *> \endverbatim
 *>
 *> \param[out] W
@@ -390,7 +390,7 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       IF ( LQUERY ) THEN
          G = 0
       ELSE
-         G = RESHAPE( WORK(1:Z), (/ M + P, N /) )
+         G = WORK( 1 )
       END IF
       LDG = M + P
       ZERO = (0.0E0, 0.0E0)
@@ -409,11 +409,11 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          INFO = -2
       ELSE IF( .NOT.( WANTQT .OR. LSAME( JOBQT, 'N' ) ) ) THEN
          INFO = -3
-      ELSE IF( M.LT.0 ) THEN
+      ELSE IF( M.LT.1 ) THEN
          INFO = -4
-      ELSE IF( N.LT.0 ) THEN
+      ELSE IF( N.LT.1 ) THEN
          INFO = -5
-      ELSE IF( P.LT.0 ) THEN
+      ELSE IF( P.LT.1 ) THEN
          INFO = -6
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -10
diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 0f0db91a8c..d7bd4e4cfd 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -128,19 +128,19 @@
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
-*>          The number of rows of the matrix A.  M >= 0.
+*>          The number of rows of the matrix A.  M >= 1.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
-*>          The number of columns of the matrices A and B.  N >= 0.
+*>          The number of columns of the matrices A and B.  N >= 1.
 *> \endverbatim
 *>
 *> \param[in] P
 *> \verbatim
 *>          P is INTEGER
-*>          The number of rows of the matrix B.  P >= 0.
+*>          The number of rows of the matrix B.  P >= 1.
 *> \endverbatim
 *>
 *> \param[out] W
@@ -369,7 +369,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       IF ( LQUERY ) THEN
          G = 0
       ELSE
-         G = RESHAPE( WORK(1:Z), (/ M + P, N /) )
+         G = WORK( 1 )
       END IF
       LDG = M + P
 *     Computing 0.0 / 0.0 directly causes compiler errors
@@ -385,11 +385,11 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          INFO = -2
       ELSE IF( .NOT.( WANTQT .OR. LSAME( JOBQT, 'N' ) ) ) THEN
          INFO = -3
-      ELSE IF( M.LT.0 ) THEN
+      ELSE IF( M.LT.1 ) THEN
          INFO = -4
-      ELSE IF( N.LT.0 ) THEN
+      ELSE IF( N.LT.1 ) THEN
          INFO = -5
-      ELSE IF( P.LT.0 ) THEN
+      ELSE IF( P.LT.1 ) THEN
          INFO = -6
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -10
diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index c7a3ac42bc..fc8e0cd66d 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -128,19 +128,19 @@
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
-*>          The number of rows of the matrix A.  M >= 0.
+*>          The number of rows of the matrix A.  M >= 1.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
-*>          The number of columns of the matrices A and B.  N >= 0.
+*>          The number of columns of the matrices A and B.  N >= 1.
 *> \endverbatim
 *>
 *> \param[in] P
 *> \verbatim
 *>          P is INTEGER
-*>          The number of rows of the matrix B.  P >= 0.
+*>          The number of rows of the matrix B.  P >= 1.
 *> \endverbatim
 *>
 *> \param[out] W
@@ -369,7 +369,7 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       IF ( LQUERY ) THEN
          G = 0
       ELSE
-         G = RESHAPE( WORK(1:Z), (/ M + P, N /) )
+         G = WORK( 1 )
       END IF
       LDG = M + P
 *     Computing 0.0 / 0.0 directly causes compiler errors
@@ -385,11 +385,11 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          INFO = -2
       ELSE IF( .NOT.( WANTQT .OR. LSAME( JOBQT, 'N' ) ) ) THEN
          INFO = -3
-      ELSE IF( M.LT.0 ) THEN
+      ELSE IF( M.LT.1 ) THEN
          INFO = -4
-      ELSE IF( N.LT.0 ) THEN
+      ELSE IF( N.LT.1 ) THEN
          INFO = -5
-      ELSE IF( P.LT.0 ) THEN
+      ELSE IF( P.LT.1 ) THEN
          INFO = -6
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -10

From 087875f5363c0f4e7bea253ff0b0c686f5e5ce69 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Sun, 19 Apr 2020 19:24:33 +0200
Subject: [PATCH 034/101] CGGQRCS: fix LRWORK computation

---
 SRC/cggqrcs.f | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index 4a4a4e7e46..dcd8668cad 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -353,7 +353,7 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTQT, LQUERY
       INTEGER            I, J, LMAX, Z, LDG, LWKOPT, LRWKOPT,
-     $                   LRWORK2BY1, K2BY1
+     $                   LRWORK2BY1
       REAL               GNORM, TOL, ULP, UNFL, NORMA, NORMB, BASE, NAN
       COMPLEX            ZERO, ONE, CNAN
 *     .. Local Arrays ..
@@ -451,8 +451,13 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          LWKOPT = Z + LWKOPT
 *        Adjust CUNCSD2BY1 LRWORK for case with maximum memory
 *        consumption
-         K2BY1 = MIN( m, p, n, MAX( 0, M+P-N) )
-         LRWORK2BY1 = INT( RWORK(1) ) - 16 * K2BY1 + 16 * MIN( M,P,N )
+         LRWORK2BY1 = INT( RWORK(1) )
+*        Select safe xUNCSD2BY1 IBBCSD value
+     $                - 9 * MAX( 0, MIN( M, P, N, M+P-N-1 ) )
+     $                + 9 * MAX( 1, MIN( M, P, N ) )
+*        Select safe xUNCSD2BY1 LBBCSD value
+     $                - 8 * MAX( 0, MIN( M, P, N, M+P-N ) )
+     $                + 8 * MIN( M, P, N )
          LRWKOPT = MAX( 2*N, LRWORK2BY1 )
 
 *        CGERQF, CUNGRQ read/store up to LMAX scalar factors

From 5f307f62c68ab46f20e1e0dcd46163ef6b21c3cb Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Mon, 20 Apr 2020 00:45:44 +0200
Subject: [PATCH 035/101] CGGQRCS: fix documentation

---
 SRC/cggqrcs.f | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index dcd8668cad..866d21eeb2 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -44,7 +44,7 @@
 *> \verbatim
 *>
 *> CGGQRCS computes the generalized singular value decomposition (GSVD)
-*> of an M-by-N real matrix A and P-by-N real matrix B:
+*> of an M-by-N complex matrix A and P-by-N complex matrix B:
 *>
 *>       U1**T*A*Q = D1*( 0 R ),    U2**T*B*Q = D2*( 0 R )
 *>
@@ -53,7 +53,7 @@
 *> compute the GSVD.
 *>
 *> Let L be the effective numerical rank of the matrix (A**T,B**T)**T,
-*> then R is a L-by-L nonsingular upper triangular matrix, D1 and
+*> then R is an L-by-L nonsingular upper triangular matrix, D1 and
 *> D2 are M-by-L and P-by-L "diagonal" matrices and of the
 *> following structures, respectively:
 *>

From 512163197b8d916c30c29f530d7b573eb9283878 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Wed, 22 Apr 2020 15:01:33 +0200
Subject: [PATCH 036/101] Add ZGGQRCS, COMPLEX*16 GSVD via QR, CSD

---
 SRC/CMakeLists.txt |   2 +-
 SRC/zggqrcs.f      | 678 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 679 insertions(+), 1 deletion(-)
 create mode 100644 SRC/zggqrcs.f

diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index 627954d906..2622423ba3 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -375,7 +375,7 @@ set(ZLASRC
    zgetri.f zgetrs.f
    zggbak.f zggbal.f
    zgges.f  zgges3.f zggesx.f zggev.f  zggev3.f zggevx.f
-   zggglm.f zgghrd.f zgghd3.f zgglse.f zggqrf.f zggrqf.f
+   zggglm.f zgghrd.f zgghd3.f zgglse.f zggqrf.f zggrqf.f zggqrcs.f
    zggsvd3.f zggsvp3.f
    zgtcon.f zgtrfs.f zgtsv.f  zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f
    zhbevd.f zhbevx.f zhbgst.f zhbgv.f  zhbgvd.f zhbgvx.f zhbtrd.f
diff --git a/SRC/zggqrcs.f b/SRC/zggqrcs.f
new file mode 100644
index 0000000000..d9e49a25b2
--- /dev/null
+++ b/SRC/zggqrcs.f
@@ -0,0 +1,678 @@
+*> \brief <b> ZGGQRCS computes the singular value decomposition (SVD) for OTHER matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZGGQRCS + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/sggqrcs.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/sggqrcs.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/sggqrcs.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
+*                           A, LDA, B, LDB,
+*                           THETA, U1, LDU1, U2, LDU2, QT, LDQT,
+*                           WORK, LWORK, RWORK, LRWORK, IWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBU1, JOB2, JOBQT
+*       INTEGER            INFO, LDA, LDB, LDU1, LDU2, LDQT,
+*      $                   M, N, P, L, LWORK, LRWORK
+*       DOUBLE PRECISION   W
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IWORK( * )
+*       DOUBLE PRECISION   THETA( * ), RWORK( * )
+*       COMPLEX*16         A( LDA, * ), B( LDB, * ),
+*      $                   U1( LDU1, * ), U2( LDU2, * ), QT( LDQ, * ),
+*      $                   WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZGGQRCS computes the generalized singular value decomposition (GSVD)
+*> of an M-by-N complex matrix A and P-by-N complex matrix B:
+*>
+*>       U1**T*A*Q = D1*( 0 R ),    U2**T*B*Q = D2*( 0 R )
+*>
+*> where U1, U2, and Q are orthogonal matrices. ZGGQRCS uses the QR
+*> factorization with column pivoting and the 2-by-1 CS decomposition to
+*> compute the GSVD.
+*>
+*> Let L be the effective numerical rank of the matrix (A**T,B**T)**T,
+*> then R is an L-by-L nonsingular upper triangular matrix, D1 and
+*> D2 are M-by-L and P-by-L "diagonal" matrices and of the
+*> following structures, respectively:
+*>
+*>                        K   K1
+*>        D1 =     (  0   0   0 )
+*>              K  (  0   S   0 )
+*>              K1 (  0   0   I )
+*>
+*>                    K2  K
+*>        D2 =  K2 (  I   0   0 )
+*>              K  (  0   C   0 )
+*>                 (  0   0   0 )
+*>
+*>                 N-L  L
+*>   ( 0 R ) = L (  0   R )
+*>
+*> where
+*>
+*>   K  = MIN(M, P, L, M + P - L),
+*>   K1 = MAX(L - P, 0),
+*>   K2 = MAX(L - M, 0),
+*>   C  = diag( COS(THETA(1)), ..., COS(THETA(K)) ),
+*>   S  = diag( SIN(THETA(1)), ..., SIN(THETA(K)) ), and
+*>   C^2 + S^2 = I.
+*>
+*> The routine computes C, S, R, and optionally the orthogonal
+*> transformation matrices U, V and Q. If L <= M, then R is stored in
+*> A(1:L, 1:L) on exit. Otherwise, the first M rows of R are stored in
+*> A(:, 1:L) and R( M+1:, M+1: ) is stored in B(1:L-M, 1:L-M). In both
+*> cases, only the upper triangular part is stored.
+*>
+*> In particular, if B is an N-by-N nonsingular matrix, then the GSVD of
+*> A and B implicitly gives the SVD of A*inv(B):
+*>                      A*inv(B) = U1*(D1*inv(D2))*U2**T.
+*> If (A**T,B**T)**T  has orthonormal columns, then the GSVD of A and B
+*> is also equal to the CS decomposition of A and B. Furthermore, the
+*> GSVD can be used to derive the solution of the eigenvalue problem:
+*>                      A**T*A x = lambda * B**T*B x.
+*> In some literature, the GSVD of A and B is presented in the form
+*>                  U1**T*A*X = ( 0 D1 ),   U2**T*B*X = ( 0 D2 )
+*> where U1 and U2 are orthogonal and X is nonsingular, D1 and D2 are
+*> ``diagonal''.  The former GSVD form can be converted to the latter
+*> form by taking the nonsingular matrix X as
+*>
+*>                      X = Q*( I   0    )
+*>                            ( 0 inv(R) ).
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBU1
+*> \verbatim
+*>          JOBU1 is CHARACTER*1
+*>          = 'Y':  Orthogonal matrix U1 is computed;
+*>          = 'N':  U1 is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBU2
+*> \verbatim
+*>          JOBU2 is CHARACTER*1
+*>          = 'Y':  Orthogonal matrix U2 is computed;
+*>          = 'N':  U2 is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBQT
+*> \verbatim
+*>          JOBQT is CHARACTER*1
+*>          = 'Y':  Orthogonal matrix Q is computed;
+*>          = 'N':  Q is not computed.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 1.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrices A and B.  N >= 1.
+*> \endverbatim
+*>
+*> \param[in] P
+*> \verbatim
+*>          P is INTEGER
+*>          The number of rows of the matrix B.  P >= 1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is DOUBLE PRECISION
+*>
+*>          On exit, W is a radix power chosen such that the Frobenius
+*>          norm of A and W*B are within sqrt(radix) and 1/sqrt(radix)
+*>          of each other.
+*> \endverbatim
+*>
+*> \param[out] L
+*> \verbatim
+*>          L is INTEGER
+*>          On exit, the effective numerical rank of the matrix
+*>          (A**T, B**T)**T.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N)
+*>          On entry, the M-by-N matrix A.
+*>          On exit, A contains the triangular matrix R or the first M
+*>          rows of R, respectively. See Purpose for details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is COMPLEX*16 array, dimension (LDB,N)
+*>          On entry, the P-by-N matrix B.
+*>          On exit, if L > M, then B contains the last L - M rows of
+*>          the triangular matrix R. See Purpose for details.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B. LDB >= max(1,P).
+*> \endverbatim
+*>
+*> \param[out] THETA
+*> \verbatim
+*>          THETA is DOUBLE PRECISION array, dimension (N)
+*>
+*>          On exit, THETA contains K = MIN(M, P, L, M + P - L) values
+*>          in radians in ascending order.
+*> \endverbatim
+*>
+*> \param[out] U1
+*> \verbatim
+*>          U1 is COMPLEX*16 array, dimension (LDU1,M)
+*>          If JOBU1 = 'Y', U1 contains the M-by-M orthogonal matrix U1.
+*>          If JOBU1 = 'N', U1 is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDU1
+*> \verbatim
+*>          LDU1 is INTEGER
+*>          The leading dimension of the array U1. LDU1 >= max(1,M) if
+*>          JOBU1 = 'Y'; LDU1 >= 1 otherwise.
+*> \endverbatim
+*>
+*> \param[out] U2
+*> \verbatim
+*>          U2 is COMPLEX*16 array, dimension (LDU2,P)
+*>          If JOBU2 = 'Y', U2 contains the P-by-P orthogonal matrix U2.
+*>          If JOBU2 = 'N', U2 is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDU2
+*> \verbatim
+*>          LDU2 is INTEGER
+*>          The leading dimension of the array U2. LDU2 >= max(1,P) if
+*>          JOBU2 = 'Y'; LDU2 >= 1 otherwise.
+*> \endverbatim
+*>
+*> \param[out] QT
+*> \verbatim
+*>          QT is COMPLEX*16 array, dimension (LDQT,N)
+*>          If JOBQT = 'Y', QT contains the N-by-N orthogonal matrix
+*>          Q**T.
+*> \endverbatim
+*>
+*> \param[in] LDQT
+*> \verbatim
+*>          LDQT is INTEGER
+*>          The leading dimension of the array QT. LDQT >= max(1,N) if
+*>          JOBQT = 'Y'; LDQT >= 1 otherwise.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal size of the WORK array,
+*>          returns this value as the first entry of the WORK array, and
+*>          no error message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (MAX(1,LRWORK))
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*>          LRWORK is INTEGER
+*>          The dimension of the array RWORK.
+*>
+*>          If LRWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the RWORK array, returns
+*>          this value as the first entry of the work array, and no error
+*>          message related to LRWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (M + N + P)
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>          > 0:  ZBBCSD did not converge. For further details, see
+*>                subroutine ZUNCSDBY1.
+*> \endverbatim
+*
+*> \par Internal Parameters:
+*  =========================
+*>
+*> \verbatim
+*>  TOL     DOUBLE PRECISION
+*>          Let G = (A**T,B**T)**T. TOL is the threshold to determine
+*>          the effective rank of G. Generally, it is set to
+*>                   TOL = MAX(M,P,N) * norm(G) * MACHEPS,
+*>          where norm(G) is the Frobenius norm of G.
+*>          The size of TOL may affect the size of backward error of the
+*>          decomposition.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Christoph Conrads (https://christoph-conrads.name)
+*
+*> \date April 2020
+*
+*> \ingroup complex16OTHERcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*>     Christoph Conrads (https://christoph-conrads.name)
+*>
+*
+*> \par Further Details:
+*  =====================
+*>
+*>  ZGGQRCS should be significantly faster than ZGGSVD and ZGGSVD3 for
+*>  large matrices because the matrices A and B are reduced to a pair of
+*>  well-conditioned bidiagonal matrices instead of pairs of upper
+*>  triangular matrices. On the downside, ZGGQRCS requires a much larger
+*>  workspace whose dimension must be queried at run-time.
+*>
+*  =====================================================================
+      SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
+     $                    A, LDA, B, LDB,
+     $                    THETA, U1, LDU1, U2, LDU2, QT, LDQT,
+     $                    WORK, LWORK, RWORK, LRWORK, IWORK, INFO )
+*
+*  -- LAPACK driver routine (version 3.X.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
+*     <DATE>
+*
+      IMPLICIT NONE
+*     .. Scalar Arguments ..
+      CHARACTER          JOBU1, JOBU2, JOBQT
+      INTEGER            INFO, LDA, LDB, LDU1, LDU2, LDQT,
+     $                   L, M, N, P, LWORK, LRWORK
+      DOUBLE PRECISION   W
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * )
+      DOUBLE PRECISION   THETA( * ), RWORK( * )
+      COMPLEX*16         A( LDA, * ), B( LDB, * ),
+     $                   U1( LDU1, * ), U2( LDU2, * ), QT( LDQT, * ),
+     $                   WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Local Scalars ..
+      LOGICAL            WANTU1, WANTU2, WANTQT, LQUERY
+      INTEGER            I, J, LMAX, Z, LDG, LWKOPT, LRWKOPT,
+     $                   LRWORK2BY1
+      DOUBLE PRECISION   GNORM, TOL, ULP, UNFL, NORMA, NORMB, BASE, NAN
+      COMPLEX*16         ZERO, ONE, ZNAN
+*     .. Local Arrays ..
+      COMPLEX*16         G( M + P, N )
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      DOUBLE PRECISION   DLAMCH, ZLANGE
+      EXTERNAL           LSAME, DLAMCH, ZLANGE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZGEMM, ZGEQP3, ZGERQF, ZLACPY, ZLAPMT, ZLASCL,
+     $                   ZLASET, ZUNGQR, ZUNGRQ, ZUNCSD2BY1, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Decode and test the input parameters
+*
+      WANTU1 = LSAME( JOBU1, 'Y' )
+      WANTU2 = LSAME( JOBU2, 'Y' )
+      WANTQT = LSAME( JOBQT, 'Y' )
+      LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 )
+      LWKOPT = 1
+      LRWKOPT = 2*N
+*
+*     Initialize variables
+*
+      L = 0
+      LMAX = MIN( M + P, N )
+      Z = ( M + P ) * N
+      IF ( LQUERY ) THEN
+         G = 0
+      ELSE
+         G = WORK( 1 )
+      END IF
+      LDG = M + P
+      ZERO = (0.0D0, 0.0D0)
+      ONE = (1.0D0, 0.0D0)
+*     Computing 0.0 / 0.0 directly causes compiler errors
+      NAN = 1.0D0
+      NAN = 0.0 / (NAN - 1.0D0)
+      ZNAN = DCMPLX(NAN,NAN)
+*
+*     Test the input arguments
+*
+      INFO = 0
+      IF( .NOT.( WANTU1 .OR. LSAME( JOBU1, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( WANTU2 .OR. LSAME( JOBU2, 'N' ) ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( WANTQT .OR. LSAME( JOBQT, 'N' ) ) ) THEN
+         INFO = -3
+      ELSE IF( M.LT.1 ) THEN
+         INFO = -4
+      ELSE IF( N.LT.1 ) THEN
+         INFO = -5
+      ELSE IF( P.LT.1 ) THEN
+         INFO = -6
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -10
+      ELSE IF( LDB.LT.MAX( 1, P ) ) THEN
+         INFO = -12
+      ELSE IF( LDU1.LT.1 .OR. ( WANTU1 .AND. LDU1.LT.M ) ) THEN
+         INFO = -15
+      ELSE IF( LDU2.LT.1 .OR. ( WANTU2 .AND. LDU2.LT.P ) ) THEN
+         INFO = -17
+      ELSE IF( LDQT.LT.1 .OR. ( WANTQT .AND. LDQT.LT.N ) ) THEN
+         INFO = -19
+      ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
+         INFO = -23
+      ELSE IF( LRWORK.LT.2*N .AND. .NOT.LQUERY ) THEN
+         INFO = -25
+      END IF
+*
+*     Compute optimal workspace size
+*
+      IF( INFO.EQ.0 ) THEN
+*        ZGEQP3, ZUNGQR read/store LMAX scalar factors
+         CALL ZGEQP3( M+P, N, G, LDG, IWORK, WORK,
+     $                WORK, -1, RWORK, INFO )
+         LWKOPT = INT( WORK( 1 ) ) + LMAX
+
+         CALL ZUNGQR( M + P, LMAX, LMAX, G, LDG, WORK, WORK, -1, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
+
+         CALL ZUNCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, LMAX,
+     $                    G, LDG, G, LDG,
+     $                    THETA, U2, LDU2, U1, LDU1, QT, LDQT,
+     $                    WORK, -1, RWORK, LRWORK, IWORK, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
+*        The matrix (A, B) must be stored sequentially for xUNCSD2BY1
+         LWKOPT = Z + LWKOPT
+*        Adjust ZUNCSD2BY1 LRWORK for case with maximum memory
+*        consumption
+         LRWORK2BY1 = INT( RWORK(1) )
+*        Select safe xUNCSD2BY1 IBBCSD value
+     $                - 9 * MAX( 0, MIN( M, P, N, M+P-N-1 ) )
+     $                + 9 * MAX( 1, MIN( M, P, N ) )
+*        Select safe xUNCSD2BY1 LBBCSD value
+     $                - 8 * MAX( 0, MIN( M, P, N, M+P-N ) )
+     $                + 8 * MIN( M, P, N )
+         LRWKOPT = MAX( 2*N, LRWORK2BY1 )
+
+*        ZGERQF, ZUNGRQ read/store up to LMAX scalar factors
+         CALL ZGERQF( LMAX, N, QT, LDQT, WORK, WORK, -1, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
+
+         CALL ZUNGRQ( N, N, LMAX, QT, LDQT, WORK, WORK, -1, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
+
+         WORK( 1 ) = DCMPLX( DBLE( LWKOPT ), 0.0D0 )
+         RWORK( 1 ) = DBLE( LRWKOPT )
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZGGQRCS', -INFO )
+         RETURN
+      END IF
+      IF( LQUERY ) THEN
+         RETURN
+      ENDIF
+*
+*     DEBUG
+*
+      IWORK( 1:M+N+P ) = -1
+*
+*     Scale matrix B such that norm(A) \approx norm(B)
+*
+      NORMA = ZLANGE( 'F', M, N, A, LDA, RWORK )
+      NORMB = ZLANGE( 'F', P, N, B, LDB, RWORK )
+*
+      IF ( NORMB.EQ.0 ) THEN
+         W = 1.0D0
+      ELSE
+         BASE = DLAMCH( 'B' )
+         W = BASE ** INT( LOG( NORMA / NORMB ) / LOG( BASE ) )
+*
+         CALL ZLASCL( 'G', -1, -1, 1.0D0, W, P, N, B, LDB, INFO )
+         IF ( INFO.NE.0 ) THEN
+            RETURN
+         END IF
+      END IF
+*
+*     Copy matrices A, B into the (M+P) x n matrix G
+*
+      CALL ZLACPY( 'A', M, N, A, LDA, G( P + 1, 1 ), LDG )
+      CALL ZLACPY( 'A', P, N, B, LDB, G( 1, 1 ), LDG )
+*
+*     DEBUG
+*
+      CALL ZLASET( 'A', M, N, ZNAN, ZNAN, A, LDA )
+      CALL ZLASET( 'A', P, N, ZNAN, ZNAN, B, LDB )
+*
+*     Compute the Frobenius norm of matrix G
+*
+      GNORM = ZLANGE( 'F', M + P, N, G, LDG, WORK( Z + 1 ) )
+*
+*     Get machine precision and set up threshold for determining
+*     the effective numerical rank of the matrix G.
+*
+      ULP = DLAMCH( 'Precision' )
+      UNFL = DLAMCH( 'Safe Minimum' )
+      TOL = MAX( M + P, N ) * MAX( GNORM, UNFL ) * ULP
+*
+*     IWORK stores the column permutations computed by ZGEQP3.
+*     Columns J where IWORK( J ) is non-zero are permuted to the front
+*     so we set the all entries to zero here.
+*
+      IWORK( 1:N ) = 0
+*
+*     Compute the QR factorization with column pivoting GΠ = Q1 R1
+*
+      CALL ZGEQP3( M + P, N, G, LDG, IWORK, WORK( Z + 1 ),
+     $             WORK( Z + LMAX + 1 ), LWORK - Z - LMAX, RWORK, INFO )
+      IF( INFO.NE.0 ) THEN
+         RETURN
+      END IF
+*
+*     Determine the rank of G
+*
+      DO 20 I = 1, LMAX
+         IF( ABS( G( I, I ) ).LE.TOL ) THEN
+            EXIT
+         END IF
+         L = L + 1
+   20 CONTINUE
+*
+*     Handle rank=0 case
+*
+      IF( L.EQ.0 ) THEN
+         IF( WANTU1 ) THEN
+            CALL ZLASET( 'A', M, M, ZERO, ONE, U1, LDU1 )
+         END IF
+         IF( WANTU2 ) THEN
+            CALL ZLASET( 'A', P, P, ZERO, ONE, U2, LDU2 )
+         END IF
+         IF( WANTQT ) THEN
+            CALL ZLASET( 'A', N, N, ZERO, ONE, QT, LDQT )
+         END IF
+*
+         WORK( 1 ) = DCMPLX( DBLE(LWKOPT), 0.0D0 )
+         RWORK( 1 ) = DBLE(LRWKOPT)
+         RETURN
+      END IF
+*
+*     Copy R1( 1:L, : ) into A, B and set lower triangular part to zero
+*
+      IF( L.LE.M ) THEN
+          CALL ZLACPY( 'U', L, N, G, LDG, A, LDA )
+          CALL ZLASET( 'L', L - 1, N, ZERO, ZERO, A( 2, 1 ), LDA )
+      ELSE
+          CALL ZLACPY( 'U', M, N, G, LDG, A, LDA )
+          CALL ZLACPY( 'U', L - M, N - M, G( M+1, M+1 ), LDG, B, LDB )
+*
+          CALL ZLASET( 'L', M - 1, N, ZERO, ZERO, A( 2, 1 ), LDA )
+          CALL ZLASET( 'L', L-M-1, N, ZERO, ZERO, B( 2, 1 ), LDB )
+      END IF
+*
+*     Explicitly form Q1 so that we can compute the CS decomposition
+*
+      CALL ZUNGQR( M + P, L, L, G, LDG, WORK( Z + 1 ),
+     $             WORK( Z + L + 1 ), LWORK - Z - L, INFO )
+      IF ( INFO.NE.0 ) THEN
+         RETURN
+      END IF
+*
+*     DEBUG
+*
+      RWORK( 1:LRWORK ) = NAN
+      WORK( Z+1:LWORK ) = ZNAN
+*
+*     Compute the CS decomposition of Q1( :, 1:L )
+*
+      CALL ZUNCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, L,
+     $                 G( 1, 1 ), LDG, G( P + 1, 1 ), LDG, THETA,
+     $                 U2, LDU2, U1, LDU1, QT, LDQT,
+     $                 WORK( Z + 1 ), LWORK - Z,
+     $                 RWORK, LRWORK, IWORK( N + 1 ), INFO )
+      IF( INFO.NE.0 ) THEN
+         RETURN
+      END IF
+*
+*     DEBUG
+*
+      WORK( 1:LWORK ) = ZNAN
+      RWORK( 1:LRWORK ) = NAN
+*
+*     Copy V^T from QT to G
+*
+      CALL ZLACPY( 'A', L, L, QT, LDQT, G, LDG )
+*
+*     DEBUG
+*
+      CALL ZLASET( 'A', N, N, ZNAN, ZNAN, QT, LDQT )
+*
+*     Compute V^T R1( 1:L, : ) in the last L rows of QT
+*
+      IF ( L.LE.M ) THEN
+         CALL ZGEMM( 'N', 'N', L, N, L, ONE, G, LDG,
+     $               A, LDA, ZERO, QT( N-L+1, 1 ), LDQT )
+      ELSE
+         CALL ZGEMM( 'N', 'N', L, N, M, ONE, G( 1, 1 ), LDG,
+     $               A, LDA, ZERO, QT( N-L+1, 1 ), LDQT )
+         CALL ZGEMM( 'N', 'N', L, N - M, L - M, ONE,
+     $               G( 1, M + 1 ), LDG, B, LDB,
+     $               ONE, QT( N-L+1, M+1 ), LDQT )
+      END IF
+*
+*     DEBUG
+*
+      CALL ZLASET( 'A', M, N, ZNAN, ZNAN, A, LDA )
+      CALL ZLASET( 'A', P, N, ZNAN, ZNAN, B, LDB )
+      WORK(1:LWORK) = ZNAN
+*
+*     Compute the RQ decomposition of V^T R1( 1:L, : )
+*
+      CALL ZGERQF( L, N, QT( N-L+1, 1 ), LDQT, WORK( 1 ),
+     $             WORK( L + 1 ), LWORK - L, INFO )
+      IF ( INFO.NE.0 ) THEN
+         RETURN
+      END IF
+*
+*     Copy matrix L from QT( N-L+1:N, N-L+1:N ) to A, B
+*
+      IF ( L.LE.M ) THEN
+         CALL ZLACPY( 'U', L, L, QT( N-L+1, N-L+1 ), LDQT, A, LDA )
+      ELSE
+         CALL ZLACPY( 'U', M,     L, QT( N-L+1, N-L+1 ), LDQT, A, LDA )
+         CALL ZLACPY( 'U', L - M, L - M, QT( N-L+M+1, N-L+M+1 ), LDQT,
+     $                B, LDB )
+      END IF
+*
+*     DEBUG
+*
+      CALL ZLASET( 'U', L, L, ZNAN, ZNAN, QT( 1, N-L+1 ), LDQT )
+      WORK( L+1:LWORK ) = ZNAN
+*
+*     Explicitly form Q^T
+*
+      IF( WANTQT ) THEN
+         CALL ZUNGRQ( N, N, L, QT, LDQT, WORK,
+     $                WORK( L + 1 ), LWORK - L, INFO )
+         IF ( INFO.NE.0 ) THEN
+            RETURN
+         END IF
+*
+*     Revert column permutation Π by permuting the rows of Q^T
+*
+         CALL ZLAPMT( .FALSE., N, N, QT, LDQT, IWORK )
+      END IF
+*
+      WORK( 1 ) = DCMPLX( DBLE(LWKOPT), 0.0D0 )
+      RWORK( 1 ) = DBLE(LRWKOPT)
+
+      RETURN
+*
+*     End of ZGGQRCS
+*
+      END

From 846a97a3660668e92ec018d1426f1e351b9f6cc3 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Wed, 22 Apr 2020 15:08:55 +0200
Subject: [PATCH 037/101] xGGQRCS: complete list of external subroutines

---
 SRC/cggqrcs.f | 4 ++--
 SRC/dggqrcs.f | 4 ++--
 SRC/sggqrcs.f | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index 866d21eeb2..498c1b5d2f 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -365,8 +365,8 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       EXTERNAL           LSAME, SLAMCH, CLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLACPY, CLASCL, CGEQP3, CUNGQR, CGERQF, CUNGRQ,
-     $                   CUNCSD2BY1, XERBLA
+      EXTERNAL           CGEMM, CGEQP3, CGERQF, CLACPY, CLAPMT, CLASCL,
+     $                   CLASET, CUNGQR, CUNGRQ, CUNCSD2BY1, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index d7bd4e4cfd..8ed449da57 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -345,8 +345,8 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       EXTERNAL           LSAME, DLAMCH, DLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLACPY, DLASCL, DGEQP3, DORGQR, DGERQF, QORGRQ,
-     $                   DORCSD2BY1, XERBLA
+      EXTERNAL           DGEMM, DGEQP3, DGERQF, DLACPY, DLAPMT, DLASCL,
+     $                   DLASET, DORGQR, DORGRQ, DORCSD2BY1, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index fc8e0cd66d..0ef872b418 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -345,8 +345,8 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       EXTERNAL           LSAME, SLAMCH, SLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLACPY, SLASCL, SGEQP3, SORGQR, SGERQF, SORGRQ,
-     $                   SORCSD2BY1, XERBLA
+      EXTERNAL           SGEMM, SGEQP3, SGERQF, SLACPY, SLAPMT, SLASCL,
+     $                   SLASET, SORGQR, SORGRQ, SORCSD2BY1, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN

From bf662501323588864b20dac3327e4c2570ed4a67 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Tue, 28 Apr 2020 15:25:48 +0200
Subject: [PATCH 038/101] SGGQRCS: fix generalized singular values

In an attempt to avoid a large backward error, xGGQRCS scales on of the
input matrices so that both input matrices have comparable norm. This
changes (not perturb, changes!) the generalized singular values. The
previously committed test focusing on the singular values computed by
xGGQRCS highlighted the need to fix the singular values.

With this commit, it became obvious that
* the singular values can be computed to very high relative accuracy
  (small forward error),
* the matrices computed by xGGQRCS change significantly with scaling
  (large backward error).

Consequently, matrix scaling cannot be used and row sorting must be
applied to the matrix `(A, B)` before the initial QR decomposition.
---
 SRC/sggqrcs.f | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 0ef872b418..934a2b0685 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -341,7 +341,7 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, TAN, ATAN
       EXTERNAL           LSAME, SLAMCH, SLANGE
 *     ..
 *     .. External Subroutines ..
@@ -626,6 +626,16 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
          CALL SLAPMT( .FALSE., N, N, QT, LDQT, IWORK )
       END IF
+*
+*     Adjust generalized singular values for matrix scaling
+*
+      DO I = 1, MIN( M, P, L, M + P - L )
+*        Do not adjust singular value if THETA(i) is greater than pi/2
+*        (i.e. TAN(THETA(I)) < 0)
+         IF ( TAN( THETA(I) ) >= 0 ) THEN
+            THETA(I) = ATAN( W * TAN( THETA(I) ) )
+         END IF
+      END DO
 *
       WORK( 1 ) = REAL( LWKOPT )
       RETURN

From 366de5ec91f18c0cb4dbac3318e9e289edc9701e Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Tue, 28 Apr 2020 15:44:48 +0200
Subject: [PATCH 039/101] Revert "SGGQRCS: fix generalized singular values"

---
 SRC/sggqrcs.f | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 934a2b0685..0ef872b418 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -341,7 +341,7 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANGE, TAN, ATAN
+      REAL               SLAMCH, SLANGE
       EXTERNAL           LSAME, SLAMCH, SLANGE
 *     ..
 *     .. External Subroutines ..
@@ -626,16 +626,6 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
          CALL SLAPMT( .FALSE., N, N, QT, LDQT, IWORK )
       END IF
-*
-*     Adjust generalized singular values for matrix scaling
-*
-      DO I = 1, MIN( M, P, L, M + P - L )
-*        Do not adjust singular value if THETA(i) is greater than pi/2
-*        (i.e. TAN(THETA(I)) < 0)
-         IF ( TAN( THETA(I) ) >= 0 ) THEN
-            THETA(I) = ATAN( W * TAN( THETA(I) ) )
-         END IF
-      END DO
 *
       WORK( 1 ) = REAL( LWKOPT )
       RETURN

From 0f06a18cd3f14b86d989b75a52120513ee2bb162 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Tue, 28 Apr 2020 20:35:46 +0200
Subject: [PATCH 040/101] Add SLASRTI sorting indices based on numbers

xLASRT sorts an array of real values. xLASRTI sorts an array of indices
referencing real values in an array.

SLASRTI is based on SLASRT.
---
 SRC/CMakeLists.txt |   2 +-
 SRC/slasrti.f      | 306 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 307 insertions(+), 1 deletion(-)
 create mode 100644 SRC/slasrti.f

diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index 2622423ba3..05e73a7935 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -54,7 +54,7 @@ set(SCLAUX
    slasd0.f slasd1.f slasd2.f slasd3.f slasd4.f slasd5.f slasd6.f
    slasd7.f slasd8.f slasda.f slasdq.f slasdt.f
    slaset.f slasq1.f slasq2.f slasq3.f slasq4.f slasq5.f slasq6.f
-   slasr.f  slasrt.f slassq.f90 slasv2.f spttrf.f sstebz.f sstedc.f
+   slasr.f  slasrt.f slasrti.f slassq.f90 slasv2.f spttrf.f sstebz.f sstedc.f
    ssteqr.f ssterf.f slaisnan.f sisnan.f
    slartgp.f slartgs.f
    ${SECOND_SRC})
diff --git a/SRC/slasrti.f b/SRC/slasrti.f
new file mode 100644
index 0000000000..2eb004dcdb
--- /dev/null
+++ b/SRC/slasrti.f
@@ -0,0 +1,306 @@
+*> \brief \b SLASRTI sorts array indices based on the referenced numbers
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SLASRTI + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/slasrti.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/slasrti.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/slasrti.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SLASRTI( ID, N, X, INDICES, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          ID
+*       INTEGER            INFO, N
+*       ..
+*       .. Array Arguments ..
+*       REAL               X( * )
+*       INTEGER            INDICES( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> Sort the numbers in X in increasing order (if ID = 'I') or
+*> in decreasing order (if ID = 'X' ).
+*>
+*> Use Quick Sort, reverting to Insertion sort on arrays of
+*> size <= 20. Dimension of STACK limits N to about 2**32.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] ID
+*> \verbatim
+*>          ID is CHARACTER*1
+*>          = 'I': sort X in increasing order;
+*>          = 'X': sort X in decreasing order.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The length of the array X.
+*> \endverbatim
+*>
+*> \param[in,out] X
+*> \verbatim
+*>          X is REAL array, dimension (N)
+*>          On entry, the array to be sorted.
+*>          On exit, X has been sorted into increasing order
+*>          (X(1) <= ... <= X(N) ) or into decreasing order
+*>          (X(1) >= ... >= X(N) ), depending on ID.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Christoph Conrads (https://christoph-conrads.name)
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date April 2020
+*
+*> \ingroup auxOTHERcomputational
+*
+*  =====================================================================
+      SUBROUTINE SLASRTI( ID, N, X, INDICES, INFO )
+*
+*  -- LAPACK computational routine (version TODO) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     TODO
+*
+*     .. Scalar Arguments ..
+      CHARACTER          ID
+      INTEGER            INFO, N
+*     ..
+*     .. Array Arguments ..
+      REAL               X( * )
+      INTEGER            INDICES( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            SELECT
+      PARAMETER          ( SELECT = 20 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            DIR, ENDD, I, J, START, STKPNT, TMP
+      REAL               P1, P2, P3, PIVOT
+*     ..
+*     .. Local Arrays ..
+      INTEGER            STACK( 2, 32 )
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      DIR = -1
+      IF( LSAME( ID, 'D' ) ) THEN
+         DIR = 0
+      ELSE IF( LSAME( ID, 'I' ) ) THEN
+         DIR = 1
+      END IF
+      IF( DIR.EQ.-1 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SLASRTI', -INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.LE.1 )
+     $   RETURN
+*
+      STKPNT = 1
+      STACK( 1, 1 ) = 1
+      STACK( 2, 1 ) = N
+   10 CONTINUE
+      START = STACK( 1, STKPNT )
+      ENDD = STACK( 2, STKPNT )
+      STKPNT = STKPNT - 1
+      IF( ENDD-START.LE.SELECT .AND. ENDD-START.GT.0 ) THEN
+*
+*        Do Insertion sort on X( START:ENDD )
+*
+         IF( DIR.EQ.0 ) THEN
+*
+*           Sort into decreasing order
+*
+            DO 30 I = START + 1, ENDD
+               DO 20 J = I, START + 1, -1
+                  IF( X( INDICES(J) ).GT.X( INDICES(J-1) ) ) THEN
+                     TMP = INDICES( J )
+                     INDICES( J ) = INDICES( J-1 )
+                     INDICES( J-1 ) = TMP
+                  ELSE
+                     GO TO 30
+                  END IF
+   20          CONTINUE
+   30       CONTINUE
+*
+         ELSE
+*
+*           Sort into increasing order
+*
+            DO 50 I = START + 1, ENDD
+               DO 40 J = I, START + 1, -1
+                  IF( X( INDICES(J) ).LT.X( INDICES(J-1) ) ) THEN
+                     TMP = INDICES( J )
+                     INDICES( J ) = INDICES( J-1 )
+                     INDICES( J-1 ) = TMP
+                  ELSE
+                     GO TO 50
+                  END IF
+   40          CONTINUE
+   50       CONTINUE
+*
+         END IF
+*
+      ELSE IF( ENDD-START.GT.SELECT ) THEN
+*
+*        Partition X( START:ENDD ) and stack parts, largest one first
+*
+*        Choose partition entry as median of 3
+*
+         P1 = X( INDICES(START) )
+         P2 = X( INDICES(ENDD) )
+         I = INDICES( ( START+ENDD ) / 2 )
+         P3 = X( INDICES(I) )
+         IF( P1.LT.P2 ) THEN
+            IF( P3.LT.P1 ) THEN
+               PIVOT = P1
+            ELSE IF( P3.LT.P2 ) THEN
+               PIVOT = P3
+            ELSE
+               PIVOT = P2
+            END IF
+         ELSE
+            IF( P3.LT.P2 ) THEN
+               PIVOT = P2
+            ELSE IF( P3.LT.P1 ) THEN
+               PIVOT = P3
+            ELSE
+               PIVOT = P1
+            END IF
+         END IF
+*
+         IF( DIR.EQ.0 ) THEN
+*
+*           Sort into decreasing order
+*
+            I = START - 1
+            J = ENDD + 1
+   60       CONTINUE
+   70       CONTINUE
+            J = J - 1
+            IF( X( INDICES(J) ).LT.PIVOT )
+     $         GO TO 70
+   80       CONTINUE
+            I = I + 1
+            IF( X( INDICES(I) ).GT.PIVOT )
+     $         GO TO 80
+            IF( I.LT.J ) THEN
+               TMP = INDICES( I )
+               INDICES( I ) = INDICES( J )
+               INDICES( J ) = TMP
+               GO TO 60
+            END IF
+            IF( J-START.GT.ENDD-J-1 ) THEN
+               STKPNT = STKPNT + 1
+               STACK( 1, STKPNT ) = START
+               STACK( 2, STKPNT ) = J
+               STKPNT = STKPNT + 1
+               STACK( 1, STKPNT ) = J + 1
+               STACK( 2, STKPNT ) = ENDD
+            ELSE
+               STKPNT = STKPNT + 1
+               STACK( 1, STKPNT ) = J + 1
+               STACK( 2, STKPNT ) = ENDD
+               STKPNT = STKPNT + 1
+               STACK( 1, STKPNT ) = START
+               STACK( 2, STKPNT ) = J
+            END IF
+         ELSE
+*
+*           Sort into increasing order
+*
+            I = START - 1
+            J = ENDD + 1
+   90       CONTINUE
+  100       CONTINUE
+            J = J - 1
+            IF( X( INDICES(J) ).GT.PIVOT )
+     $         GO TO 100
+  110       CONTINUE
+            I = I + 1
+            IF( X( INDICES(I) ).LT.PIVOT )
+     $         GO TO 110
+            IF( I.LT.J ) THEN
+               TMP = INDICES( I )
+               INDICES( I ) = INDICES( J )
+               INDICES( J ) = TMP
+               GO TO 90
+            END IF
+            IF( J-START.GT.ENDD-J-1 ) THEN
+               STKPNT = STKPNT + 1
+               STACK( 1, STKPNT ) = START
+               STACK( 2, STKPNT ) = J
+               STKPNT = STKPNT + 1
+               STACK( 1, STKPNT ) = J + 1
+               STACK( 2, STKPNT ) = ENDD
+            ELSE
+               STKPNT = STKPNT + 1
+               STACK( 1, STKPNT ) = J + 1
+               STACK( 2, STKPNT ) = ENDD
+               STKPNT = STKPNT + 1
+               STACK( 1, STKPNT ) = START
+               STACK( 2, STKPNT ) = J
+            END IF
+         END IF
+      END IF
+      IF( STKPNT.GT.0 )
+     $   GO TO 10
+      RETURN
+*
+*     End of SLASRTI
+*
+      END

From 29d3e3bded841ff1c17736016c008a8960645fd0 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Tue, 28 Apr 2020 20:49:55 +0200
Subject: [PATCH 041/101] SLASRTI: update documentation

---
 SRC/slasrti.f | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/SRC/slasrti.f b/SRC/slasrti.f
index 2eb004dcdb..ca94cd5e08 100644
--- a/SRC/slasrti.f
+++ b/SRC/slasrti.f
@@ -35,8 +35,8 @@
 *>
 *> \verbatim
 *>
-*> Sort the numbers in X in increasing order (if ID = 'I') or
-*> in decreasing order (if ID = 'X' ).
+*> Sort the numbers in X indirectly in increasing order (if ID = 'I') or
+*> in decreasing order (if ID = 'X' ) using the array of INDICES.
 *>
 *> Use Quick Sort, reverting to Insertion sort on arrays of
 *> size <= 20. Dimension of STACK limits N to about 2**32.
@@ -49,22 +49,31 @@
 *> \verbatim
 *>          ID is CHARACTER*1
 *>          = 'I': sort X in increasing order;
-*>          = 'X': sort X in decreasing order.
+*>          = 'D': sort X in decreasing order.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
-*>          The length of the array X.
+*>          The length of the array X and INDICES.
 *> \endverbatim
 *>
-*> \param[in,out] X
+*> \param[in] X
 *> \verbatim
 *>          X is REAL array, dimension (N)
-*>          On entry, the array to be sorted.
-*>          On exit, X has been sorted into increasing order
-*>          (X(1) <= ... <= X(N) ) or into decreasing order
-*>          (X(1) >= ... >= X(N) ), depending on ID.
+*>          The values to be sorted.
+*> \endverbatim
+*>
+*> \param[in,out] INDICES
+*> \verbatim
+*>          X is INTEGER array, dimension (N)
+*>          On entry, the indices of values in X to be sorted.
+*>          On exit, X has been sorted into
+*>          increasing order such that
+*>          X( INDICES(1) ) <= ... <= X( INDICES(N) )
+*>          or decreasing order such that
+*>          X( INDICES(1) ) >= ... >= X( INDICES(N) )
+*>          depending on ID.
 *> \endverbatim
 *>
 *> \param[out] INFO

From 1e512dbfe27bea38075e8c34172f59050ada5afd Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Tue, 28 Apr 2020 20:53:55 +0200
Subject: [PATCH 042/101] Add DLASRTI

---
 SRC/CMakeLists.txt |   2 +-
 SRC/dlasrti.f      | 315 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 316 insertions(+), 1 deletion(-)
 create mode 100644 SRC/dlasrti.f

diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index 05e73a7935..fab55246e7 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -73,7 +73,7 @@ set(DZLAUX
    dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f
    dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f
    dlaset.f dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f
-   dlasr.f  dlasrt.f dlassq.f90 dlasv2.f dpttrf.f dstebz.f dstedc.f
+   dlasr.f  dlasrt.f dlasrti.f dlassq.f90 dlasv2.f dpttrf.f dstebz.f dstedc.f
    dsteqr.f dsterf.f dlaisnan.f disnan.f
    dlartgp.f dlartgs.f
    ../INSTALL/dlamch.f ${DSECOND_SRC})
diff --git a/SRC/dlasrti.f b/SRC/dlasrti.f
new file mode 100644
index 0000000000..f44c473555
--- /dev/null
+++ b/SRC/dlasrti.f
@@ -0,0 +1,315 @@
+*> \brief \b DLASRTI sorts array indices based on the referenced numbers
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DLASRTI + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dlasrti.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dlasrti.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dlasrti.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DLASRTI( ID, N, X, INDICES, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          ID
+*       INTEGER            INFO, N
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   X( * )
+*       INTEGER            INDICES( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> Sort the numbers in X indirectly in increasing order (if ID = 'I') or
+*> in decreasing order (if ID = 'X' ) using the array of INDICES.
+*>
+*> Use Quick Sort, reverting to Insertion sort on arrays of
+*> size <= 20. Dimension of STACK limits N to about 2**32.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] ID
+*> \verbatim
+*>          ID is CHARACTER*1
+*>          = 'I': sort X in increasing order;
+*>          = 'D': sort X in decreasing order.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The length of the array X and INDICES.
+*> \endverbatim
+*>
+*> \param[in] X
+*> \verbatim
+*>          X is REAL array, dimension (N)
+*>          The values to be sorted.
+*> \endverbatim
+*>
+*> \param[in,out] INDICES
+*> \verbatim
+*>          X is INTEGER array, dimension (N)
+*>          On entry, the indices of values in X to be sorted.
+*>          On exit, X has been sorted into
+*>          increasing order such that
+*>          X( INDICES(1) ) <= ... <= X( INDICES(N) )
+*>          or decreasing order such that
+*>          X( INDICES(1) ) >= ... >= X( INDICES(N) )
+*>          depending on ID.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Christoph Conrads (https://christoph-conrads.name)
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date April 2020
+*
+*> \ingroup auxOTHERcomputational
+*
+*  =====================================================================
+      SUBROUTINE DLASRTI( ID, N, X, INDICES, INFO )
+*
+*  -- LAPACK computational routine (version TODO) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     TODO
+*
+*     .. Scalar Arguments ..
+      CHARACTER          ID
+      INTEGER            INFO, N
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   X( * )
+      INTEGER            INDICES( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            SELECT
+      PARAMETER          ( SELECT = 20 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            DIR, ENDD, I, J, START, STKPNT, TMP
+      DOUBLE PRECISION   P1, P2, P3, PIVOT
+*     ..
+*     .. Local Arrays ..
+      INTEGER            STACK( 2, 32 )
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      DIR = -1
+      IF( LSAME( ID, 'D' ) ) THEN
+         DIR = 0
+      ELSE IF( LSAME( ID, 'I' ) ) THEN
+         DIR = 1
+      END IF
+      IF( DIR.EQ.-1 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DLASRTI', -INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.LE.1 )
+     $   RETURN
+*
+      STKPNT = 1
+      STACK( 1, 1 ) = 1
+      STACK( 2, 1 ) = N
+   10 CONTINUE
+      START = STACK( 1, STKPNT )
+      ENDD = STACK( 2, STKPNT )
+      STKPNT = STKPNT - 1
+      IF( ENDD-START.LE.SELECT .AND. ENDD-START.GT.0 ) THEN
+*
+*        Do Insertion sort on X( START:ENDD )
+*
+         IF( DIR.EQ.0 ) THEN
+*
+*           Sort into decreasing order
+*
+            DO 30 I = START + 1, ENDD
+               DO 20 J = I, START + 1, -1
+                  IF( X( INDICES(J) ).GT.X( INDICES(J-1) ) ) THEN
+                     TMP = INDICES( J )
+                     INDICES( J ) = INDICES( J-1 )
+                     INDICES( J-1 ) = TMP
+                  ELSE
+                     GO TO 30
+                  END IF
+   20          CONTINUE
+   30       CONTINUE
+*
+         ELSE
+*
+*           Sort into increasing order
+*
+            DO 50 I = START + 1, ENDD
+               DO 40 J = I, START + 1, -1
+                  IF( X( INDICES(J) ).LT.X( INDICES(J-1) ) ) THEN
+                     TMP = INDICES( J )
+                     INDICES( J ) = INDICES( J-1 )
+                     INDICES( J-1 ) = TMP
+                  ELSE
+                     GO TO 50
+                  END IF
+   40          CONTINUE
+   50       CONTINUE
+*
+         END IF
+*
+      ELSE IF( ENDD-START.GT.SELECT ) THEN
+*
+*        Partition X( START:ENDD ) and stack parts, largest one first
+*
+*        Choose partition entry as median of 3
+*
+         P1 = X( INDICES(START) )
+         P2 = X( INDICES(ENDD) )
+         I = INDICES( ( START+ENDD ) / 2 )
+         P3 = X( INDICES(I) )
+         IF( P1.LT.P2 ) THEN
+            IF( P3.LT.P1 ) THEN
+               PIVOT = P1
+            ELSE IF( P3.LT.P2 ) THEN
+               PIVOT = P3
+            ELSE
+               PIVOT = P2
+            END IF
+         ELSE
+            IF( P3.LT.P2 ) THEN
+               PIVOT = P2
+            ELSE IF( P3.LT.P1 ) THEN
+               PIVOT = P3
+            ELSE
+               PIVOT = P1
+            END IF
+         END IF
+*
+         IF( DIR.EQ.0 ) THEN
+*
+*           Sort into decreasing order
+*
+            I = START - 1
+            J = ENDD + 1
+   60       CONTINUE
+   70       CONTINUE
+            J = J - 1
+            IF( X( INDICES(J) ).LT.PIVOT )
+     $         GO TO 70
+   80       CONTINUE
+            I = I + 1
+            IF( X( INDICES(I) ).GT.PIVOT )
+     $         GO TO 80
+            IF( I.LT.J ) THEN
+               TMP = INDICES( I )
+               INDICES( I ) = INDICES( J )
+               INDICES( J ) = TMP
+               GO TO 60
+            END IF
+            IF( J-START.GT.ENDD-J-1 ) THEN
+               STKPNT = STKPNT + 1
+               STACK( 1, STKPNT ) = START
+               STACK( 2, STKPNT ) = J
+               STKPNT = STKPNT + 1
+               STACK( 1, STKPNT ) = J + 1
+               STACK( 2, STKPNT ) = ENDD
+            ELSE
+               STKPNT = STKPNT + 1
+               STACK( 1, STKPNT ) = J + 1
+               STACK( 2, STKPNT ) = ENDD
+               STKPNT = STKPNT + 1
+               STACK( 1, STKPNT ) = START
+               STACK( 2, STKPNT ) = J
+            END IF
+         ELSE
+*
+*           Sort into increasing order
+*
+            I = START - 1
+            J = ENDD + 1
+   90       CONTINUE
+  100       CONTINUE
+            J = J - 1
+            IF( X( INDICES(J) ).GT.PIVOT )
+     $         GO TO 100
+  110       CONTINUE
+            I = I + 1
+            IF( X( INDICES(I) ).LT.PIVOT )
+     $         GO TO 110
+            IF( I.LT.J ) THEN
+               TMP = INDICES( I )
+               INDICES( I ) = INDICES( J )
+               INDICES( J ) = TMP
+               GO TO 90
+            END IF
+            IF( J-START.GT.ENDD-J-1 ) THEN
+               STKPNT = STKPNT + 1
+               STACK( 1, STKPNT ) = START
+               STACK( 2, STKPNT ) = J
+               STKPNT = STKPNT + 1
+               STACK( 1, STKPNT ) = J + 1
+               STACK( 2, STKPNT ) = ENDD
+            ELSE
+               STKPNT = STKPNT + 1
+               STACK( 1, STKPNT ) = J + 1
+               STACK( 2, STKPNT ) = ENDD
+               STKPNT = STKPNT + 1
+               STACK( 1, STKPNT ) = START
+               STACK( 2, STKPNT ) = J
+            END IF
+         END IF
+      END IF
+      IF( STKPNT.GT.0 )
+     $   GO TO 10
+      RETURN
+*
+*     End of DLASRTI
+*
+      END

From a238a655bad33717368dd5514b6329192068c65e Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Wed, 29 Apr 2020 17:09:38 +0200
Subject: [PATCH 043/101] Add SLASRTR, a function sorting rows by max norm

---
 SRC/CMakeLists.txt |   2 +-
 SRC/slasrtr.f      | 192 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 193 insertions(+), 1 deletion(-)
 create mode 100644 SRC/slasrtr.f

diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index fab55246e7..1838be28a3 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -54,7 +54,7 @@ set(SCLAUX
    slasd0.f slasd1.f slasd2.f slasd3.f slasd4.f slasd5.f slasd6.f
    slasd7.f slasd8.f slasda.f slasdq.f slasdt.f
    slaset.f slasq1.f slasq2.f slasq3.f slasq4.f slasq5.f slasq6.f
-   slasr.f  slasrt.f slasrti.f slassq.f90 slasv2.f spttrf.f sstebz.f sstedc.f
+   slasr.f  slasrt.f slasrti.f slasrtr.f slassq.f90 slasv2.f spttrf.f sstebz.f sstedc.f
    ssteqr.f ssterf.f slaisnan.f sisnan.f
    slartgp.f slartgs.f
    ${SECOND_SRC})
diff --git a/SRC/slasrtr.f b/SRC/slasrtr.f
new file mode 100644
index 0000000000..3f5824725e
--- /dev/null
+++ b/SRC/slasrtr.f
@@ -0,0 +1,192 @@
+*> \brief \b SLASRTR sorts array indices based on the referenced numbers
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SLASRTR + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/slasrtr.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/slasrtr.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/slasrtr.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SLASRTR( ID, M, N, A, LDA, IPVT, WORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          ID
+*       INTEGER            INFO, M, N, LDA
+*       ..
+*       .. Array Arguments ..
+*       REAL               A( LDA, * ), WORK( * )
+*       INTEGER            IPVT( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*> Apply row sorting based on the maximum norm of the rows of A. The
+*> rows can be sorted in increasing (if ID = 'I') or decreasing order
+*> (if ID = 'D').
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] ID
+*> \verbatim
+*>          ID is CHARACTER*1
+*>          = 'I': sort the rows of A by increasing row norm;
+*>          = 'D': sort the rows of A by decreasing row norm.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N)
+*>          On entry, the M by N matrix A.
+*>          On exit, A contains the row-permuted matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] IPVT
+*> \verbatim
+*>          IPVT is INTEGER array, dimension (M)
+*>          On exit, if IPVT(J)=K, then the J-th row of A*P was the
+*>          the K-th row of A.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (M).
+*>          On exit, WORK contains the maximum norms of the rows of A.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Christoph Conrads (https://christoph-conrads.name)
+*
+*> \date April 2020
+*
+*> \ingroup auxOTHERcomputational
+*
+*  =====================================================================
+      SUBROUTINE SLASRTR( ID, M, N, A, LDA, IPVT, WORK, INFO )
+*
+*  -- LAPACK computational routine (version TODO) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     TODO
+*
+*     .. Scalar Arguments ..
+      CHARACTER          ID
+      INTEGER            INFO, M, N, LDA
+*     ..
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), WORK( * )
+      INTEGER            IPVT( * )
+*     ..
+*
+*  =====================================================================
+*
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, J
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF( .NOT.LSAME( ID, 'I' ) .AND. .NOT.LSAME( ID, 'D' ) ) THEN
+         INFO = -1
+      ELSE IF( M.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -5
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SLASRTR', -INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( M.LE.1 .OR. N.LT.1 )
+     $   RETURN
+*
+*     Compute maximum norm of each row
+*
+      DO I = 1, M
+         WORK( I ) = ABS( A( I, 1 ) )
+      END DO
+*
+      DO I = 1, M
+         DO J = 2, N
+            WORK( I ) = MAX( WORK( I ), ABS( A( I, J ) ) )
+         END DO
+      END DO
+*
+*     Sort row indices
+*
+      DO I = 1, M
+         IPVT( I ) = I
+      END DO
+*
+      CALL SLASRTI( ID, M, WORK, IPVT, INFO )
+      IF( INFO.NE.0 ) THEN
+         RETURN
+      END IF
+*
+*     Sort rows
+*
+      CALL SLAPMR( .TRUE., M, N, A, LDA, IPVT )
+*
+*     End of SLASRTR
+*
+      END

From 7cc4495208362687d650c8b62fb7246053285ddd Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Wed, 29 Apr 2020 17:14:31 +0200
Subject: [PATCH 044/101] xGGQRCS: fix typos

---
 SRC/cggqrcs.f | 6 +++---
 SRC/zggqrcs.f | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index 498c1b5d2f..afc43eb0bf 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -7,11 +7,11 @@
 *
 *> \htmlonly
 *> Download CGGQRCS + dependencies
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/sggqrcs.f">
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cggqrcs.f">
 *> [TGZ]</a>
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/sggqrcs.f">
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cggqrcs.f">
 *> [ZIP]</a>
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/sggqrcs.f">
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cggqrcs.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
diff --git a/SRC/zggqrcs.f b/SRC/zggqrcs.f
index d9e49a25b2..8bee9e7542 100644
--- a/SRC/zggqrcs.f
+++ b/SRC/zggqrcs.f
@@ -7,11 +7,11 @@
 *
 *> \htmlonly
 *> Download ZGGQRCS + dependencies
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/sggqrcs.f">
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zggqrcs.f">
 *> [TGZ]</a>
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/sggqrcs.f">
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zggqrcs.f">
 *> [ZIP]</a>
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/sggqrcs.f">
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zggqrcs.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *

From 3bd430dd15ca15f8e5cbf8a74cd1e3c18e4b1fc4 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Wed, 29 Apr 2020 17:14:51 +0200
Subject: [PATCH 045/101] xLASRTI: fix typos

---
 SRC/dlasrti.f | 4 ++--
 SRC/slasrti.f | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/SRC/dlasrti.f b/SRC/dlasrti.f
index f44c473555..c128ac1d03 100644
--- a/SRC/dlasrti.f
+++ b/SRC/dlasrti.f
@@ -36,7 +36,7 @@
 *> \verbatim
 *>
 *> Sort the numbers in X indirectly in increasing order (if ID = 'I') or
-*> in decreasing order (if ID = 'X' ) using the array of INDICES.
+*> in decreasing order (if ID = 'D' ) using the array of INDICES.
 *>
 *> Use Quick Sort, reverting to Insertion sort on arrays of
 *> size <= 20. Dimension of STACK limits N to about 2**32.
@@ -60,7 +60,7 @@
 *>
 *> \param[in] X
 *> \verbatim
-*>          X is REAL array, dimension (N)
+*>          X is DOUBLE PRECISION array, dimension (N)
 *>          The values to be sorted.
 *> \endverbatim
 *>
diff --git a/SRC/slasrti.f b/SRC/slasrti.f
index ca94cd5e08..337fa580b0 100644
--- a/SRC/slasrti.f
+++ b/SRC/slasrti.f
@@ -36,7 +36,7 @@
 *> \verbatim
 *>
 *> Sort the numbers in X indirectly in increasing order (if ID = 'I') or
-*> in decreasing order (if ID = 'X' ) using the array of INDICES.
+*> in decreasing order (if ID = 'D' ) using the array of INDICES.
 *>
 *> Use Quick Sort, reverting to Insertion sort on arrays of
 *> size <= 20. Dimension of STACK limits N to about 2**32.

From ae68646e3f6c70c9f9ab754ff3e38b4f5b528676 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Wed, 29 Apr 2020 17:25:37 +0200
Subject: [PATCH 046/101] SLASRTR: list missing external subroutines

---
 SRC/slasrtr.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SRC/slasrtr.f b/SRC/slasrtr.f
index 3f5824725e..b32b7d967a 100644
--- a/SRC/slasrtr.f
+++ b/SRC/slasrtr.f
@@ -131,7 +131,7 @@ SUBROUTINE SLASRTR( ID, M, N, A, LDA, IPVT, WORK, INFO )
       EXTERNAL           LSAME
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           XERBLA
+      EXTERNAL           XERBLA, SLAPMR, SLASRTI
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX

From eb7f37e9ac110cb4f5f0e4194c0799526aad67c9 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Wed, 29 Apr 2020 17:34:54 +0200
Subject: [PATCH 047/101] Add DLASRTR

---
 SRC/CMakeLists.txt |   2 +-
 SRC/dlasrtr.f      | 192 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 193 insertions(+), 1 deletion(-)
 create mode 100644 SRC/dlasrtr.f

diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index 1838be28a3..cc23cbda13 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -73,7 +73,7 @@ set(DZLAUX
    dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f
    dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f
    dlaset.f dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f
-   dlasr.f  dlasrt.f dlasrti.f dlassq.f90 dlasv2.f dpttrf.f dstebz.f dstedc.f
+   dlasr.f  dlasrt.f dlasrti.f dlasrtr.f dlassq.f90 dlasv2.f dpttrf.f dstebz.f dstedc.f
    dsteqr.f dsterf.f dlaisnan.f disnan.f
    dlartgp.f dlartgs.f
    ../INSTALL/dlamch.f ${DSECOND_SRC})
diff --git a/SRC/dlasrtr.f b/SRC/dlasrtr.f
new file mode 100644
index 0000000000..308c97176d
--- /dev/null
+++ b/SRC/dlasrtr.f
@@ -0,0 +1,192 @@
+*> \brief \b DLASRTR sorts array indices based on the referenced numbers
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DLASRTR + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dlasrtr.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dlasrtr.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dlasrtr.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DLASRTR( ID, M, N, A, LDA, IPVT, WORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          ID
+*       INTEGER            INFO, M, N, LDA
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   A( LDA, * ), WORK( * )
+*       INTEGER            IPVT( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*> Apply row sorting based on the maximum norm of the rows of A. The
+*> rows can be sorted in increasing (if ID = 'I') or decreasing order
+*> (if ID = 'D').
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] ID
+*> \verbatim
+*>          ID is CHARACTER*1
+*>          = 'I': sort the rows of A by increasing row norm;
+*>          = 'D': sort the rows of A by decreasing row norm.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N)
+*>          On entry, the M by N matrix A.
+*>          On exit, A contains the row-permuted matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] IPVT
+*> \verbatim
+*>          IPVT is INTEGER array, dimension (M)
+*>          On exit, if IPVT(J)=K, then the J-th row of A*P was the
+*>          the K-th row of A.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension (M).
+*>          On exit, WORK contains the maximum norms of the rows of A.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Christoph Conrads (https://christoph-conrads.name)
+*
+*> \date April 2020
+*
+*> \ingroup auxOTHERcomputational
+*
+*  =====================================================================
+      SUBROUTINE DLASRTR( ID, M, N, A, LDA, IPVT, WORK, INFO )
+*
+*  -- LAPACK computational routine (version TODO) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     TODO
+*
+*     .. Scalar Arguments ..
+      CHARACTER          ID
+      INTEGER            INFO, M, N, LDA
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), WORK( * )
+      INTEGER            IPVT( * )
+*     ..
+*
+*  =====================================================================
+*
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, J
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA, DLAPMR, DLASRTI
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF( .NOT.LSAME( ID, 'I' ) .AND. .NOT.LSAME( ID, 'D' ) ) THEN
+         INFO = -1
+      ELSE IF( M.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -5
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DLASRTR', -INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( M.LE.1 .OR. N.LT.1 )
+     $   RETURN
+*
+*     Compute maximum norm of each row
+*
+      DO I = 1, M
+         WORK( I ) = ABS( A( I, 1 ) )
+      END DO
+*
+      DO I = 1, M
+         DO J = 2, N
+            WORK( I ) = MAX( WORK( I ), ABS( A( I, J ) ) )
+         END DO
+      END DO
+*
+*     Sort row indices
+*
+      DO I = 1, M
+         IPVT( I ) = I
+      END DO
+*
+      CALL DLASRTI( ID, M, WORK, IPVT, INFO )
+      IF( INFO.NE.0 ) THEN
+         RETURN
+      END IF
+*
+*     Sort rows
+*
+      CALL DLAPMR( .TRUE., M, N, A, LDA, IPVT )
+*
+*     End of DLASRTR
+*
+      END

From aaaf98c7a4bb1e5b340c1cd8678c3d70374395fd Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Wed, 29 Apr 2020 19:05:12 +0200
Subject: [PATCH 048/101] Add CLASRTR

---
 SRC/CMakeLists.txt |   2 +-
 SRC/clasrtr.f      | 194 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 195 insertions(+), 1 deletion(-)
 create mode 100644 SRC/clasrtr.f

diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index cc23cbda13..34978bc4d3 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -213,7 +213,7 @@ set(CLASRC
    claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
    clarf.f  clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f
    clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f
-   clarz.f  clarzb.f clarzt.f clascl.f claset.f clasr.f  classq.f90
+   clarz.f  clarzb.f clarzt.f clascl.f claset.f clasr.f clasrtr.f classq.f90
    claswp.f clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f
    clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f
    clauu2.f clauum.f cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f
diff --git a/SRC/clasrtr.f b/SRC/clasrtr.f
new file mode 100644
index 0000000000..5a2db59462
--- /dev/null
+++ b/SRC/clasrtr.f
@@ -0,0 +1,194 @@
+*> \brief \b CLASRTR sorts array indices based on the referenced numbers
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CLASRTR + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/clasrtr.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/clasrtr.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/clasrtr.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CLASRTR( ID, M, N, A, LDA, IPVT, RWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          ID
+*       INTEGER            INFO, M, N, LDA
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX            A( LDA, * )
+*       REAL               RWORK( * )
+*       INTEGER            IPVT( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*> Apply row sorting based on the maximum norm of the rows of A. The
+*> rows can be sorted in increasing (if ID = 'I') or decreasing order
+*> (if ID = 'D').
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] ID
+*> \verbatim
+*>          ID is CHARACTER*1
+*>          = 'I': sort the rows of A by increasing row norm;
+*>          = 'D': sort the rows of A by decreasing row norm.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>          On entry, the M by N matrix A.
+*>          On exit, A contains the row-permuted matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] IPVT
+*> \verbatim
+*>          IPVT is INTEGER array, dimension (M)
+*>          On exit, if IPVT(J)=K, then the J-th row of A*P was the
+*>          the K-th row of A.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (M).
+*>          On exit, RWORK contains the maximum norms of the rows of A.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Christoph Conrads (https://christoph-conrads.name)
+*
+*> \date April 2020
+*
+*> \ingroup auxOTHERcomputational
+*
+*  =====================================================================
+      SUBROUTINE CLASRTR( ID, M, N, A, LDA, IPVT, RWORK, INFO )
+*
+*  -- LAPACK computational routine (version TODO) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     TODO
+*
+*     .. Scalar Arguments ..
+      CHARACTER          ID
+      INTEGER            INFO, M, N, LDA
+*     ..
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * )
+      REAL               RWORK( * )
+      INTEGER            IPVT( * )
+*     ..
+*
+*  =====================================================================
+*
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, J
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA, CLAPMR, SLASRTI
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF( .NOT.LSAME( ID, 'I' ) .AND. .NOT.LSAME( ID, 'D' ) ) THEN
+         INFO = -1
+      ELSE IF( M.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -5
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CLASRTR', -INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( M.LE.1 .OR. N.LT.1 )
+     $   RETURN
+*
+*     Compute maximum norm of each row
+*
+      DO I = 1, M
+         RWORK( I ) = ABS( A( I, 1 ) )
+      END DO
+*
+      DO I = 1, M
+         DO J = 2, N
+            RWORK( I ) = MAX( RWORK( I ), ABS( A( I, J ) ) )
+         END DO
+      END DO
+*
+*     Sort row indices
+*
+      DO I = 1, M
+         IPVT( I ) = I
+      END DO
+*
+      CALL SLASRTI( ID, M, RWORK, IPVT, INFO )
+      IF( INFO.NE.0 ) THEN
+         RETURN
+      END IF
+*
+*     Sort rows
+*
+      CALL CLAPMR( .TRUE., M, N, A, LDA, IPVT )
+*
+*     End of CLASRTR
+*
+      END

From bb2c0b9811fb8312e8c606883312fd0515afc018 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Wed, 29 Apr 2020 19:05:28 +0200
Subject: [PATCH 049/101] Add ZLASRTR

---
 SRC/CMakeLists.txt |   2 +-
 SRC/zlasrtr.f      | 194 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 195 insertions(+), 1 deletion(-)
 create mode 100644 SRC/zlasrtr.f

diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index 34978bc4d3..daa0ba6725 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -410,7 +410,7 @@ set(ZLASRC
    zlarcm.f zlarf.f  zlarfb.f zlarfb_gett.f
    zlarfg.f zlarfgp.f zlarft.f
    zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f
-   zlarz.f  zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
+   zlarz.f  zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f zlasrtr.f
    zlassq.f90 zlaswp.f zlasyf.f zlasyf_rook.f zlasyf_rk.f zlasyf_aa.f
    zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f zlauu2.f
    zlauum.f zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f
diff --git a/SRC/zlasrtr.f b/SRC/zlasrtr.f
new file mode 100644
index 0000000000..22da41cf68
--- /dev/null
+++ b/SRC/zlasrtr.f
@@ -0,0 +1,194 @@
+*> \brief \b ZLASRTR sorts array indices based on the referenced numbers
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZLASRTR + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zlasrtr.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zlasrtr.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zlasrtr.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZLASRTR( ID, M, N, A, LDA, IPVT, RWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          ID
+*       INTEGER            INFO, M, N, LDA
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX*16         A( LDA, * )
+*       DOUBLE PRECISION   RWORK( * )
+*       INTEGER            IPVT( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*> Apply row sorting based on the maximum norm of the rows of A. The
+*> rows can be sorted in increasing (if ID = 'I') or decreasing order
+*> (if ID = 'D').
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] ID
+*> \verbatim
+*>          ID is CHARACTER*1
+*>          = 'I': sort the rows of A by increasing row norm;
+*>          = 'D': sort the rows of A by decreasing row norm.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N)
+*>          On entry, the M by N matrix A.
+*>          On exit, A contains the row-permuted matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] IPVT
+*> \verbatim
+*>          IPVT is INTEGER array, dimension (M)
+*>          On exit, if IPVT(J)=K, then the J-th row of A*P was the
+*>          the K-th row of A.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (M).
+*>          On exit, RWORK contains the maximum norms of the rows of A.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Christoph Conrads (https://christoph-conrads.name)
+*
+*> \date April 2020
+*
+*> \ingroup auxOTHERcomputational
+*
+*  =====================================================================
+      SUBROUTINE ZLASRTR( ID, M, N, A, LDA, IPVT, RWORK, INFO )
+*
+*  -- LAPACK computational routine (version TODO) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     TODO
+*
+*     .. Scalar Arguments ..
+      CHARACTER          ID
+      INTEGER            INFO, M, N, LDA
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * )
+      DOUBLE PRECISION   RWORK( * )
+      INTEGER            IPVT( * )
+*     ..
+*
+*  =====================================================================
+*
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, J
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA, ZLAPMR, DLASRTI
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF( .NOT.LSAME( ID, 'I' ) .AND. .NOT.LSAME( ID, 'D' ) ) THEN
+         INFO = -1
+      ELSE IF( M.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -5
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZLASRTR', -INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( M.LE.1 .OR. N.LT.1 )
+     $   RETURN
+*
+*     Compute maximum norm of each row
+*
+      DO I = 1, M
+         RWORK( I ) = ABS( A( I, 1 ) )
+      END DO
+*
+      DO I = 1, M
+         DO J = 2, N
+            RWORK( I ) = MAX( RWORK( I ), ABS( A( I, J ) ) )
+         END DO
+      END DO
+*
+*     Sort row indices
+*
+      DO I = 1, M
+         IPVT( I ) = I
+      END DO
+*
+      CALL DLASRTI( ID, M, RWORK, IPVT, INFO )
+      IF( INFO.NE.0 ) THEN
+         RETURN
+      END IF
+*
+*     Sort rows
+*
+      CALL ZLAPMR( .TRUE., M, N, A, LDA, IPVT )
+*
+*     End of ZLASRTR
+*
+      END

From b954151c19492104ba61fbbc742d47ae8e05034d Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Fri, 1 May 2020 12:28:33 +0200
Subject: [PATCH 050/101] xLASRTI: fix indexing error

The indexing error could lead to an infinite loop.
---
 SRC/dlasrti.f | 2 +-
 SRC/slasrti.f | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/SRC/dlasrti.f b/SRC/dlasrti.f
index c128ac1d03..794748770e 100644
--- a/SRC/dlasrti.f
+++ b/SRC/dlasrti.f
@@ -212,7 +212,7 @@ SUBROUTINE DLASRTI( ID, N, X, INDICES, INFO )
 *
          P1 = X( INDICES(START) )
          P2 = X( INDICES(ENDD) )
-         I = INDICES( ( START+ENDD ) / 2 )
+         I = ( START+ENDD ) / 2
          P3 = X( INDICES(I) )
          IF( P1.LT.P2 ) THEN
             IF( P3.LT.P1 ) THEN
diff --git a/SRC/slasrti.f b/SRC/slasrti.f
index 337fa580b0..cddeabc347 100644
--- a/SRC/slasrti.f
+++ b/SRC/slasrti.f
@@ -212,7 +212,7 @@ SUBROUTINE SLASRTI( ID, N, X, INDICES, INFO )
 *
          P1 = X( INDICES(START) )
          P2 = X( INDICES(ENDD) )
-         I = INDICES( ( START+ENDD ) / 2 )
+         I = ( START+ENDD ) / 2
          P3 = X( INDICES(I) )
          IF( P1.LT.P2 ) THEN
             IF( P3.LT.P1 ) THEN

From eefd3a7cf36fdc89d5322c8bafb1f9af70e9e5c1 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Fri, 1 May 2020 18:51:50 +0200
Subject: [PATCH 051/101] SGGQRCS: do not factor right-hand side GSVD matrix

The GSVD decomposes a pair of matrices A, B into
* `A = U1 D1 X`,
* `B = U2 D2 X`,

where `X` has full rank. Alternatively, one can compute the equivalent
decomposition
* `A = U1 D1 R Q^*`,
* `B = U2 D2 R Q^*`,

where `R` is upper triangular and `Q` orthogonal. The second form has
several advantages from the point of view of _numerical_ linear algebra.
None of these apply to the GSVD solvers based on QR and CS
decomposition.  Consequently, xGGQRCS returns from now on only the
product `X = RQ^*`.

This change makes xGGQRCS
* faster,
* more flexible (`X` is computed or not),
* easier to implement (no need to assemble `R Q^*` for tests), and
* numerically stable!

xGGQRCS becomes numerically stable because factoring `X` requires an
explicit LQ decomposition but the result may not be backward stable
unless `A` and `B` are similar in norm. Not computing the LQ
decomposition obviously solves this problem.

This commit is a proof-of-concept; {d,c,z}GGQRCS should be updated
accordingly.
---
 SRC/sggqrcs.f | 247 +++++++++++++++++++-------------------------------
 1 file changed, 93 insertions(+), 154 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 0ef872b418..c3f38ca887 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -18,21 +18,20 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
+*       SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
 *                           A, LDA, B, LDB,
-*                           THETA, U1, LDU1, U2, LDU2, QT, LDQT,
+*                           THETA, U1, LDU1, U2, LDU2
 *                           WORK, LWORK, IWORK, INFO )
 *
 *       .. Scalar Arguments ..
-*       CHARACTER          JOBU1, JOB2, JOBQT
-*       INTEGER            INFO, LDA, LDB, LDU1, LDU2, LDQT,
-*      $                   M, N, P, L, LWORK
+*       CHARACTER          JOBU1, JOB2, JOBX
+*       INTEGER            INFO, LDA, LDB, LDU1, LDU2, M, N, P, L, LWORK
 *       REAL               W
 *       ..
 *       .. Array Arguments ..
 *       INTEGER            IWORK( * )
 *       REAL               A( LDA, * ), B( LDB, * ), THETA( * ),
-*      $                   U1( LDU1, * ), U2( LDU2, * ), QT( LDQ, * ),
+*      $                   U1( LDU1, * ), U2( LDU2, * ),
 *      $                   WORK( * )
 *       ..
 *
@@ -45,14 +44,14 @@
 *> SGGQRCS computes the generalized singular value decomposition (GSVD)
 *> of an M-by-N real matrix A and P-by-N real matrix B:
 *>
-*>       U1**T*A*Q = D1*( 0 R ),    U2**T*B*Q = D2*( 0 R )
+*>       A = U1 * D1 * X,           B = U2 * D2 * X
 *>
-*> where U1, U2, and Q are orthogonal matrices. SGGQRCS uses the QR
+*> where U1 and U2 are orthogonal matrices. SGGQRCS uses the QR
 *> factorization with column pivoting and the 2-by-1 CS decomposition to
 *> compute the GSVD.
 *>
 *> Let L be the effective numerical rank of the matrix (A**T,B**T)**T,
-*> then R is a L-by-L nonsingular upper triangular matrix, D1 and
+*> then X is a L-by-N nonsingular upper triangular matrix, D1 and
 *> D2 are M-by-L and P-by-L "diagonal" matrices and of the
 *> following structures, respectively:
 *>
@@ -66,9 +65,6 @@
 *>              K  (  0   C   0 )
 *>                 (  0   0   0 )
 *>
-*>                 N-L  L
-*>   ( 0 R ) = L (  0   R )
-*>
 *> where
 *>
 *>   K  = MIN(M, P, L, M + P - L),
@@ -78,27 +74,29 @@
 *>   S  = diag( SIN(THETA(1)), ..., SIN(THETA(K)) ), and
 *>   C^2 + S^2 = I.
 *>
-*> The routine computes C, S, R, and optionally the orthogonal
-*> transformation matrices U, V and Q. If L <= M, then R is stored in
-*> A(1:L, 1:L) on exit. Otherwise, the first M rows of R are stored in
-*> A(:, 1:L) and R( M+1:, M+1: ) is stored in B(1:L-M, 1:L-M). In both
-*> cases, only the upper triangular part is stored.
+*> The routine computes C, S and optionally the matrices U1, U2, and X.
+*> On exit, X is stored in WORK( 2:L*N ).
 *>
 *> In particular, if B is an N-by-N nonsingular matrix, then the GSVD of
 *> A and B implicitly gives the SVD of A*inv(B):
-*>                      A*inv(B) = U1*(D1*inv(D2))*U2**T.
+*>
+*>       A*inv(B) = U1*(D1*inv(D2))*U2**T.
+*>
 *> If (A**T,B**T)**T  has orthonormal columns, then the GSVD of A and B
 *> is also equal to the CS decomposition of A and B. Furthermore, the
 *> GSVD can be used to derive the solution of the eigenvalue problem:
-*>                      A**T*A x = lambda * B**T*B x.
+*>
+*>       A**T*A x = lambda * B**T*B x.
+*>
 *> In some literature, the GSVD of A and B is presented in the form
-*>                  U1**T*A*X = ( 0 D1 ),   U2**T*B*X = ( 0 D2 )
-*> where U1 and U2 are orthogonal and X is nonsingular, D1 and D2 are
-*> ``diagonal''.  The former GSVD form can be converted to the latter
-*> form by taking the nonsingular matrix X as
 *>
-*>                      X = Q*( I   0    )
-*>                            ( 0 inv(R) ).
+*>       U1**T*A*Q = D1*( 0 R ),    U2**T*B*Q = D2*( 0 R )
+*>
+*> where U1, U2, and Q are orthogonal matrices. The former GSVD form can
+*> be converted to the latter by computing the LQ decomposition of X. Be
+*> advised that the LQ decomposition may not be backward stable if,
+*> e.g., A and B differ significantly in norm; consider using xGGSVD3 if
+*> you need X factorized in this case.
 *> \endverbatim
 *
 *  Arguments:
@@ -118,11 +116,11 @@
 *>          = 'N':  U2 is not computed.
 *> \endverbatim
 *>
-*> \param[in] JOBQT
+*> \param[in] JOBX
 *> \verbatim
-*>          JOBQT is CHARACTER*1
-*>          = 'Y':  Orthogonal matrix Q is computed;
-*>          = 'N':  Q is not computed.
+*>          JOBX is CHARACTER*1
+*>          = 'Y':  Matrix X is computed;
+*>          = 'N':  X is not computed.
 *> \endverbatim
 *>
 *> \param[in] M
@@ -163,8 +161,6 @@
 *> \verbatim
 *>          A is REAL array, dimension (LDA,N)
 *>          On entry, the M-by-N matrix A.
-*>          On exit, A contains the triangular matrix R or the first M
-*>          rows of R, respectively. See Purpose for details.
 *> \endverbatim
 *>
 *> \param[in] LDA
@@ -177,8 +173,6 @@
 *> \verbatim
 *>          B is REAL array, dimension (LDB,N)
 *>          On entry, the P-by-N matrix B.
-*>          On exit, if L > M, then B contains the last L - M rows of
-*>          the triangular matrix R. See Purpose for details.
 *> \endverbatim
 *>
 *> \param[in] LDB
@@ -223,20 +217,6 @@
 *>          JOBU2 = 'Y'; LDU2 >= 1 otherwise.
 *> \endverbatim
 *>
-*> \param[out] QT
-*> \verbatim
-*>          QT is REAL array, dimension (LDQT,N)
-*>          If JOBQT = 'Y', QT contains the N-by-N orthogonal matrix
-*>          Q**T.
-*> \endverbatim
-*>
-*> \param[in] LDQT
-*> \verbatim
-*>          LDQT is INTEGER
-*>          The leading dimension of the array QT. LDQT >= max(1,N) if
-*>          JOBQT = 'Y'; LDQT >= 1 otherwise.
-*> \endverbatim
-*>
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is REAL array, dimension (MAX(1,LWORK))
@@ -306,9 +286,9 @@
 *>  workspace whose dimension must be queried at run-time.
 *>
 *  =====================================================================
-      SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
+      SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
      $                    A, LDA, B, LDB,
-     $                    THETA, U1, LDU1, U2, LDU2, QT, LDQT,
+     $                    THETA, U1, LDU1, U2, LDU2,
      $                    WORK, LWORK, IWORK, INFO )
 *
 *  -- LAPACK driver routine (version 3.7.0) --
@@ -318,26 +298,25 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
       IMPLICIT NONE
 *     .. Scalar Arguments ..
-      CHARACTER          JOBU1, JOBU2, JOBQT
-      INTEGER            INFO, LDA, LDB, LDU1, LDU2, LDQT,
-     $                   L, M, N, P, LWORK
+      CHARACTER          JOBU1, JOBU2, JOBX
+      INTEGER            INFO, LDA, LDB, LDU1, LDU2, L, M, N, P, LWORK
       REAL               W
 *     ..
 *     .. Array Arguments ..
       INTEGER            IWORK( * )
       REAL               A( LDA, * ), B( LDB, * ), THETA( * ),
-     $                   U1( LDU1, * ), U2( LDU2, * ), QT( LDQT, * ),
+     $                   U1( LDU1, * ), U2( LDU2, * ),
      $                   WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Local Scalars ..
-      LOGICAL            WANTU1, WANTU2, WANTQT, LQUERY
-      INTEGER            I, J, LMAX, Z, LDG, LWKOPT
+      LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
+      INTEGER            I, J, LMAX, Z, LDG, LDVT, LWKOPT
       REAL               GNORM, TOL, ULP, UNFL, NORMA, NORMB, BASE, NAN
 *     .. Local Arrays ..
-      REAL               G( M + P, N )
+      REAL               G( M + P, N ), VT( N, N )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -357,7 +336,7 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
       WANTU1 = LSAME( JOBU1, 'Y' )
       WANTU2 = LSAME( JOBU2, 'Y' )
-      WANTQT = LSAME( JOBQT, 'Y' )
+      WANTX = LSAME( JOBX, 'Y' )
       LQUERY = ( LWORK.EQ.-1 )
       LWKOPT = 1
 *
@@ -371,6 +350,12 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       ELSE
          G = WORK( 1 )
       END IF
+      IF( WANTX .AND. .NOT.LQUERY ) THEN
+         VT = WORK( Z + 1 )
+      ELSE
+         VT = 0
+      END IF
+      LDVT = N
       LDG = M + P
 *     Computing 0.0 / 0.0 directly causes compiler errors
       NAN = 1.0E0
@@ -383,7 +368,7 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          INFO = -1
       ELSE IF( .NOT.( WANTU2 .OR. LSAME( JOBU2, 'N' ) ) ) THEN
          INFO = -2
-      ELSE IF( .NOT.( WANTQT .OR. LSAME( JOBQT, 'N' ) ) ) THEN
+      ELSE IF( .NOT.( WANTX .OR. LSAME( JOBX, 'N' ) ) ) THEN
          INFO = -3
       ELSE IF( M.LT.1 ) THEN
          INFO = -4
@@ -399,10 +384,8 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          INFO = -15
       ELSE IF( LDU2.LT.1 .OR. ( WANTU2 .AND. LDU2.LT.P ) ) THEN
          INFO = -17
-      ELSE IF( LDQT.LT.1 .OR. ( WANTQT .AND. LDQT.LT.N ) ) THEN
-         INFO = -19
       ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
-         INFO = -23
+         INFO = -21
       END IF
 *
 *     Compute workspace
@@ -414,20 +397,17 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          CALL SORGQR( M + P, LMAX, LMAX, G, LDG, THETA, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 
-         CALL SORCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, LMAX,
+         CALL SORCSD2BY1( JOBU2, JOBU1, JOBX, M + P, P, LMAX,
      $                    G, LDG, G, LDG,
-     $                    THETA, U2, LDU2, U1, LDU1, QT, LDQT,
+     $                    THETA, U2, LDU2, U1, LDU1, VT, LDVT,
      $                    WORK, -1, IWORK, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *        The matrix (A, B) must be stored sequentially for SORCSD2BY1
-         LWKOPT = Z + LWKOPT
-
-*        SGERQF stores LMAX scalar factors for the elementary reflectors
-         CALL SGERQF( LMAX, N, QT, LDQT, WORK, WORK, -1, INFO )
-         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
-
-         CALL SORGRQ( N, N, LMAX, QT, LDQT, WORK, WORK, -1, INFO )
-         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
+         LWKOPT = LWKOPT + Z
+*        2-by-1 CSD matrix V1 must be stored
+         IF( WANTX ) THEN
+            LWKOPT = LWKOPT + LDVT*N
+         END IF
 
          WORK( 1 ) = REAL( LWKOPT )
       END IF
@@ -457,7 +437,7 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          END IF
       END IF
 *
-*     Copy matrices A, B into the (M+P) x n matrix G
+*     Copy matrices A, B into the (M+P) x N matrix G
 *
       CALL SLACPY( 'A', M, N, A, LDA, G( P + 1, 1 ), LDG )
       CALL SLACPY( 'A', P, N, B, LDB, G( 1, 1 ), LDG )
@@ -510,9 +490,6 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          IF( WANTU2 ) THEN
             CALL SLASET( 'A', P, P, 0.0E0, 1.0E0, U2, LDU2 )
          END IF
-         IF( WANTQT ) THEN
-            CALL SLASET( 'A', N, N, 0.0E0, 1.0E0, QT, LDQT )
-         END IF
 *
          WORK( 1 ) = REAL( LWKOPT )
          RETURN
@@ -520,21 +497,17 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Copy R1( 1:L, : ) into A, B and set lower triangular part to zero
 *
-      IF( L.LE.M ) THEN
-          CALL SLACPY( 'U', L, N, G, LDG, A, LDA )
-          IF( M.GT.1 ) THEN
+      IF( WANTX ) THEN
+         IF( L.LE.M ) THEN
+             CALL SLACPY( 'U', L, N, G, LDG, A, LDA )
              CALL SLASET( 'L', L - 1, N, 0.0E0, 0.0E0, A( 2, 1 ), LDA )
-          END IF
-      ELSE
-          CALL SLACPY( 'U', M, N, G, LDG, A, LDA )
-          CALL SLACPY( 'U', L - M, N - M, G( M+1, M+1 ), LDG, B, LDB )
-*
-          IF( M.GT.1 ) THEN
-              CALL SLASET( 'L', M - 1, N, 0.0E0, 0.0E0, A( 2, 1 ), LDA )
-          END IF
-          IF( P.GT.1 ) THEN
-              CALL SLASET( 'L', L-M-1, N, 0.0E0, 0.0E0, B( 2, 1 ), LDB )
-          END IF
+         ELSE
+             CALL SLACPY( 'U', M, N, G, LDG, A, LDA )
+             CALL SLACPY( 'U', L - M, N - M, G( M+1,M+1 ), LDG, B, LDB )
+*
+             CALL SLASET( 'L', M - 1, N, 0.0E0, 0.0E0, A( 2, 1 ), LDA )
+             CALL SLASET( 'L', L-M-1, N, 0.0E0, 0.0E0, B( 2, 1 ), LDB )
+         END IF
       END IF
 *
 *     Explicitly form Q1 so that we can compute the CS decomposition
@@ -547,84 +520,50 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     DEBUG
 *
-      THETA(1:N) = NAN
+      THETA( 1:N ) = NAN
 *
 *     Compute the CS decomposition of Q1( :, 1:L )
 *
-      CALL SORCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, L,
-     $                 G( 1, 1 ), LDG, G( P + 1, 1 ), LDG, THETA,
-     $                 U2, LDU2, U1, LDU1, QT, LDQT,
-     $                 WORK( Z + 1 ), LWORK - Z, IWORK( N + 1 ), INFO )
-      IF( INFO.NE.0 ) THEN
-         RETURN
-      END IF
-*
-*     DEBUG
-*
-      WORK(1:LWORK) = NAN
-*
-*     Copy V^T from QT to G
-*
-      CALL SLACPY( 'A', L, L, QT, LDQT, G, LDG )
-*
-*     DEBUG
-*
-      CALL SLASET( 'A', N, N, NAN, NAN, QT, LDQT )
-*
-*     Compute V^T R1( 1:L, : ) in the last L rows of QT
-*
-      IF ( L.LE.M ) THEN
-         CALL SGEMM( 'N', 'N', L, N, L, 1.0E0, G, LDG,
-     $               A, LDA, 0.0E0, QT( N-L+1, 1 ), LDQT )
+      IF( WANTX ) THEN
+         CALL SORCSD2BY1( JOBU2, JOBU1, JOBX, M + P, P, L,
+     $                    G( 1, 1 ), LDG, G( P + 1, 1 ), LDG, THETA,
+     $                    U2, LDU2, U1, LDU1, VT, LDVT,
+     $                    WORK( Z + LDVT*N + 1 ), LWORK - Z - LDVT*N,
+     $                    IWORK( N + 1 ), INFO )
       ELSE
-         CALL SGEMM( 'N', 'N', L, N, M, 1.0E0, G( 1, 1 ), LDG,
-     $               A, LDA, 0.0E0, QT( N-L+1, 1 ), LDQT )
-         CALL SGEMM( 'N', 'N', L, N - M, L - M, 1.0E0,
-     $               G( 1, M + 1 ), LDG, B, LDB,
-     $               1.0E0, QT( N-L+1, M+1 ), LDQT )
+         CALL SORCSD2BY1( JOBU2, JOBU1, JOBX, M + P, P, L,
+     $                    G( 1, 1 ), LDG, G( P + 1, 1 ), LDG, THETA,
+     $                    U2, LDU2, U1, LDU1, VT, LDVT,
+     $                    WORK( Z + 1 ), LWORK - Z,
+     $                    IWORK, INFO )
       END IF
-*
-*     DEBUG
-*
-      CALL SLASET( 'A', M, N, NAN, NAN, A, LDA )
-      CALL SLASET( 'A', P, N, NAN, NAN, B, LDB )
-      WORK(1:LWORK) = NAN
-*
-*     Compute the RQ decomposition of V^T R1( 1:L, : )
-*
-      CALL SGERQF( L, N, QT( N-L+1, 1 ), LDQT, WORK,
-     $             WORK( L + 1 ), LWORK - L, INFO )
-      IF ( INFO.NE.0 ) THEN
+      IF( INFO.NE.0 ) THEN
          RETURN
       END IF
 *
-*     Copy matrix L from QT( N-L+1:N, N-L+1:N ) to A, B
-*
-      IF ( L.LE.M ) THEN
-         CALL SLACPY( 'U', L, L, QT( N-L+1, N-L+1 ), LDQT, A, LDA )
-      ELSE
-         CALL SLACPY( 'U', M,     L, QT( N-L+1, N-L+1 ), LDQT, A, LDA )
-         CALL SLACPY( 'U', L - M, L - M, QT( N-L+M+1, N-L+M+1 ), LDQT,
-     $                B, LDB )
-      END IF
-*
 *     DEBUG
 *
-      CALL SLASET( 'U', L, L, NAN, NAN, QT( 1, N-L+1 ), LDQT )
-      WORK( L+1:LWORK ) = NAN
-*
-*     Explicitly form Q^T
-*
-      IF( WANTQT ) THEN
-         CALL SORGRQ( N, N, L, QT, LDQT, WORK,
-     $                WORK( L + 1 ), LWORK - L, INFO )
-         IF ( INFO.NE.0 ) THEN
-            RETURN
+      WORK( 1:LDG*N ) = NAN
+*
+*     Compute V^T R1( 1:L, : )
+*
+      IF( WANTX ) THEN
+         IF ( L.LE.M ) THEN
+            CALL SGEMM( 'N', 'N', L, N, L,
+     $                  1.0E0, VT, LDVT, A, LDA,
+     $                  0.0E0, WORK( 2 ), L )
+         ELSE
+            CALL SGEMM( 'N', 'N', L, N, M,
+     $                  1.0E0, VT( 1, 1 ), LDVT, A, LDA,
+     $                  0.0E0, WORK( 2 ), L )
+            CALL SGEMM( 'N', 'N', L, N - M, L - M,
+     $                  1.0E0, VT( 1, M + 1 ), LDVT, B, LDB,
+     $                  1.0E0, WORK( L*M + 2 ), L )
          END IF
 *
-*     Revert column permutation Π by permuting the rows of Q^T
+*     Revert column permutation Π by permuting the columns of X
 *
-         CALL SLAPMT( .FALSE., N, N, QT, LDQT, IWORK )
+         CALL SLAPMT( .FALSE., L, N, WORK( 2 ), L, IWORK )
       END IF
 *
       WORK( 1 ) = REAL( LWKOPT )

From 22735c97ab9d96e924a345528353ff6a2dd8443f Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Fri, 1 May 2020 19:07:31 +0200
Subject: [PATCH 052/101] SGGQRCS: fix typo

---
 SRC/sggqrcs.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index c3f38ca887..6ef61c44e0 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -75,7 +75,7 @@
 *>   C^2 + S^2 = I.
 *>
 *> The routine computes C, S and optionally the matrices U1, U2, and X.
-*> On exit, X is stored in WORK( 2:L*N ).
+*> On exit, X is stored in WORK( 2:L*N+1 ).
 *>
 *> In particular, if B is an N-by-N nonsingular matrix, then the GSVD of
 *> A and B implicitly gives the SVD of A*inv(B):

From 3a6b92b9d1664f358e274a8f13c5ed4fa794d2e5 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Fri, 1 May 2020 20:36:05 +0200
Subject: [PATCH 053/101] SGGQRCS: remove unused subroutines

---
 SRC/sggqrcs.f | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 6ef61c44e0..fa8f853ce2 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -324,8 +324,8 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
       EXTERNAL           LSAME, SLAMCH, SLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEMM, SGEQP3, SGERQF, SLACPY, SLAPMT, SLASCL,
-     $                   SLASET, SORGQR, SORGRQ, SORCSD2BY1, XERBLA
+      EXTERNAL           SGEMM, SGEQP3, SLACPY, SLAPMT, SLASCL,
+     $                   SLASET, SORGQR, SORCSD2BY1, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN

From d7e01db425a3957a4ebd3d70a9617838fd905b35 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Fri, 1 May 2020 21:09:01 +0200
Subject: [PATCH 054/101] SGGQRCS: replace matrix scaling with row sorting

To achieve a backward stable solver, input matrices A, B were previously
scaled to be similar norm but this changes the generalized singular
values and I seem unable to compensate for the scaling effects by
* recomputing the singular values and
* rescaling the right-hand side GSVD matrix X.
---
 SRC/sggqrcs.f | 70 +++++++++++++++++++++------------------------------
 1 file changed, 29 insertions(+), 41 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index fa8f853ce2..cabadb499f 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
+*       SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *                           A, LDA, B, LDB,
 *                           THETA, U1, LDU1, U2, LDU2
 *                           WORK, LWORK, IWORK, INFO )
@@ -26,7 +26,6 @@
 *       .. Scalar Arguments ..
 *       CHARACTER          JOBU1, JOB2, JOBX
 *       INTEGER            INFO, LDA, LDB, LDU1, LDU2, M, N, P, L, LWORK
-*       REAL               W
 *       ..
 *       .. Array Arguments ..
 *       INTEGER            IWORK( * )
@@ -141,15 +140,6 @@
 *>          The number of rows of the matrix B.  P >= 1.
 *> \endverbatim
 *>
-*> \param[out] W
-*> \verbatim
-*>          W in REAL
-*>
-*>          On exit, W is a radix power chosen such that the Frobenius
-*>          norm of A and W*B are within sqrt(radix) and 1/sqrt(radix)
-*>          of each other.
-*> \endverbatim
-*>
 *> \param[out] L
 *> \verbatim
 *>          L is INTEGER
@@ -286,7 +276,7 @@
 *>  workspace whose dimension must be queried at run-time.
 *>
 *  =====================================================================
-      SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
+      SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
      $                    A, LDA, B, LDB,
      $                    THETA, U1, LDU1, U2, LDU2,
      $                    WORK, LWORK, IWORK, INFO )
@@ -300,7 +290,6 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
 *     .. Scalar Arguments ..
       CHARACTER          JOBU1, JOBU2, JOBX
       INTEGER            INFO, LDA, LDB, LDU1, LDU2, L, M, N, P, LWORK
-      REAL               W
 *     ..
 *     .. Array Arguments ..
       INTEGER            IWORK( * )
@@ -314,7 +303,7 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
       INTEGER            I, J, LMAX, Z, LDG, LDVT, LWKOPT
-      REAL               GNORM, TOL, ULP, UNFL, NORMA, NORMB, BASE, NAN
+      REAL               BASE, NAN, NORMG, TOL, ULP, UNFL
 *     .. Local Arrays ..
       REAL               G( M + P, N ), VT( N, N )
 *     ..
@@ -324,8 +313,8 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
       EXTERNAL           LSAME, SLAMCH, SLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEMM, SGEQP3, SLACPY, SLAPMT, SLASCL,
-     $                   SLASET, SORGQR, SORCSD2BY1, XERBLA
+      EXTERNAL           SGEMM, SGEQP3, SLACPY, SLAPMR, SLAPMT,
+     $                   SLASET, SLASRTR, SORGQR, SORCSD2BY1, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -377,22 +366,25 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
       ELSE IF( P.LT.1 ) THEN
          INFO = -6
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
-         INFO = -10
+         INFO = -9
       ELSE IF( LDB.LT.MAX( 1, P ) ) THEN
-         INFO = -12
+         INFO = -11
       ELSE IF( LDU1.LT.1 .OR. ( WANTU1 .AND. LDU1.LT.M ) ) THEN
-         INFO = -15
+         INFO = -14
       ELSE IF( LDU2.LT.1 .OR. ( WANTU2 .AND. LDU2.LT.P ) ) THEN
-         INFO = -17
+         INFO = -16
       ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
-         INFO = -21
+         INFO = -18
       END IF
 *
 *     Compute workspace
 *
       IF( INFO.EQ.0 ) THEN
+*        SLASRTR workspace
+         LWKOPT = M + P
+
          CALL SGEQP3( M+P, N, G, LDG, IWORK, THETA, WORK, -1, INFO )
-         LWKOPT = INT( WORK( 1 ) )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 
          CALL SORGQR( M + P, LMAX, LMAX, G, LDG, THETA, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
@@ -420,23 +412,6 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
          RETURN
       ENDIF
 *
-*     Scale matrix B such that norm(A) \approx norm(B)
-*
-      NORMA = SLANGE( 'F', M, N, A, LDA, WORK )
-      NORMB = SLANGE( 'F', P, N, B, LDB, WORK )
-*
-      IF ( NORMB.EQ.0 ) THEN
-         W = 1.0E0
-      ELSE
-         BASE = SLAMCH( 'B' )
-         W = BASE ** INT( LOG( NORMA / NORMB ) / LOG( BASE ) )
-*
-         CALL SLASCL( 'G', -1, -1, 1.0E0, W, P, N, B, LDB, INFO )
-         IF ( INFO.NE.0 ) THEN
-            RETURN
-         END IF
-      END IF
-*
 *     Copy matrices A, B into the (M+P) x N matrix G
 *
       CALL SLACPY( 'A', M, N, A, LDA, G( P + 1, 1 ), LDG )
@@ -449,14 +424,25 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
 *
 *     Compute the Frobenius norm of matrix G
 *
-      GNORM = SLANGE( 'F', M + P, N, G, LDG, WORK( Z + 1 ) )
+      NORMG = SLANGE( 'F', M + P, N, G, LDG, WORK( Z + 1 ) )
 *
 *     Get machine precision and set up threshold for determining
 *     the effective numerical rank of the matrix G.
 *
       ULP = SLAMCH( 'Precision' )
       UNFL = SLAMCH( 'Safe Minimum' )
-      TOL = MAX( M + P, N ) * MAX( GNORM, UNFL ) * ULP
+      TOL = MAX( M + P, N ) * MAX( NORMG, UNFL ) * ULP
+*
+*     Apply row sorting for QR decomposition
+*     Row sorting is _necessary_ because the norms of A, B might differ
+*     significantly. Row sorting _combined_ with column pivoting leads
+*     to a small row-wise error, cf. §19.4 in N. J. Higham: "Accuracy
+*     and Stability of Numerical Algorithms". 2002.
+      CALL SLASRTR( 'D', M + P, N, G, LDG,
+     $              IWORK( N + 1 ), WORK( Z + 1 ), INFO )
+      IF( INFO.NE.0 ) THEN
+         RETURN
+      ENDIF
 *
 *     IWORK stores the column permutations computed by SGEQP3.
 *     Columns J where IWORK( J ) is non-zero are permuted to the front
@@ -517,6 +503,8 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
       IF ( INFO.NE.0 ) THEN
          RETURN
       END IF
+*     Revert row sorting
+      CALL SLAPMR( .FALSE., M + P, L, G, LDG, IWORK( N + 1 ) )
 *
 *     DEBUG
 *

From 84cb49da11bcee14c04ad4db45f4e45762e8ed33 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Sun, 3 May 2020 14:44:03 +0200
Subject: [PATCH 055/101] Revert "SGGQRCS: replace matrix scaling with row
 sorting"

Row sorting cannot replace matrix scaling, see the recently added tests.
---
 SRC/sggqrcs.f | 70 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 41 insertions(+), 29 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index cabadb499f..fa8f853ce2 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
+*       SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
 *                           A, LDA, B, LDB,
 *                           THETA, U1, LDU1, U2, LDU2
 *                           WORK, LWORK, IWORK, INFO )
@@ -26,6 +26,7 @@
 *       .. Scalar Arguments ..
 *       CHARACTER          JOBU1, JOB2, JOBX
 *       INTEGER            INFO, LDA, LDB, LDU1, LDU2, M, N, P, L, LWORK
+*       REAL               W
 *       ..
 *       .. Array Arguments ..
 *       INTEGER            IWORK( * )
@@ -140,6 +141,15 @@
 *>          The number of rows of the matrix B.  P >= 1.
 *> \endverbatim
 *>
+*> \param[out] W
+*> \verbatim
+*>          W in REAL
+*>
+*>          On exit, W is a radix power chosen such that the Frobenius
+*>          norm of A and W*B are within sqrt(radix) and 1/sqrt(radix)
+*>          of each other.
+*> \endverbatim
+*>
 *> \param[out] L
 *> \verbatim
 *>          L is INTEGER
@@ -276,7 +286,7 @@
 *>  workspace whose dimension must be queried at run-time.
 *>
 *  =====================================================================
-      SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
+      SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
      $                    A, LDA, B, LDB,
      $                    THETA, U1, LDU1, U2, LDU2,
      $                    WORK, LWORK, IWORK, INFO )
@@ -290,6 +300,7 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     .. Scalar Arguments ..
       CHARACTER          JOBU1, JOBU2, JOBX
       INTEGER            INFO, LDA, LDB, LDU1, LDU2, L, M, N, P, LWORK
+      REAL               W
 *     ..
 *     .. Array Arguments ..
       INTEGER            IWORK( * )
@@ -303,7 +314,7 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
       INTEGER            I, J, LMAX, Z, LDG, LDVT, LWKOPT
-      REAL               BASE, NAN, NORMG, TOL, ULP, UNFL
+      REAL               GNORM, TOL, ULP, UNFL, NORMA, NORMB, BASE, NAN
 *     .. Local Arrays ..
       REAL               G( M + P, N ), VT( N, N )
 *     ..
@@ -313,8 +324,8 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       EXTERNAL           LSAME, SLAMCH, SLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEMM, SGEQP3, SLACPY, SLAPMR, SLAPMT,
-     $                   SLASET, SLASRTR, SORGQR, SORCSD2BY1, XERBLA
+      EXTERNAL           SGEMM, SGEQP3, SLACPY, SLAPMT, SLASCL,
+     $                   SLASET, SORGQR, SORCSD2BY1, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -366,25 +377,22 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       ELSE IF( P.LT.1 ) THEN
          INFO = -6
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
-         INFO = -9
+         INFO = -10
       ELSE IF( LDB.LT.MAX( 1, P ) ) THEN
-         INFO = -11
+         INFO = -12
       ELSE IF( LDU1.LT.1 .OR. ( WANTU1 .AND. LDU1.LT.M ) ) THEN
-         INFO = -14
+         INFO = -15
       ELSE IF( LDU2.LT.1 .OR. ( WANTU2 .AND. LDU2.LT.P ) ) THEN
-         INFO = -16
+         INFO = -17
       ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
-         INFO = -18
+         INFO = -21
       END IF
 *
 *     Compute workspace
 *
       IF( INFO.EQ.0 ) THEN
-*        SLASRTR workspace
-         LWKOPT = M + P
-
          CALL SGEQP3( M+P, N, G, LDG, IWORK, THETA, WORK, -1, INFO )
-         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
+         LWKOPT = INT( WORK( 1 ) )
 
          CALL SORGQR( M + P, LMAX, LMAX, G, LDG, THETA, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
@@ -412,6 +420,23 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          RETURN
       ENDIF
 *
+*     Scale matrix B such that norm(A) \approx norm(B)
+*
+      NORMA = SLANGE( 'F', M, N, A, LDA, WORK )
+      NORMB = SLANGE( 'F', P, N, B, LDB, WORK )
+*
+      IF ( NORMB.EQ.0 ) THEN
+         W = 1.0E0
+      ELSE
+         BASE = SLAMCH( 'B' )
+         W = BASE ** INT( LOG( NORMA / NORMB ) / LOG( BASE ) )
+*
+         CALL SLASCL( 'G', -1, -1, 1.0E0, W, P, N, B, LDB, INFO )
+         IF ( INFO.NE.0 ) THEN
+            RETURN
+         END IF
+      END IF
+*
 *     Copy matrices A, B into the (M+P) x N matrix G
 *
       CALL SLACPY( 'A', M, N, A, LDA, G( P + 1, 1 ), LDG )
@@ -424,25 +449,14 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     Compute the Frobenius norm of matrix G
 *
-      NORMG = SLANGE( 'F', M + P, N, G, LDG, WORK( Z + 1 ) )
+      GNORM = SLANGE( 'F', M + P, N, G, LDG, WORK( Z + 1 ) )
 *
 *     Get machine precision and set up threshold for determining
 *     the effective numerical rank of the matrix G.
 *
       ULP = SLAMCH( 'Precision' )
       UNFL = SLAMCH( 'Safe Minimum' )
-      TOL = MAX( M + P, N ) * MAX( NORMG, UNFL ) * ULP
-*
-*     Apply row sorting for QR decomposition
-*     Row sorting is _necessary_ because the norms of A, B might differ
-*     significantly. Row sorting _combined_ with column pivoting leads
-*     to a small row-wise error, cf. §19.4 in N. J. Higham: "Accuracy
-*     and Stability of Numerical Algorithms". 2002.
-      CALL SLASRTR( 'D', M + P, N, G, LDG,
-     $              IWORK( N + 1 ), WORK( Z + 1 ), INFO )
-      IF( INFO.NE.0 ) THEN
-         RETURN
-      ENDIF
+      TOL = MAX( M + P, N ) * MAX( GNORM, UNFL ) * ULP
 *
 *     IWORK stores the column permutations computed by SGEQP3.
 *     Columns J where IWORK( J ) is non-zero are permuted to the front
@@ -503,8 +517,6 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       IF ( INFO.NE.0 ) THEN
          RETURN
       END IF
-*     Revert row sorting
-      CALL SLAPMR( .FALSE., M + P, L, G, LDG, IWORK( N + 1 ) )
 *
 *     DEBUG
 *

From 2678b0fa66e4e7250a76eaf57b3c2102cf93e634 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Sun, 3 May 2020 17:57:07 +0200
Subject: [PATCH 056/101] SGGQRCS: revert effects of matrix scaling

---
 SRC/sggqrcs.f | 77 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 54 insertions(+), 23 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index fa8f853ce2..a4e4736376 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
+*       SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *                           A, LDA, B, LDB,
 *                           THETA, U1, LDU1, U2, LDU2
 *                           WORK, LWORK, IWORK, INFO )
@@ -26,7 +26,6 @@
 *       .. Scalar Arguments ..
 *       CHARACTER          JOBU1, JOB2, JOBX
 *       INTEGER            INFO, LDA, LDB, LDU1, LDU2, M, N, P, L, LWORK
-*       REAL               W
 *       ..
 *       .. Array Arguments ..
 *       INTEGER            IWORK( * )
@@ -141,15 +140,6 @@
 *>          The number of rows of the matrix B.  P >= 1.
 *> \endverbatim
 *>
-*> \param[out] W
-*> \verbatim
-*>          W in REAL
-*>
-*>          On exit, W is a radix power chosen such that the Frobenius
-*>          norm of A and W*B are within sqrt(radix) and 1/sqrt(radix)
-*>          of each other.
-*> \endverbatim
-*>
 *> \param[out] L
 *> \verbatim
 *>          L is INTEGER
@@ -286,7 +276,7 @@
 *>  workspace whose dimension must be queried at run-time.
 *>
 *  =====================================================================
-      SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
+      SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
      $                    A, LDA, B, LDB,
      $                    THETA, U1, LDU1, U2, LDU2,
      $                    WORK, LWORK, IWORK, INFO )
@@ -300,7 +290,6 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
 *     .. Scalar Arguments ..
       CHARACTER          JOBU1, JOBU2, JOBX
       INTEGER            INFO, LDA, LDB, LDU1, LDU2, L, M, N, P, LWORK
-      REAL               W
 *     ..
 *     .. Array Arguments ..
       INTEGER            IWORK( * )
@@ -313,8 +302,9 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
 *
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
-      INTEGER            I, J, LMAX, Z, LDG, LDVT, LWKOPT
-      REAL               GNORM, TOL, ULP, UNFL, NORMA, NORMB, BASE, NAN
+      INTEGER            I, J, K, K2, LMAX, Z, LDG, LDX, LDVT, LWKOPT
+      REAL               BASE, NAN, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
+     $                   T, W
 *     .. Local Arrays ..
       REAL               G( M + P, N ), VT( N, N )
 *     ..
@@ -449,14 +439,14 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
 *
 *     Compute the Frobenius norm of matrix G
 *
-      GNORM = SLANGE( 'F', M + P, N, G, LDG, WORK( Z + 1 ) )
+      NORMG = SLANGE( 'F', M + P, N, G, LDG, WORK( Z + 1 ) )
 *
 *     Get machine precision and set up threshold for determining
 *     the effective numerical rank of the matrix G.
 *
       ULP = SLAMCH( 'Precision' )
       UNFL = SLAMCH( 'Safe Minimum' )
-      TOL = MAX( M + P, N ) * MAX( GNORM, UNFL ) * ULP
+      TOL = MAX( M + P, N ) * MAX( NORMG, UNFL ) * ULP
 *
 *     IWORK stores the column permutations computed by SGEQP3.
 *     Columns J where IWORK( J ) is non-zero are permuted to the front
@@ -545,25 +535,66 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, W, L,
 *
       WORK( 1:LDG*N ) = NAN
 *
-*     Compute V^T R1( 1:L, : )
+*     Compute X = V^T R1( 1:L, : ) and adjust for matrix scaling
 *
       IF( WANTX ) THEN
+         LDX = L
          IF ( L.LE.M ) THEN
             CALL SGEMM( 'N', 'N', L, N, L,
      $                  1.0E0, VT, LDVT, A, LDA,
-     $                  0.0E0, WORK( 2 ), L )
+     $                  0.0E0, WORK( 2 ), LDX )
          ELSE
             CALL SGEMM( 'N', 'N', L, N, M,
      $                  1.0E0, VT( 1, 1 ), LDVT, A, LDA,
-     $                  0.0E0, WORK( 2 ), L )
+     $                  0.0E0, WORK( 2 ), LDX )
             CALL SGEMM( 'N', 'N', L, N - M, L - M,
      $                  1.0E0, VT( 1, M + 1 ), LDVT, B, LDB,
-     $                  1.0E0, WORK( L*M + 2 ), L )
+     $                  1.0E0, WORK( L*M + 2 ), LDX )
+         END IF
+*        Revert column permutation Π by permuting the columns of X
+         CALL SLAPMT( .FALSE., L, N, WORK( 2 ), LDX, IWORK )
+*        Adjust generalized singular values for matrix scaling
+*        Prepare row scaling of X
+         IF( .NOT. W.EQ.1.0E0 ) THEN
+            K = MIN( M, P, L, M + P - L )
+            K2 = MAX( L - M, 0 )
+            DO I = 1, K
+               T = THETA( I )
+*              Do not adjust singular value if
+*              * THETA(I) is greater than pi/2
+*              * W=1 (otherwise we might compute sin(0) / sin(0) = 0/0)
+               IF( TAN( T ) < 0 ) THEN
+                  WORK( Z + I + 1 ) = 1.0E0
+*              ensure divisor is far away from zero
+               ELSE IF( W >= 1 ) THEN
+                  THETA( I ) = ATAN( W * TAN( T ) )
+                  WORK( Z + I + 1 ) = SIN( T ) / SIN( THETA( I ) )
+               ELSE
+                  THETA( I ) = ATAN( W * TAN( T ) )
+                  WORK( Z + I + 1 ) = COS( T ) / COS( THETA( I ) ) / W
+               END IF
+            END DO
+*           Adjust rows of X for matrix scaling
+            DO J = 0, N-1
+               DO I = 1, K2
+                  WORK( LDX*J + I + 1 ) = WORK( LDX*J + I + 1 ) / W
+               END DO
+               DO I = 1, K
+                  WORK( LDX*J + I + K2 + 1 ) =
+     $            WORK( LDX*J + I + K2 + 1 ) * WORK( Z + I + 1 )
+               END DO
+            END DO
          END IF
+      ELSE IF( .NOT. W.EQ.1.0E0 ) THEN
 *
-*     Revert column permutation Π by permuting the columns of X
+*        Adjust only generalized singular values for matrix scaling
 *
-         CALL SLAPMT( .FALSE., L, N, WORK( 2 ), L, IWORK )
+         DO I = 1, MIN( M, P, L, M + P - L )
+*           Do not adjust singular value if THETA(I) is greater than pi/2
+            IF( TAN( THETA(I) ) >= 0 ) THEN
+               THETA(I) = ATAN( W * TAN( THETA(I) ) )
+            END IF
+         END DO
       END IF
 *
       WORK( 1 ) = REAL( LWKOPT )

From 19a1e0b3c077546f9cf67b2cbfc211f5f1a33be5 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Sun, 3 May 2020 22:34:34 +0200
Subject: [PATCH 057/101] SGGQRCS: add row sorting again, keep matrix scaling

---
 SRC/sggqrcs.f | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index a4e4736376..55b33ba35f 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -314,8 +314,8 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       EXTERNAL           LSAME, SLAMCH, SLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEMM, SGEQP3, SLACPY, SLAPMT, SLASCL,
-     $                   SLASET, SORGQR, SORCSD2BY1, XERBLA
+      EXTERNAL           SGEMM, SGEQP3, SLACPY, SLAPMR, SLAPMT, SLASCL,
+     $                   SLASET, SLASRTR, SORGQR, SORCSD2BY1, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -381,7 +381,11 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     Compute workspace
 *
       IF( INFO.EQ.0 ) THEN
+*        SLASRTR workspace
+         LWKOPT = M + P
+
          CALL SGEQP3( M+P, N, G, LDG, IWORK, THETA, WORK, -1, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
          LWKOPT = INT( WORK( 1 ) )
 
          CALL SORGQR( M + P, LMAX, LMAX, G, LDG, THETA, WORK, -1, INFO )
@@ -448,6 +452,17 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       UNFL = SLAMCH( 'Safe Minimum' )
       TOL = MAX( M + P, N ) * MAX( NORMG, UNFL ) * ULP
 *
+*     Apply row sorting for QR decomposition
+*     Row sorting is _necessary_ because the norms of A, B might differ
+*     significantly. Row sorting _combined_ with column pivoting leads
+*     to a small row-wise error, cf. §19.4 in N. J. Higham: "Accuracy
+*     and Stability of Numerical Algorithms". 2002.
+      CALL SLASRTR( 'D', M + P, N, G, LDG,
+     $              IWORK( N + 1 ), WORK( Z + 1 ), INFO )
+      IF( INFO.NE.0 ) THEN
+         RETURN
+      ENDIF
+*
 *     IWORK stores the column permutations computed by SGEQP3.
 *     Columns J where IWORK( J ) is non-zero are permuted to the front
 *     so we set the all entries to zero here.
@@ -507,6 +522,8 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       IF ( INFO.NE.0 ) THEN
          RETURN
       END IF
+*     Revert row sorting
+      CALL SLAPMR( .FALSE., M + P, L, G, LDG, IWORK( N + 1 ) )
 *
 *     DEBUG
 *

From b17648cbfc8221540de2cd0730164290aa9378d2 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Sun, 3 May 2020 22:52:07 +0200
Subject: [PATCH 058/101] SGGQRCS: fix error codes after recent API changes

---
 SRC/sggqrcs.f | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 55b33ba35f..6a1de2e1a0 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -367,15 +367,15 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       ELSE IF( P.LT.1 ) THEN
          INFO = -6
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
-         INFO = -10
+         INFO = -9
       ELSE IF( LDB.LT.MAX( 1, P ) ) THEN
-         INFO = -12
+         INFO = -11
       ELSE IF( LDU1.LT.1 .OR. ( WANTU1 .AND. LDU1.LT.M ) ) THEN
-         INFO = -15
+         INFO = -14
       ELSE IF( LDU2.LT.1 .OR. ( WANTU2 .AND. LDU2.LT.P ) ) THEN
-         INFO = -17
+         INFO = -16
       ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
-         INFO = -21
+         INFO = -18
       END IF
 *
 *     Compute workspace

From 32ebe2b7053498a032eb610243bd9a136ae60238 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Mon, 4 May 2020 20:03:07 +0200
Subject: [PATCH 059/101] SGGQRCS: avoid unnecessary operations

---
 SRC/sggqrcs.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 6a1de2e1a0..a882e5a25f 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -608,7 +608,7 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
          DO I = 1, MIN( M, P, L, M + P - L )
 *           Do not adjust singular value if THETA(I) is greater than pi/2
-            IF( TAN( THETA(I) ) >= 0 ) THEN
+            IF( TAN( THETA(I) ) > 0 ) THEN
                THETA(I) = ATAN( W * TAN( THETA(I) ) )
             END IF
          END DO

From 357a5b9a2a3962d4afc000259ab47c5f822494ca Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Mon, 4 May 2020 20:03:52 +0200
Subject: [PATCH 060/101] SGGQRCS: fix incorrect branch condition

---
 SRC/sggqrcs.f | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index a882e5a25f..16e00f215e 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -578,17 +578,19 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
             DO I = 1, K
                T = THETA( I )
 *              Do not adjust singular value if
-*              * THETA(I) is greater than pi/2
-*              * W=1 (otherwise we might compute sin(0) / sin(0) = 0/0)
-               IF( TAN( T ) < 0 ) THEN
+*              * THETA(I) is greater than pi/2 (infinite singular value)
+*              * THETA(I) equals zero (singular value won't change)
+               IF( TAN( T ) <= 0 ) THEN
                   WORK( Z + I + 1 ) = 1.0E0
-*              ensure divisor is far away from zero
-               ELSE IF( W >= 1 ) THEN
-                  THETA( I ) = ATAN( W * TAN( T ) )
-                  WORK( Z + I + 1 ) = SIN( T ) / SIN( THETA( I ) )
                ELSE
+*              ensure sine, cosine divisor is far away from zero
+*              w is a power of two and will cause no trouble
                   THETA( I ) = ATAN( W * TAN( T ) )
-                  WORK( Z + I + 1 ) = COS( T ) / COS( THETA( I ) ) / W
+                  IF( SIN( THETA( I ) ) .GE. COS( THETA( I ) ) ) THEN
+                     WORK( Z + I + 1 ) = SIN( T ) / SIN( THETA( I ) )
+                  ELSE
+                     WORK( Z + I + 1 ) = COS( T ) / COS( THETA( I ) ) /W
+                  END IF
                END IF
             END DO
 *           Adjust rows of X for matrix scaling

From 69e0e6be85035eec69ab9f2db5689e6915b1ae09 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Mon, 4 May 2020 20:29:09 +0200
Subject: [PATCH 061/101] SGGQRCS: handle A=0 properly

---
 SRC/sggqrcs.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 16e00f215e..28fa0dc71f 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -419,7 +419,7 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       NORMA = SLANGE( 'F', M, N, A, LDA, WORK )
       NORMB = SLANGE( 'F', P, N, B, LDB, WORK )
 *
-      IF ( NORMB.EQ.0 ) THEN
+      IF( NORMB.EQ.0 .OR. NORMA.EQ.0 ) THEN
          W = 1.0E0
       ELSE
          BASE = SLAMCH( 'B' )

From f3e4a2d8378e9865b1d66d66af4374a3ab793f05 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Mon, 4 May 2020 21:27:02 +0200
Subject: [PATCH 062/101] SGGQRCS: fix row scaling with singular value zero

The singular value will not change but the corresponding row in the
right-hand side GSVD matrix X must be scaled.
---
 SRC/sggqrcs.f | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 28fa0dc71f..320167a3ad 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -577,10 +577,9 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
             K2 = MAX( L - M, 0 )
             DO I = 1, K
                T = THETA( I )
-*              Do not adjust singular value if
-*              * THETA(I) is greater than pi/2 (infinite singular value)
-*              * THETA(I) equals zero (singular value won't change)
-               IF( TAN( T ) <= 0 ) THEN
+*              Do not adjust singular value if THETA(I) is greater
+*              than pi/2 (infinite singular values won't change)
+               IF( TAN( T ) < 0 ) THEN
                   WORK( Z + I + 1 ) = 1.0E0
                ELSE
 *              ensure sine, cosine divisor is far away from zero

From 20c578d56cecc3fff716536ffc94074472c24a22 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Tue, 5 May 2020 12:29:00 +0200
Subject: [PATCH 063/101] SGGQRCS: fix documentation typo

---
 SRC/sggqrcs.f | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 320167a3ad..3125384a32 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -92,8 +92,8 @@
 *>       U1**T*A*Q = D1*( 0 R ),    U2**T*B*Q = D2*( 0 R )
 *>
 *> where U1, U2, and Q are orthogonal matrices. The former GSVD form can
-*> be converted to the latter by computing the LQ decomposition of X. Be
-*> advised that the LQ decomposition may not be backward stable if,
+*> be converted to the latter by computing the RQ decomposition of X. Be
+*> advised that the RQ decomposition may not be backward stable if,
 *> e.g., A and B differ significantly in norm; consider using xGGSVD3 if
 *> you need X factorized in this case.
 *> \endverbatim

From 060d77ca34aa862582a37635f1dd08dcbd48486b Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Tue, 5 May 2020 13:18:07 +0200
Subject: [PATCH 064/101] SGGQRCS: improve documentation

* highlight conditional backward stability
* recommend use of SGGSVD3 when necessary
* emphasize accuracy of computed singular values
---
 SRC/sggqrcs.f | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 3125384a32..c380c5af90 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -49,6 +49,14 @@
 *> factorization with column pivoting and the 2-by-1 CS decomposition to
 *> compute the GSVD.
 *>
+*> SGGQRCS is only conditionally backward stable when the computation of
+*> X is required. If
+*> * you do not know what this means, or if
+*> * the matrices A and B differ significantly in norm, or if
+*> * X is required in factorized form (see below),
+*>
+*> then it is strongly advised to use SGGSVD3 instead.
+*>
 *> Let L be the effective numerical rank of the matrix (A**T,B**T)**T,
 *> then X is a L-by-N nonsingular upper triangular matrix, D1 and
 *> D2 are M-by-L and P-by-L "diagonal" matrices and of the
@@ -89,13 +97,13 @@
 *>
 *> In some literature, the GSVD of A and B is presented in the form
 *>
-*>       U1**T*A*Q = D1*( 0 R ),    U2**T*B*Q = D2*( 0 R )
+*>       A = U1*D1*( 0 R )*Q**T,    B = U2*D2*( 0 R )*Q**T
+*>
+*> where U1, U2, and Q are orthogonal matrices. This latter GSVD form is
+*> computed directly by SGGSVD3. It is possible to convert between the
+*> two representations by calculating the RQ decomposition of X but this
+*> is not recommended for numerical reasons.
 *>
-*> where U1, U2, and Q are orthogonal matrices. The former GSVD form can
-*> be converted to the latter by computing the RQ decomposition of X. Be
-*> advised that the RQ decomposition may not be backward stable if,
-*> e.g., A and B differ significantly in norm; consider using xGGSVD3 if
-*> you need X factorized in this case.
 *> \endverbatim
 *
 *  Arguments:
@@ -269,11 +277,15 @@
 *> \par Further Details:
 *  =====================
 *>
-*>  SGGQRCS should be significantly faster than SGGSVD and SGGSVD3 for
-*>  large matrices because the matrices A and B are reduced to a pair of
+*>  SGGQRCS can compute the singular values with high relative accuracy
+*>  and should be significantly faster than SGGSVD3 for large matrices
+*>  because the matrices A and B are reduced to a pair of
 *>  well-conditioned bidiagonal matrices instead of pairs of upper
 *>  triangular matrices. On the downside, SGGQRCS requires a much larger
-*>  workspace whose dimension must be queried at run-time.
+*>  workspace whose dimension must be queried at run-time and the
+*>  computation of the right-hand side matrix X is only conditionally
+*>  backward stable; A and B must be similar in norm for backward
+*>  stability.
 *>
 *  =====================================================================
       SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,

From dc2f5e0e7dcd5b0837e8ea2237e55e01b252f375 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Tue, 5 May 2020 14:37:30 +0200
Subject: [PATCH 065/101] SGGQRCS: remove row sorting

Row sorting has no discernible effect on forward or backward errors.
---
 SRC/sggqrcs.f | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index c380c5af90..e7794d4df2 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -326,8 +326,8 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       EXTERNAL           LSAME, SLAMCH, SLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEMM, SGEQP3, SLACPY, SLAPMR, SLAPMT, SLASCL,
-     $                   SLASET, SLASRTR, SORGQR, SORCSD2BY1, XERBLA
+      EXTERNAL           SGEMM, SGEQP3, SLACPY, SLAPMT, SLASCL,
+     $                   SLASET, SORGQR, SORCSD2BY1, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -464,17 +464,6 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       UNFL = SLAMCH( 'Safe Minimum' )
       TOL = MAX( M + P, N ) * MAX( NORMG, UNFL ) * ULP
 *
-*     Apply row sorting for QR decomposition
-*     Row sorting is _necessary_ because the norms of A, B might differ
-*     significantly. Row sorting _combined_ with column pivoting leads
-*     to a small row-wise error, cf. §19.4 in N. J. Higham: "Accuracy
-*     and Stability of Numerical Algorithms". 2002.
-      CALL SLASRTR( 'D', M + P, N, G, LDG,
-     $              IWORK( N + 1 ), WORK( Z + 1 ), INFO )
-      IF( INFO.NE.0 ) THEN
-         RETURN
-      ENDIF
-*
 *     IWORK stores the column permutations computed by SGEQP3.
 *     Columns J where IWORK( J ) is non-zero are permuted to the front
 *     so we set the all entries to zero here.
@@ -534,8 +523,6 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       IF ( INFO.NE.0 ) THEN
          RETURN
       END IF
-*     Revert row sorting
-      CALL SLAPMR( .FALSE., M + P, L, G, LDG, IWORK( N + 1 ) )
 *
 *     DEBUG
 *

From 676f894932f9fc5a5b2cccf0c3a280568577c82d Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Wed, 6 May 2020 15:11:26 +0200
Subject: [PATCH 066/101] SGGQRCS: ensure matrix scaling factors always >1

The singular values must be corrected for matrix scaling. Let w be a
power of two such that norm(wA) equals approximately norm(B). Then the
radians representation of the singular values must corrected with

x' = arctan(w * tan(x)),

where x is the computed radians value with matrix scaling and x' is the
computed radians value without matrix scaling. If w is less than one
and if x is large (close to pi/2), then roughly speaking

* tan(x) has a large derivative (i.e. it is badly conditioned),
* w * tan(x) will be smaller than tan(x),
* arctan(w * tan(x)) has derivative near one, because its argument
  is closer to zero.

This commit modifies the GSVD computation such that w >= 1 by choosing
to compute the GSVD of (A, B) or (B, A) depending on the relative norms.
Then

* tan(x) has a small derivative if x is sufficiently small,
* arctan(w * tan(x)) has derivative near zero, and
* if x is large (near pi/2), then x' will be near pi/2 as well.

This commit fixes the large relative forward errors detected by
xGGQRCS_test_singular_values<float>' when the singular values of the
matrix pair (A,B) where drawn from [0, pi/2000).
---
 SRC/sggqrcs.f | 140 +++++++++++++++++++++++++++-----------------------
 1 file changed, 76 insertions(+), 64 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index e7794d4df2..b2dd254614 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
+*       SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, W,
 *                           A, LDA, B, LDB,
 *                           THETA, U1, LDU1, U2, LDU2
 *                           WORK, LWORK, IWORK, INFO )
@@ -155,6 +155,14 @@
 *>          (A**T, B**T)**T.
 *> \endverbatim
 *>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL
+*>          On exit, W is a radix power chosen such that the Frobenius
+*>          norm of A and W*B are with SQRT(RADIX) and 1/SQRT(RADIX) of
+*>          each other.
+*> \endverbatim
+*>
 *> \param[in,out] A
 *> \verbatim
 *>          A is REAL array, dimension (LDA,N)
@@ -288,7 +296,7 @@
 *>  stability.
 *>
 *  =====================================================================
-      SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
+      RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, W,
      $                    A, LDA, B, LDB,
      $                    THETA, U1, LDU1, U2, LDU2,
      $                    WORK, LWORK, IWORK, INFO )
@@ -302,6 +310,7 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     .. Scalar Arguments ..
       CHARACTER          JOBU1, JOBU2, JOBX
       INTEGER            INFO, LDA, LDB, LDU1, LDU2, L, M, N, P, LWORK
+      REAL               W
 *     ..
 *     .. Array Arguments ..
       INTEGER            IWORK( * )
@@ -314,9 +323,9 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
-      INTEGER            I, J, K, K2, LMAX, Z, LDG, LDX, LDVT, LWKOPT
+      INTEGER            I, J, K, K1, LMAX, Z, LDG, LDX, LDVT, LWKOPT
       REAL               BASE, NAN, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
-     $                   T, W
+     $                   T
 *     .. Local Arrays ..
       REAL               G( M + P, N ), VT( N, N )
 *     ..
@@ -330,7 +339,7 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
      $                   SLASET, SORGQR, SORCSD2BY1, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX, MIN
+      INTRINSIC          MAX, MIN, SQRT
 *     ..
 *     .. Executable Statements ..
 *
@@ -342,27 +351,6 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       LQUERY = ( LWORK.EQ.-1 )
       LWKOPT = 1
 *
-*     Initialize variables
-*
-      L = 0
-      LMAX = MIN( M + P, N )
-      Z = ( M + P ) * N
-      IF ( LQUERY ) THEN
-         G = 0
-      ELSE
-         G = WORK( 1 )
-      END IF
-      IF( WANTX .AND. .NOT.LQUERY ) THEN
-         VT = WORK( Z + 1 )
-      ELSE
-         VT = 0
-      END IF
-      LDVT = N
-      LDG = M + P
-*     Computing 0.0 / 0.0 directly causes compiler errors
-      NAN = 1.0E0
-      NAN = 0.0 / (NAN - 1.0E0)
-*
 *     Test the input arguments
 *
       INFO = 0
@@ -379,22 +367,53 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       ELSE IF( P.LT.1 ) THEN
          INFO = -6
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
-         INFO = -9
+         INFO = -10
       ELSE IF( LDB.LT.MAX( 1, P ) ) THEN
-         INFO = -11
+         INFO = -12
       ELSE IF( LDU1.LT.1 .OR. ( WANTU1 .AND. LDU1.LT.M ) ) THEN
-         INFO = -14
+         INFO = -15
       ELSE IF( LDU2.LT.1 .OR. ( WANTU2 .AND. LDU2.LT.P ) ) THEN
-         INFO = -16
+         INFO = -17
       ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
-         INFO = -18
+         INFO = -19
+      END IF
+*
+*     Make sure A is the matrix smaller in norm
+*
+      IF( INFO.EQ.0 ) THEN
+         NORMA = SLANGE( 'F', M, N, A, LDA, WORK )
+         NORMB = SLANGE( 'F', P, N, B, LDB, WORK )
+*
+         IF( NORMA.GT.SQRT( 2.0E0 ) * NORMB ) THEN
+            CALL SGGQRCS( JOBU2, JOBU1, JOBX, P, N, M, L, W,
+     $                    B, LDB, A, LDA,
+     $                    THETA,
+     $                    U2, LDU2, U1, LDU1,
+     $                    WORK, LWORK, IWORK, INFO )
+            W = -W
+            RETURN
+         ENDIF
       END IF
 *
+*     Initialize variables
+*
+*     Computing 0.0 / 0.0 directly causes compiler errors
+      NAN = 1.0E0
+      NAN = 0.0 / (NAN - 1.0E0)
+*
+      L = 0
+      LMAX = MIN( M + P, N )
+      Z = ( M + P ) * N
+      G = WORK( 1 )
+      LDG = M + P
+      VT = 0
+      LDVT = N
+      W = NAN
+*
 *     Compute workspace
 *
       IF( INFO.EQ.0 ) THEN
-*        SLASRTR workspace
-         LWKOPT = M + P
+         LWKOPT = 0
 
          CALL SGEQP3( M+P, N, G, LDG, IWORK, THETA, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
@@ -403,12 +422,12 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          CALL SORGQR( M + P, LMAX, LMAX, G, LDG, THETA, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 
-         CALL SORCSD2BY1( JOBU2, JOBU1, JOBX, M + P, P, LMAX,
+         CALL SORCSD2BY1( JOBU2, JOBU1, JOBX, M + P, M, LMAX,
      $                    G, LDG, G, LDG,
-     $                    THETA, U2, LDU2, U1, LDU1, VT, LDVT,
+     $                    THETA, U1, LDU1, U2, LDU2, VT, LDVT,
      $                    WORK, -1, IWORK, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
-*        The matrix (A, B) must be stored sequentially for SORCSD2BY1
+*        The matrix (A, B) must be stored sequentially for SORGQR
          LWKOPT = LWKOPT + Z
 *        2-by-1 CSD matrix V1 must be stored
          IF( WANTX ) THEN
@@ -425,19 +444,20 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       IF( LQUERY ) THEN
          RETURN
       ENDIF
+*     Finish initialization
+      IF( WANTX ) THEN
+         VT = WORK( Z + 1 )
+      END IF
 *
-*     Scale matrix B such that norm(A) \approx norm(B)
-*
-      NORMA = SLANGE( 'F', M, N, A, LDA, WORK )
-      NORMB = SLANGE( 'F', P, N, B, LDB, WORK )
+*     Scale matrix A such that norm(A) \approx norm(B)
 *
-      IF( NORMB.EQ.0 .OR. NORMA.EQ.0 ) THEN
+      IF( NORMA.EQ.0.0E0 ) THEN
          W = 1.0E0
       ELSE
          BASE = SLAMCH( 'B' )
-         W = BASE ** INT( LOG( NORMA / NORMB ) / LOG( BASE ) )
+         W = BASE ** INT( LOG( NORMB / NORMA ) / LOG( BASE ) )
 *
-         CALL SLASCL( 'G', -1, -1, 1.0E0, W, P, N, B, LDB, INFO )
+         CALL SLASCL( 'G', -1, -1, 1.0E0, W, M, N, A, LDA, INFO )
          IF ( INFO.NE.0 ) THEN
             RETURN
          END IF
@@ -445,8 +465,8 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     Copy matrices A, B into the (M+P) x N matrix G
 *
-      CALL SLACPY( 'A', M, N, A, LDA, G( P + 1, 1 ), LDG )
-      CALL SLACPY( 'A', P, N, B, LDB, G( 1, 1 ), LDG )
+      CALL SLACPY( 'A', M, N, A, LDA, G( 1, 1 ), LDG )
+      CALL SLACPY( 'A', P, N, B, LDB, G( M + 1, 1 ), LDG )
 *
 *     DEBUG
 *
@@ -480,12 +500,12 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     Determine the rank of G
 *
-      DO 20 I = 1, MIN( M + P, N )
+      DO I = 1, MIN( M + P, N )
          IF( ABS( G( I, I ) ).LE.TOL ) THEN
             EXIT
          END IF
          L = L + 1
-   20 CONTINUE
+      END DO
 *
 *     Handle rank=0 case
 *
@@ -530,19 +550,11 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     Compute the CS decomposition of Q1( :, 1:L )
 *
-      IF( WANTX ) THEN
-         CALL SORCSD2BY1( JOBU2, JOBU1, JOBX, M + P, P, L,
-     $                    G( 1, 1 ), LDG, G( P + 1, 1 ), LDG, THETA,
-     $                    U2, LDU2, U1, LDU1, VT, LDVT,
-     $                    WORK( Z + LDVT*N + 1 ), LWORK - Z - LDVT*N,
-     $                    IWORK( N + 1 ), INFO )
-      ELSE
-         CALL SORCSD2BY1( JOBU2, JOBU1, JOBX, M + P, P, L,
-     $                    G( 1, 1 ), LDG, G( P + 1, 1 ), LDG, THETA,
-     $                    U2, LDU2, U1, LDU1, VT, LDVT,
-     $                    WORK( Z + 1 ), LWORK - Z,
-     $                    IWORK, INFO )
-      END IF
+      CALL SORCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, L,
+     $                 G( 1, 1 ), LDG, G( M + 1, 1 ), LDG, THETA,
+     $                 U1, LDU1, U2, LDU2, VT, LDVT,
+     $                 WORK( Z + LDVT*N + 1 ), LWORK - Z - LDVT*N,
+     $                 IWORK( N + 1 ), INFO )
       IF( INFO.NE.0 ) THEN
          RETURN
       END IF
@@ -573,7 +585,7 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *        Prepare row scaling of X
          IF( .NOT. W.EQ.1.0E0 ) THEN
             K = MIN( M, P, L, M + P - L )
-            K2 = MAX( L - M, 0 )
+            K1 = MAX( L - P, 0 )
             DO I = 1, K
                T = THETA( I )
 *              Do not adjust singular value if THETA(I) is greater
@@ -593,12 +605,12 @@ SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
             END DO
 *           Adjust rows of X for matrix scaling
             DO J = 0, N-1
-               DO I = 1, K2
+               DO I = 1, K1
                   WORK( LDX*J + I + 1 ) = WORK( LDX*J + I + 1 ) / W
                END DO
                DO I = 1, K
-                  WORK( LDX*J + I + K2 + 1 ) =
-     $            WORK( LDX*J + I + K2 + 1 ) * WORK( Z + I + 1 )
+                  WORK( LDX*J + I + K1 + 1 ) =
+     $            WORK( LDX*J + I + K1 + 1 ) * WORK( Z + I + 1 )
                END DO
             END DO
          END IF

From b5eb358540874abe009c3a15ecb68ad36fa807cc Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Thu, 7 May 2020 16:35:26 +0200
Subject: [PATCH 067/101] SGGQRCS: return sine, cosine values
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously SGGQRCS returned angles instead of sine, cosine values but
these computations (while numerically stable) may completely destroy the
backward stability of the solver. Specifically, in one test* a computed
angle x near π/2 was the best single-precision approximation to the true
value but cosine(x) was nowhere near the true cosine value, thereby
making it impossible to achieve a backward stable reassembly of the
input matrices from the GSVD.

For this reason, SGGQRCS returns sine and cosine values to the caller
instead of angles.

TODO: Documentation

*Test name  xGGQRCS_test_singular_accuracy_vs_radians_accuracy
---
 SRC/sggqrcs.f | 191 +++++++++++++++++++++++++++++---------------------
 1 file changed, 112 insertions(+), 79 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index b2dd254614..016e0d7a62 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -18,9 +18,10 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, W,
+*       SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, SWAPPED,
 *                           A, LDA, B, LDB,
-*                           THETA, U1, LDU1, U2, LDU2
+*                           ALPHA, BETA,
+*                           U1, LDU1, U2, LDU2
 *                           WORK, LWORK, IWORK, INFO )
 *
 *       .. Scalar Arguments ..
@@ -29,7 +30,8 @@
 *       ..
 *       .. Array Arguments ..
 *       INTEGER            IWORK( * )
-*       REAL               A( LDA, * ), B( LDB, * ), THETA( * ),
+*       REAL               A( LDA, * ), B( LDB, * ),
+*      $                   ALPHA( N ), BETA( N ),
 *      $                   U1( LDU1, * ), U2( LDU2, * ),
 *      $                   WORK( * )
 *       ..
@@ -77,8 +79,8 @@
 *>   K  = MIN(M, P, L, M + P - L),
 *>   K1 = MAX(L - P, 0),
 *>   K2 = MAX(L - M, 0),
-*>   C  = diag( COS(THETA(1)), ..., COS(THETA(K)) ),
-*>   S  = diag( SIN(THETA(1)), ..., SIN(THETA(K)) ), and
+*>   S  = diag( ALPHA(1), ..., ALPHA(K) ),
+*>   C  = diag( BETA(1), ..., BETA(K) ), and
 *>   C^2 + S^2 = I.
 *>
 *> The routine computes C, S and optionally the matrices U1, U2, and X.
@@ -155,12 +157,12 @@
 *>          (A**T, B**T)**T.
 *> \endverbatim
 *>
-*> \param[out] W
+*> \param[out] SWAPPED
 *> \verbatim
-*>          W is REAL
-*>          On exit, W is a radix power chosen such that the Frobenius
-*>          norm of A and W*B are with SQRT(RADIX) and 1/SQRT(RADIX) of
-*>          each other.
+*>          L is LOGICAL
+*>          On exit, SWAPPED is true if SGGQRCS swapped the input
+*>          matrices A, B and computed the GSVD of (B, A); false
+*>          otherwise.
 *> \endverbatim
 *>
 *> \param[in,out] A
@@ -187,12 +189,19 @@
 *>          The leading dimension of the array B. LDB >= max(1,P).
 *> \endverbatim
 *>
-*> \param[out] THETA
+*> \param[out] ALPHA
+*> \verbatim
+*>          ALPHA is REAL array, dimension (N)
+*> \endverbatim
+*>
+*> \param[out] BETA
 *> \verbatim
-*>          THETA is REAL array, dimension (N)
+*>          BETA is REAL array, dimension (N)
 *>
-*>          On exit, THETA contains K = MIN(M, P, L, M + P - L) values
-*>          in radians in ascending order.
+*>          On exit, ALPHA and BETA contain the K generalized singular
+*>          value pairs of A and B, where
+*>            ALPHA(1:K) = S,
+*>            BETA(1:K)  = C.
 *> \endverbatim
 *>
 *> \param[out] U1
@@ -257,6 +266,14 @@
 *> \par Internal Parameters:
 *  =========================
 *>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL
+*>          W is a radix power chosen such that the Frobenius norm of A
+*>          and W*B are with SQRT(RADIX) and 1/SQRT(RADIX) of each
+*>          other.
+*> \endverbatim
+*>
 *> \verbatim
 *>  TOL     REAL
 *>          Let G = (A**T,B**T)**T. TOL is the threshold to determine
@@ -272,7 +289,7 @@
 *
 *> \author Christoph Conrads (https://christoph-conrads.name)
 *
-*> \date October 2019
+*> \date October 2019, May 2020
 *
 *> \ingroup realGEsing
 *
@@ -296,10 +313,12 @@
 *>  stability.
 *>
 *  =====================================================================
-      RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, W,
-     $                    A, LDA, B, LDB,
-     $                    THETA, U1, LDU1, U2, LDU2,
-     $                    WORK, LWORK, IWORK, INFO )
+      RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
+     $                              SWAPPED,
+     $                              A, LDA, B, LDB,
+     $                              ALPHA, BETA,
+     $                              U1, LDU1, U2, LDU2,
+     $                              WORK, LWORK, IWORK, INFO )
 *
 *  -- LAPACK driver routine (version 3.7.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -308,13 +327,14 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, W,
 *
       IMPLICIT NONE
 *     .. Scalar Arguments ..
+      LOGICAL            SWAPPED
       CHARACTER          JOBU1, JOBU2, JOBX
       INTEGER            INFO, LDA, LDB, LDU1, LDU2, L, M, N, P, LWORK
-      REAL               W
 *     ..
 *     .. Array Arguments ..
       INTEGER            IWORK( * )
-      REAL               A( LDA, * ), B( LDB, * ), THETA( * ),
+      REAL               A( LDA, * ), B( LDB, * ),
+     $                   ALPHA( N ), BETA( N ),
      $                   U1( LDU1, * ), U2( LDU2, * ),
      $                   WORK( * )
 *     ..
@@ -325,7 +345,7 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, W,
       LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
       INTEGER            I, J, K, K1, LMAX, Z, LDG, LDX, LDVT, LWKOPT
       REAL               BASE, NAN, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
-     $                   T
+     $                   THETA, IOTA, W
 *     .. Local Arrays ..
       REAL               G( M + P, N ), VT( N, N )
 *     ..
@@ -339,7 +359,7 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, W,
      $                   SLASET, SORGQR, SORCSD2BY1, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX, MIN, SQRT
+      INTRINSIC          COS, MAX, MIN, SIN, SQRT
 *     ..
 *     .. Executable Statements ..
 *
@@ -367,9 +387,9 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, W,
       ELSE IF( P.LT.1 ) THEN
          INFO = -6
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
-         INFO = -10
+         INFO = -9
       ELSE IF( LDB.LT.MAX( 1, P ) ) THEN
-         INFO = -12
+         INFO = -11
       ELSE IF( LDU1.LT.1 .OR. ( WANTU1 .AND. LDU1.LT.M ) ) THEN
          INFO = -15
       ELSE IF( LDU2.LT.1 .OR. ( WANTU2 .AND. LDU2.LT.P ) ) THEN
@@ -385,12 +405,13 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, W,
          NORMB = SLANGE( 'F', P, N, B, LDB, WORK )
 *
          IF( NORMA.GT.SQRT( 2.0E0 ) * NORMB ) THEN
-            CALL SGGQRCS( JOBU2, JOBU1, JOBX, P, N, M, L, W,
+            CALL SGGQRCS( JOBU2, JOBU1, JOBX, P, N, M, L,
+     $                    SWAPPED,
      $                    B, LDB, A, LDA,
-     $                    THETA,
+     $                    BETA, ALPHA,
      $                    U2, LDU2, U1, LDU1,
      $                    WORK, LWORK, IWORK, INFO )
-            W = -W
+            SWAPPED = .TRUE.
             RETURN
          ENDIF
       END IF
@@ -401,6 +422,7 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, W,
       NAN = 1.0E0
       NAN = 0.0 / (NAN - 1.0E0)
 *
+      SWAPPED = .FALSE.
       L = 0
       LMAX = MIN( M + P, N )
       Z = ( M + P ) * N
@@ -408,23 +430,26 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, W,
       LDG = M + P
       VT = 0
       LDVT = N
+      THETA = NAN
+      IOTA = NAN
       W = NAN
 *
 *     Compute workspace
 *
       IF( INFO.EQ.0 ) THEN
          LWKOPT = 0
-
-         CALL SGEQP3( M+P, N, G, LDG, IWORK, THETA, WORK, -1, INFO )
+*
+         CALL SGEQP3( M+P, N, G, LDG, IWORK, ALPHA, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
          LWKOPT = INT( WORK( 1 ) )
-
-         CALL SORGQR( M + P, LMAX, LMAX, G, LDG, THETA, WORK, -1, INFO )
+*
+         CALL SORGQR( M + P, LMAX, LMAX, G, LDG, ALPHA, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
-
-         CALL SORCSD2BY1( JOBU2, JOBU1, JOBX, M + P, M, LMAX,
+*
+         CALL SORCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, LMAX,
      $                    G, LDG, G, LDG,
-     $                    THETA, U1, LDU1, U2, LDU2, VT, LDVT,
+     $                    ALPHA,
+     $                    U1, LDU1, U2, LDU2, VT, LDVT,
      $                    WORK, -1, IWORK, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *        The matrix (A, B) must be stored sequentially for SORGQR
@@ -433,7 +458,7 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, W,
          IF( WANTX ) THEN
             LWKOPT = LWKOPT + LDVT*N
          END IF
-
+*
          WORK( 1 ) = REAL( LWKOPT )
       END IF
 *
@@ -492,7 +517,7 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, W,
 *
 *     Compute the QR factorization with column pivoting GΠ = Q1 R1
 *
-      CALL SGEQP3( M + P, N, G, LDG, IWORK, THETA,
+      CALL SGEQP3( M + P, N, G, LDG, IWORK, ALPHA,
      $             WORK( Z + 1 ), LWORK - Z, INFO )
       IF( INFO.NE.0 ) THEN
          RETURN
@@ -538,7 +563,7 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, W,
 *
 *     Explicitly form Q1 so that we can compute the CS decomposition
 *
-      CALL SORGQR( M + P, L, L, G, LDG, THETA,
+      CALL SORGQR( M + P, L, L, G, LDG, ALPHA,
      $             WORK( Z + 1 ), LWORK - Z, INFO )
       IF ( INFO.NE.0 ) THEN
          RETURN
@@ -546,12 +571,16 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, W,
 *
 *     DEBUG
 *
-      THETA( 1:N ) = NAN
+      ALPHA( 1:N ) = NAN
+      BETA( 1:N ) = NAN
 *
 *     Compute the CS decomposition of Q1( :, 1:L )
 *
+      K = MIN( M, P, L, M + P - L )
+      K1 = MAX( L - P, 0 )
       CALL SORCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, L,
-     $                 G( 1, 1 ), LDG, G( M + 1, 1 ), LDG, THETA,
+     $                 G( 1, 1 ), LDG, G( M + 1, 1 ), LDG,
+     $                 ALPHA,
      $                 U1, LDU1, U2, LDU2, VT, LDVT,
      $                 WORK( Z + LDVT*N + 1 ), LWORK - Z - LDVT*N,
      $                 IWORK( N + 1 ), INFO )
@@ -581,48 +610,52 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, W,
          END IF
 *        Revert column permutation Π by permuting the columns of X
          CALL SLAPMT( .FALSE., L, N, WORK( 2 ), LDX, IWORK )
-*        Adjust generalized singular values for matrix scaling
-*        Prepare row scaling of X
-         IF( .NOT. W.EQ.1.0E0 ) THEN
-            K = MIN( M, P, L, M + P - L )
-            K1 = MAX( L - P, 0 )
-            DO I = 1, K
-               T = THETA( I )
-*              Do not adjust singular value if THETA(I) is greater
-*              than pi/2 (infinite singular values won't change)
-               IF( TAN( T ) < 0 ) THEN
-                  WORK( Z + I + 1 ) = 1.0E0
-               ELSE
-*              ensure sine, cosine divisor is far away from zero
-*              w is a power of two and will cause no trouble
-                  THETA( I ) = ATAN( W * TAN( T ) )
-                  IF( SIN( THETA( I ) ) .GE. COS( THETA( I ) ) ) THEN
-                     WORK( Z + I + 1 ) = SIN( T ) / SIN( THETA( I ) )
-                  ELSE
-                     WORK( Z + I + 1 ) = COS( T ) / COS( THETA( I ) ) /W
-                  END IF
+      END IF
+*
+*     Adjust generalized singular values for matrix scaling
+*     Compute sine, cosine values
+*     Prepare row scaling of X
+*
+      DO I = 1, K
+         THETA = ALPHA( I )
+*        Do not adjust singular value if THETA is greater
+*        than pi/2 (infinite singular values won't change)
+         IF( COS( THETA ).LE.0.0E0 ) THEN
+            ALPHA( I ) = 0.0E0
+            BETA( I ) = 1.0E0
+            IF( WANTX ) THEN
+               WORK( Z + I + 1 ) = 1.0E0
+            END IF
+         ELSE
+*           iota comes in the greek alphabet after theta
+            IOTA = ATAN( W * TAN( THETA ) )
+*           ensure sine, cosine divisor is far away from zero
+*           w is a power of two and will cause no trouble
+            IF( SIN( IOTA ) .GE. COS( IOTA ) ) THEN
+               ALPHA( I ) =  ( SIN( IOTA ) / TAN( THETA ) ) / W
+               BETA( I ) = SIN( IOTA )
+               IF( WANTX ) THEN
+                  WORK( Z + I + 1 ) = SIN( THETA ) / SIN( IOTA )
+               END IF
+            ELSE
+               ALPHA( I ) = COS( IOTA )
+               BETA( I ) = SIN( IOTA )
+               IF( WANTX ) THEN
+                  WORK( Z + I + 1 ) = COS( THETA ) / COS( IOTA ) / W
                END IF
+            END IF
+         END IF
+      END DO
+*     Adjust rows of X for matrix scaling
+      IF( WANTX ) THEN
+         DO J = 0, N-1
+            DO I = 1, K1
+               WORK( LDX*J + I + 1 ) = WORK( LDX*J + I + 1 ) / W
             END DO
-*           Adjust rows of X for matrix scaling
-            DO J = 0, N-1
-               DO I = 1, K1
-                  WORK( LDX*J + I + 1 ) = WORK( LDX*J + I + 1 ) / W
-               END DO
-               DO I = 1, K
-                  WORK( LDX*J + I + K1 + 1 ) =
-     $            WORK( LDX*J + I + K1 + 1 ) * WORK( Z + I + 1 )
-               END DO
+            DO I = 1, K
+               WORK( LDX*J + I + K1 + 1 ) =
+     $         WORK( LDX*J + I + K1 + 1 ) * WORK( Z + I + 1 )
             END DO
-         END IF
-      ELSE IF( .NOT. W.EQ.1.0E0 ) THEN
-*
-*        Adjust only generalized singular values for matrix scaling
-*
-         DO I = 1, MIN( M, P, L, M + P - L )
-*           Do not adjust singular value if THETA(I) is greater than pi/2
-            IF( TAN( THETA(I) ) > 0 ) THEN
-               THETA(I) = ATAN( W * TAN( THETA(I) ) )
-            END IF
          END DO
       END IF
 *

From 4202857bcf0dd79938f9e15648632f1c67d29fc5 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Thu, 7 May 2020 18:37:35 +0200
Subject: [PATCH 068/101] SGGQRCS: avoid needless matrix norm recomputation

---
 SRC/sggqrcs.f | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 016e0d7a62..de450d59d5 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -414,6 +414,13 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
             SWAPPED = .TRUE.
             RETURN
          ENDIF
+*
+*     Past this point, we know that
+*     * NORMA <= NORMB (almost)
+*     * W >= 1
+*     * ALPHA will contain cosine values at the end
+*     * BETA will contain sine values at the end
+*
       END IF
 *
 *     Initialize variables
@@ -500,7 +507,7 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     Compute the Frobenius norm of matrix G
 *
-      NORMG = SLANGE( 'F', M + P, N, G, LDG, WORK( Z + 1 ) )
+      NORMG = NORMB * SQRT( 1.0E0 + ( ( W * NORMA ) / NORMB )**2 )
 *
 *     Get machine precision and set up threshold for determining
 *     the effective numerical rank of the matrix G.

From 6e5c3a685af72efbf7817b2977d36d3cbcfaf1d2 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Thu, 7 May 2020 19:04:12 +0200
Subject: [PATCH 069/101] SGGQRCS: update documentation

---
 SRC/sggqrcs.f | 73 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 40 insertions(+), 33 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index de450d59d5..cd71591cbc 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -51,34 +51,45 @@
 *> factorization with column pivoting and the 2-by-1 CS decomposition to
 *> compute the GSVD.
 *>
-*> SGGQRCS is only conditionally backward stable when the computation of
-*> X is required. If
-*> * you do not know what this means, or if
-*> * the matrices A and B differ significantly in norm, or if
-*> * X is required in factorized form (see below),
-*>
-*> then it is strongly advised to use SGGSVD3 instead.
-*>
 *> Let L be the effective numerical rank of the matrix (A**T,B**T)**T,
-*> then X is a L-by-N nonsingular upper triangular matrix, D1 and
-*> D2 are M-by-L and P-by-L "diagonal" matrices and of the
-*> following structures, respectively:
+*> then X is a L-by-N nonsingular matrix, D1 and D2 are M-by-L and
+*> P-by-L "diagonal" matrices. If SWAPPED is false, then D1 and D2 are
+*> of the of the following structures, respectively:
 *>
-*>                        K   K1
-*>        D1 =     (  0   0   0 )
-*>              K  (  0   S   0 )
-*>              K1 (  0   0   I )
+*>                 K1  K
+*>            K1 [ I   0   0 ]
+*>       D1 = K  [ 0   C   0 ]
+*>               [ 0   0   0 ]
 *>
-*>                    K2  K
-*>        D2 =  K2 (  I   0   0 )
-*>              K  (  0   C   0 )
-*>                 (  0   0   0 )
+*>                     K   K2
+*>               [ 0   0   0 ]
+*>       D2 = K  [ 0   S   0 ]
+*>            K2 [ 0   0   I ]
 *>
 *> where
 *>
 *>   K  = MIN(M, P, L, M + P - L),
 *>   K1 = MAX(L - P, 0),
 *>   K2 = MAX(L - M, 0),
+*>   C  = diag( ALPHA(1), ..., ALPHA(K) ),
+*>   S  = diag( BETA(1), ..., BETA(K) ), and
+*>   C^2 + S^2 = I.
+*>
+*> If SWAPPED is true, then D1 and D2 are of the of the following
+*> structures, respectively:
+*>
+*>                     K   K1
+*>               [ 0   0   0 ]
+*>       D1 = K  [ 0   S   0 ]
+*>            K1 [ 0   0   I ]
+*>
+*>                 K2  K
+*>            K2 [ I   0   0 ]
+*>       D2 = K  [ 0   C   0 ]
+*>               [ 0   0   0 ]
+*>
+*> where
+*>
 *>   S  = diag( ALPHA(1), ..., ALPHA(K) ),
 *>   C  = diag( BETA(1), ..., BETA(K) ), and
 *>   C^2 + S^2 = I.
@@ -86,8 +97,8 @@
 *> The routine computes C, S and optionally the matrices U1, U2, and X.
 *> On exit, X is stored in WORK( 2:L*N+1 ).
 *>
-*> In particular, if B is an N-by-N nonsingular matrix, then the GSVD of
-*> A and B implicitly gives the SVD of A*inv(B):
+*> If B is an N-by-N nonsingular matrix, then the GSVD of the matrix
+*> pair (A, B) implicitly gives the SVD of A*inv(B):
 *>
 *>       A*inv(B) = U1*(D1*inv(D2))*U2**T.
 *>
@@ -102,9 +113,9 @@
 *>       A = U1*D1*( 0 R )*Q**T,    B = U2*D2*( 0 R )*Q**T
 *>
 *> where U1, U2, and Q are orthogonal matrices. This latter GSVD form is
-*> computed directly by SGGSVD3. It is possible to convert between the
+*> computed directly by DGGSVD3. It is possible to convert between the
 *> two representations by calculating the RQ decomposition of X but this
-*> is not recommended for numerical reasons.
+*> is not recommended for reasons of numerical stability.
 *>
 *> \endverbatim
 *
@@ -199,9 +210,7 @@
 *>          BETA is REAL array, dimension (N)
 *>
 *>          On exit, ALPHA and BETA contain the K generalized singular
-*>          value pairs of A and B, where
-*>            ALPHA(1:K) = S,
-*>            BETA(1:K)  = C.
+*>          value pairs of A and B.
 *> \endverbatim
 *>
 *> \param[out] U1
@@ -302,15 +311,13 @@
 *> \par Further Details:
 *  =====================
 *>
-*>  SGGQRCS can compute the singular values with high relative accuracy
-*>  and should be significantly faster than SGGSVD3 for large matrices
-*>  because the matrices A and B are reduced to a pair of
+*>  SGGQRCS should be significantly faster than DGGSVD3 for large
+*>  matrices because the matrices A and B are reduced to a pair of
 *>  well-conditioned bidiagonal matrices instead of pairs of upper
 *>  triangular matrices. On the downside, SGGQRCS requires a much larger
-*>  workspace whose dimension must be queried at run-time and the
-*>  computation of the right-hand side matrix X is only conditionally
-*>  backward stable; A and B must be similar in norm for backward
-*>  stability.
+*>  workspace whose dimension must be queried at run-time. SGGQRCS also
+*>  offers no guarantees which of the two possible diagonal matrices
+*>  is used for the matrix factorization.
 *>
 *  =====================================================================
       RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,

From 29eff1bd9dbf56e76849a9d0d97fae8bece87bb4 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Thu, 7 May 2020 19:11:33 +0200
Subject: [PATCH 070/101] DGGQRCS: update implementation

Port the most recent SGGQRCS code to double precision.
---
 SRC/dggqrcs.f | 507 ++++++++++++++++++++++++++++----------------------
 1 file changed, 280 insertions(+), 227 deletions(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 8ed449da57..69662c7943 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -18,21 +18,21 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
+*       SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, SWAPPED,
 *                           A, LDA, B, LDB,
-*                           THETA, U1, LDU1, U2, LDU2, QT, LDQT,
+*                           ALPHA, BETA,
+*                           U1, LDU1, U2, LDU2
 *                           WORK, LWORK, IWORK, INFO )
 *
 *       .. Scalar Arguments ..
-*       CHARACTER          JOBU1, JOB2, JOBQT
-*       INTEGER            INFO, LDA, LDB, LDU1, LDU2, LDQT,
-*      $                   M, N, P, L, LWORK
-*       DOUBLE PRECISION   W
+*       CHARACTER          JOBU1, JOB2, JOBX
+*       INTEGER            INFO, LDA, LDB, LDU1, LDU2, M, N, P, L, LWORK
 *       ..
 *       .. Array Arguments ..
 *       INTEGER            IWORK( * )
-*       DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), THETA( * ),
-*      $                   U1( LDU1, * ), U2( LDU2, * ), QT( LDQ, * ),
+*       DOUBLE PRECISION   A( LDA, * ), B( LDB, * ),
+*      $                   ALPHA( N ), BETA( N ),
+*      $                   U1( LDU1, * ), U2( LDU2, * ),
 *      $                   WORK( * )
 *       ..
 *
@@ -45,60 +45,78 @@
 *> DGGQRCS computes the generalized singular value decomposition (GSVD)
 *> of an M-by-N real matrix A and P-by-N real matrix B:
 *>
-*>       U1**T*A*Q = D1*( 0 R ),    U2**T*B*Q = D2*( 0 R )
+*>       A = U1 * D1 * X,           B = U2 * D2 * X
 *>
-*> where U1, U2, and Q are orthogonal matrices. DGGQRCS uses the QR
+*> where U1 and U2 are orthogonal matrices. DGGQRCS uses the QR
 *> factorization with column pivoting and the 2-by-1 CS decomposition to
 *> compute the GSVD.
 *>
 *> Let L be the effective numerical rank of the matrix (A**T,B**T)**T,
-*> then R is a L-by-L nonsingular upper triangular matrix, D1 and
-*> D2 are M-by-L and P-by-L "diagonal" matrices and of the
-*> following structures, respectively:
+*> then X is a L-by-N nonsingular matrix, D1 and D2 are M-by-L and
+*> P-by-L "diagonal" matrices. If SWAPPED is false, then D1 and D2 are
+*> of the of the following structures, respectively:
 *>
-*>                        K   K1
-*>        D1 =     (  0   0   0 )
-*>              K  (  0   S   0 )
-*>              K1 (  0   0   I )
+*>                 K1  K
+*>            K1 [ I   0   0 ]
+*>       D1 = K  [ 0   C   0 ]
+*>               [ 0   0   0 ]
 *>
-*>                    K2  K
-*>        D2 =  K2 (  I   0   0 )
-*>              K  (  0   C   0 )
-*>                 (  0   0   0 )
-*>
-*>                 N-L  L
-*>   ( 0 R ) = L (  0   R )
+*>                     K   K2
+*>               [ 0   0   0 ]
+*>       D2 = K  [ 0   S   0 ]
+*>            K2 [ 0   0   I ]
 *>
 *> where
 *>
 *>   K  = MIN(M, P, L, M + P - L),
 *>   K1 = MAX(L - P, 0),
 *>   K2 = MAX(L - M, 0),
-*>   C  = diag( COS(THETA(1)), ..., COS(THETA(K)) ),
-*>   S  = diag( SIN(THETA(1)), ..., SIN(THETA(K)) ), and
+*>   C  = diag( ALPHA(1), ..., ALPHA(K) ),
+*>   S  = diag( BETA(1), ..., BETA(K) ), and
+*>   C^2 + S^2 = I.
+*>
+*> If SWAPPED is true, then D1 and D2 are of the of the following
+*> structures, respectively:
+*>
+*>                     K   K1
+*>               [ 0   0   0 ]
+*>       D1 = K  [ 0   S   0 ]
+*>            K1 [ 0   0   I ]
+*>
+*>                 K2  K
+*>            K2 [ I   0   0 ]
+*>       D2 = K  [ 0   C   0 ]
+*>               [ 0   0   0 ]
+*>
+*> where
+*>
+*>   S  = diag( ALPHA(1), ..., ALPHA(K) ),
+*>   C  = diag( BETA(1), ..., BETA(K) ), and
 *>   C^2 + S^2 = I.
 *>
-*> The routine computes C, S, R, and optionally the orthogonal
-*> transformation matrices U, V and Q. If L <= M, then R is stored in
-*> A(1:L, 1:L) on exit. Otherwise, the first M rows of R are stored in
-*> A(:, 1:L) and R( M+1:, M+1: ) is stored in B(1:L-M, 1:L-M). In both
-*> cases, only the upper triangular part is stored.
+*> The routine computes C, S and optionally the matrices U1, U2, and X.
+*> On exit, X is stored in WORK( 2:L*N+1 ).
+*>
+*> If B is an N-by-N nonsingular matrix, then the GSVD of the matrix
+*> pair (A, B) implicitly gives the SVD of A*inv(B):
+*>
+*>       A*inv(B) = U1*(D1*inv(D2))*U2**T.
 *>
-*> In particular, if B is an N-by-N nonsingular matrix, then the GSVD of
-*> A and B implicitly gives the SVD of A*inv(B):
-*>                      A*inv(B) = U1*(D1*inv(D2))*U2**T.
 *> If (A**T,B**T)**T  has orthonormal columns, then the GSVD of A and B
 *> is also equal to the CS decomposition of A and B. Furthermore, the
 *> GSVD can be used to derive the solution of the eigenvalue problem:
-*>                      A**T*A x = lambda * B**T*B x.
+*>
+*>       A**T*A x = lambda * B**T*B x.
+*>
 *> In some literature, the GSVD of A and B is presented in the form
-*>                  U1**T*A*X = ( 0 D1 ),   U2**T*B*X = ( 0 D2 )
-*> where U1 and U2 are orthogonal and X is nonsingular, D1 and D2 are
-*> ``diagonal''.  The former GSVD form can be converted to the latter
-*> form by taking the nonsingular matrix X as
 *>
-*>                      X = Q*( I   0    )
-*>                            ( 0 inv(R) ).
+*>       A = U1*D1*( 0 R )*Q**T,    B = U2*D2*( 0 R )*Q**T
+*>
+*> where U1, U2, and Q are orthogonal matrices. This latter GSVD form is
+*> computed directly by DGGSVD3. It is possible to convert between the
+*> two representations by calculating the RQ decomposition of X but this
+*> is not recommended for reasons of numerical stability.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -118,11 +136,11 @@
 *>          = 'N':  U2 is not computed.
 *> \endverbatim
 *>
-*> \param[in] JOBQT
+*> \param[in] JOBX
 *> \verbatim
-*>          JOBQT is CHARACTER*1
-*>          = 'Y':  Orthogonal matrix Q is computed;
-*>          = 'N':  Q is not computed.
+*>          JOBX is CHARACTER*1
+*>          = 'Y':  Matrix X is computed;
+*>          = 'N':  X is not computed.
 *> \endverbatim
 *>
 *> \param[in] M
@@ -143,15 +161,6 @@
 *>          The number of rows of the matrix B.  P >= 1.
 *> \endverbatim
 *>
-*> \param[out] W
-*> \verbatim
-*>          W is DOUBLE PRECISION
-*>
-*>          On exit, W is a radix power chosen such that the Frobenius
-*>          norm of A and W*B are within sqrt(radix) and 1/sqrt(radix)
-*>          of each other.
-*> \endverbatim
-*>
 *> \param[out] L
 *> \verbatim
 *>          L is INTEGER
@@ -159,12 +168,18 @@
 *>          (A**T, B**T)**T.
 *> \endverbatim
 *>
+*> \param[out] SWAPPED
+*> \verbatim
+*>          L is LOGICAL
+*>          On exit, SWAPPED is true if DGGQRCS swapped the input
+*>          matrices A, B and computed the GSVD of (B, A); false
+*>          otherwise.
+*> \endverbatim
+*>
 *> \param[in,out] A
 *> \verbatim
 *>          A is DOUBLE PRECISION array, dimension (LDA,N)
 *>          On entry, the M-by-N matrix A.
-*>          On exit, A contains the triangular matrix R or the first M
-*>          rows of R, respectively. See Purpose for details.
 *> \endverbatim
 *>
 *> \param[in] LDA
@@ -177,8 +192,6 @@
 *> \verbatim
 *>          B is DOUBLE PRECISION array, dimension (LDB,N)
 *>          On entry, the P-by-N matrix B.
-*>          On exit, if L > M, then B contains the last L - M rows of
-*>          the triangular matrix R. See Purpose for details.
 *> \endverbatim
 *>
 *> \param[in] LDB
@@ -187,12 +200,17 @@
 *>          The leading dimension of the array B. LDB >= max(1,P).
 *> \endverbatim
 *>
-*> \param[out] THETA
+*> \param[out] ALPHA
+*> \verbatim
+*>          ALPHA is DOUBLE PRECISION array, dimension (N)
+*> \endverbatim
+*>
+*> \param[out] BETA
 *> \verbatim
-*>          THETA is DOUBLE PRECISION array, dimension (N)
+*>          BETA is DOUBLE PRECISION array, dimension (N)
 *>
-*>          On exit, THETA contains K = MIN(M, P, L, M + P - L) values
-*>          in radians in ascending order.
+*>          On exit, ALPHA and BETA contain the K generalized singular
+*>          value pairs of A and B.
 *> \endverbatim
 *>
 *> \param[out] U1
@@ -223,20 +241,6 @@
 *>          JOBU2 = 'Y'; LDU2 >= 1 otherwise.
 *> \endverbatim
 *>
-*> \param[out] QT
-*> \verbatim
-*>          QT is DOUBLE PRECISION array, dimension (LDQT,N)
-*>          If JOBQT = 'Y', QT contains the N-by-N orthogonal matrix
-*>          Q**T.
-*> \endverbatim
-*>
-*> \param[in] LDQT
-*> \verbatim
-*>          LDQT is INTEGER
-*>          The leading dimension of the array QT. LDQT >= max(1,N) if
-*>          JOBQT = 'Y'; LDQT >= 1 otherwise.
-*> \endverbatim
-*>
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
@@ -271,6 +275,14 @@
 *> \par Internal Parameters:
 *  =========================
 *>
+*> \param[out] W
+*> \verbatim
+*>          W is DOUBLE PRECISION
+*>          W is a radix power chosen such that the Frobenius norm of A
+*>          and W*B are with SQRT(RADIX) and 1/SQRT(RADIX) of each
+*>          other.
+*> \endverbatim
+*>
 *> \verbatim
 *>  TOL     DOUBLE PRECISION
 *>          Let G = (A**T,B**T)**T. TOL is the threshold to determine
@@ -286,7 +298,7 @@
 *
 *> \author Christoph Conrads (https://christoph-conrads.name)
 *
-*> \date September 2016
+*> \date October 2019, May 2020
 *
 *> \ingroup doubleGEsing
 *
@@ -299,17 +311,21 @@
 *> \par Further Details:
 *  =====================
 *>
-*>  DGGQRCS should be significantly faster than DGGSVD and DGGSVD3 for
-*>  large matrices because the matrices A and B are reduced to a pair of
+*>  DGGQRCS should be significantly faster than DGGSVD3 for large
+*>  matrices because the matrices A and B are reduced to a pair of
 *>  well-conditioned bidiagonal matrices instead of pairs of upper
 *>  triangular matrices. On the downside, DGGQRCS requires a much larger
-*>  workspace whose dimension must be queried at run-time.
+*>  workspace whose dimension must be queried at run-time. DGGQRCS also
+*>  offers no guarantees which of the two possible diagonal matrices
+*>  is used for the matrix factorization.
 *>
 *  =====================================================================
-      SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
-     $                    A, LDA, B, LDB,
-     $                    THETA, U1, LDU1, U2, LDU2, QT, LDQT,
-     $                    WORK, LWORK, IWORK, INFO )
+      RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
+     $                              SWAPPED,
+     $                              A, LDA, B, LDB,
+     $                              ALPHA, BETA,
+     $                              U1, LDU1, U2, LDU2,
+     $                              WORK, LWORK, IWORK, INFO )
 *
 *  -- LAPACK driver routine (version 3.7.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -318,26 +334,27 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
       IMPLICIT NONE
 *     .. Scalar Arguments ..
-      CHARACTER          JOBU1, JOBU2, JOBQT
-      INTEGER            INFO, LDA, LDB, LDU1, LDU2, LDQT,
-     $                   L, M, N, P, LWORK
-      DOUBLE PRECISION   W
+      LOGICAL            SWAPPED
+      CHARACTER          JOBU1, JOBU2, JOBX
+      INTEGER            INFO, LDA, LDB, LDU1, LDU2, L, M, N, P, LWORK
 *     ..
 *     .. Array Arguments ..
       INTEGER            IWORK( * )
-      DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), THETA( * ),
-     $                   U1( LDU1, * ), U2( LDU2, * ), QT( LDQT, * ),
+      DOUBLE PRECISION   A( LDA, * ), B( LDB, * ),
+     $                   ALPHA( N ), BETA( N ),
+     $                   U1( LDU1, * ), U2( LDU2, * ),
      $                   WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Local Scalars ..
-      LOGICAL            WANTU1, WANTU2, WANTQT, LQUERY
-      INTEGER            I, J, LMAX, Z, LDG, LWKOPT
-      DOUBLE PRECISION   GNORM, TOL, ULP, UNFL, NORMA, NORMB, BASE, NAN
+      LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
+      INTEGER            I, J, K, K1, LMAX, Z, LDG, LDX, LDVT, LWKOPT
+      DOUBLE PRECISION   BASE, NAN, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
+     $                   THETA, IOTA, W
 *     .. Local Arrays ..
-      DOUBLE PRECISION   G( M + P, N )
+      DOUBLE PRECISION   G( M + P, N ), VT( N, N )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -345,11 +362,11 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       EXTERNAL           LSAME, DLAMCH, DLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DGEMM, DGEQP3, DGERQF, DLACPY, DLAPMT, DLASCL,
-     $                   DLASET, DORGQR, DORGRQ, DORCSD2BY1, XERBLA
+      EXTERNAL           DGEMM, DGEQP3, DLACPY, DLAPMT, DLASCL,
+     $                   DLASET, DORGQR, DORCSD2BY1, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX, MIN
+      INTRINSIC          COS, MAX, MIN, SIN, SQRT
 *     ..
 *     .. Executable Statements ..
 *
@@ -357,25 +374,10 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
       WANTU1 = LSAME( JOBU1, 'Y' )
       WANTU2 = LSAME( JOBU2, 'Y' )
-      WANTQT = LSAME( JOBQT, 'Y' )
+      WANTX = LSAME( JOBX, 'Y' )
       LQUERY = ( LWORK.EQ.-1 )
       LWKOPT = 1
 *
-*     Initialize variables
-*
-      L = 0
-      LMAX = MIN( M + P, N )
-      Z = ( M + P ) * N
-      IF ( LQUERY ) THEN
-         G = 0
-      ELSE
-         G = WORK( 1 )
-      END IF
-      LDG = M + P
-*     Computing 0.0 / 0.0 directly causes compiler errors
-      NAN = 1.0D0
-      NAN = 0.0 / (NAN - 1.0D0)
-*
 *     Test the input arguments
 *
       INFO = 0
@@ -383,7 +385,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          INFO = -1
       ELSE IF( .NOT.( WANTU2 .OR. LSAME( JOBU2, 'N' ) ) ) THEN
          INFO = -2
-      ELSE IF( .NOT.( WANTQT .OR. LSAME( JOBQT, 'N' ) ) ) THEN
+      ELSE IF( .NOT.( WANTX .OR. LSAME( JOBX, 'N' ) ) ) THEN
          INFO = -3
       ELSE IF( M.LT.1 ) THEN
          INFO = -4
@@ -392,42 +394,85 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       ELSE IF( P.LT.1 ) THEN
          INFO = -6
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
-         INFO = -10
+         INFO = -9
       ELSE IF( LDB.LT.MAX( 1, P ) ) THEN
-         INFO = -12
+         INFO = -11
       ELSE IF( LDU1.LT.1 .OR. ( WANTU1 .AND. LDU1.LT.M ) ) THEN
          INFO = -15
       ELSE IF( LDU2.LT.1 .OR. ( WANTU2 .AND. LDU2.LT.P ) ) THEN
          INFO = -17
-      ELSE IF( LDQT.LT.1 .OR. ( WANTQT .AND. LDQT.LT.N ) ) THEN
-         INFO = -19
       ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
-         INFO = -23
+         INFO = -19
       END IF
 *
+*     Make sure A is the matrix smaller in norm
+*
+      IF( INFO.EQ.0 ) THEN
+         NORMA = DLANGE( 'F', M, N, A, LDA, WORK )
+         NORMB = DLANGE( 'F', P, N, B, LDB, WORK )
+*
+         IF( NORMA.GT.SQRT( 2.0E0 ) * NORMB ) THEN
+            CALL DGGQRCS( JOBU2, JOBU1, JOBX, P, N, M, L,
+     $                    SWAPPED,
+     $                    B, LDB, A, LDA,
+     $                    BETA, ALPHA,
+     $                    U2, LDU2, U1, LDU1,
+     $                    WORK, LWORK, IWORK, INFO )
+            SWAPPED = .TRUE.
+            RETURN
+         ENDIF
+*
+*     Past this point, we know that
+*     * NORMA <= NORMB (almost)
+*     * W >= 1
+*     * ALPHA will contain cosine values at the end
+*     * BETA will contain sine values at the end
+*
+      END IF
+*
+*     Initialize variables
+*
+*     Computing 0.0 / 0.0 directly causes compiler errors
+      NAN = 1.0D0
+      NAN = 0.0 / (NAN - 1.0D0)
+*
+      SWAPPED = .FALSE.
+      L = 0
+      LMAX = MIN( M + P, N )
+      Z = ( M + P ) * N
+      G = WORK( 1 )
+      LDG = M + P
+      VT = 0
+      LDVT = N
+      THETA = NAN
+      IOTA = NAN
+      W = NAN
+*
 *     Compute workspace
 *
       IF( INFO.EQ.0 ) THEN
-         CALL DGEQP3( M+P, N, G, LDG, IWORK, THETA, WORK, -1, INFO )
+         LWKOPT = 0
+*
+         CALL DGEQP3( M+P, N, G, LDG, IWORK, ALPHA, WORK, -1, INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
          LWKOPT = INT( WORK( 1 ) )
-
-         CALL DORGQR( M + P, LMAX, LMAX, G, LDG, THETA, WORK, -1, INFO )
+*
+         CALL DORGQR( M + P, LMAX, LMAX, G, LDG, ALPHA, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
-
-         CALL DORCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, LMAX,
+*
+         CALL DORCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, LMAX,
      $                    G, LDG, G, LDG,
-     $                    THETA, U2, LDU2, U1, LDU1, QT, LDQT,
+     $                    ALPHA,
+     $                    U1, LDU1, U2, LDU2, VT, LDVT,
      $                    WORK, -1, IWORK, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
-         LWKOPT = Z + LWKOPT
-
-*        DGERQF stores LMAX scalar factors for the elementary reflectors
-         CALL DGERQF( LMAX, N, QT, LDQT, WORK, WORK, -1, INFO )
-         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
-
-         CALL DORGRQ( N, N, LMAX, QT, LDQT, WORK, WORK, -1, INFO )
-         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
-
+*        The matrix (A, B) must be stored sequentially for DORGQR
+         LWKOPT = LWKOPT + Z
+*        2-by-1 CSD matrix V1 must be stored
+         IF( WANTX ) THEN
+            LWKOPT = LWKOPT + LDVT*N
+         END IF
+*
          WORK( 1 ) = DBLE( LWKOPT )
       END IF
 *
@@ -438,28 +483,29 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       IF( LQUERY ) THEN
          RETURN
       ENDIF
+*     Finish initialization
+      IF( WANTX ) THEN
+         VT = WORK( Z + 1 )
+      END IF
 *
-*     Scale matrix B such that norm(A) \approx norm(B)
-*
-      NORMA = DLANGE( 'F', M, N, A, LDA, WORK )
-      NORMB = DLANGE( 'F', P, N, B, LDB, WORK )
+*     Scale matrix A such that norm(A) \approx norm(B)
 *
-      IF ( NORMB.EQ.0 ) THEN
+      IF( NORMA.EQ.0.0D0 ) THEN
          W = 1.0D0
       ELSE
          BASE = DLAMCH( 'B' )
-         W = BASE ** INT( LOG( NORMA / NORMB ) / LOG( BASE ) )
+         W = BASE ** INT( LOG( NORMB / NORMA ) / LOG( BASE ) )
 *
-         CALL DLASCL( 'G', -1, -1, 1.0D0, W, P, N, B, LDB, INFO )
+         CALL DLASCL( 'G', -1, -1, 1.0D0, W, M, N, A, LDA, INFO )
          IF ( INFO.NE.0 ) THEN
             RETURN
          END IF
       END IF
 *
-*     Copy matrices A, B into the (M+P) x n matrix G
+*     Copy matrices A, B into the (M+P) x N matrix G
 *
-      CALL DLACPY( 'A', M, N, A, LDA, G( P + 1, 1 ), LDG )
-      CALL DLACPY( 'A', P, N, B, LDB, G( 1, 1 ), LDG )
+      CALL DLACPY( 'A', M, N, A, LDA, G( 1, 1 ), LDG )
+      CALL DLACPY( 'A', P, N, B, LDB, G( M + 1, 1 ), LDG )
 *
 *     DEBUG
 *
@@ -468,14 +514,14 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Compute the Frobenius norm of matrix G
 *
-      GNORM = DLANGE( 'F', M + P, N, G, LDG, WORK( Z + 1 ) )
+      NORMG = NORMB * SQRT( 1.0D0 + ( ( W * NORMA ) / NORMB )**2 )
 *
 *     Get machine precision and set up threshold for determining
 *     the effective numerical rank of the matrix G.
 *
       ULP = DLAMCH( 'Precision' )
       UNFL = DLAMCH( 'Safe Minimum' )
-      TOL = MAX( M + P, N ) * MAX( GNORM, UNFL ) * ULP
+      TOL = MAX( M + P, N ) * MAX( NORMG, UNFL ) * ULP
 *
 *     IWORK stores the column permutations computed by DGEQP3.
 *     Columns J where IWORK( J ) is non-zero are permuted to the front
@@ -485,7 +531,7 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Compute the QR factorization with column pivoting GΠ = Q1 R1
 *
-      CALL DGEQP3( M + P, N, G, LDG, IWORK, THETA,
+      CALL DGEQP3( M + P, N, G, LDG, IWORK, ALPHA,
      $             WORK( Z + 1 ), LWORK - Z, INFO )
       IF( INFO.NE.0 ) THEN
          RETURN
@@ -493,12 +539,12 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Determine the rank of G
 *
-      DO 20 I = 1, MIN( M + P, N )
+      DO I = 1, MIN( M + P, N )
          IF( ABS( G( I, I ) ).LE.TOL ) THEN
             EXIT
          END IF
          L = L + 1
-   20 CONTINUE
+      END DO
 *
 *     Handle rank=0 case
 *
@@ -509,9 +555,6 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          IF( WANTU2 ) THEN
             CALL DLASET( 'A', P, P, 0.0D0, 1.0D0, U2, LDU2 )
          END IF
-         IF( WANTQT ) THEN
-            CALL DLASET( 'A', N, N, 0.0D0, 1.0D0, QT, LDQT )
-         END IF
 *
          WORK( 1 ) = DBLE( LWKOPT )
          RETURN
@@ -519,20 +562,22 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Copy R1( 1:L, : ) into A, B and set lower triangular part to zero
 *
-      IF( L.LE.M ) THEN
-          CALL DLACPY( 'U', L, N, G, LDG, A, LDA )
-          CALL DLASET( 'L', L - 1, N, 0.0D0, 0.0D0, A( 2, 1 ), LDA )
-      ELSE
-          CALL DLACPY( 'U', M, N, G, LDG, A, LDA )
-          CALL DLACPY( 'U', L - M, N - M, G( M+1, M+1 ), LDG, B, LDB )
+      IF( WANTX ) THEN
+         IF( L.LE.M ) THEN
+             CALL DLACPY( 'U', L, N, G, LDG, A, LDA )
+             CALL DLASET( 'L', L - 1, N, 0.0D0, 0.0D0, A( 2, 1 ), LDA )
+         ELSE
+             CALL DLACPY( 'U', M, N, G, LDG, A, LDA )
+             CALL DLACPY( 'U', L - M, N - M, G( M+1,M+1 ), LDG, B, LDB )
 *
-          CALL DLASET( 'L', M - 1, N, 0.0D0, 0.0D0, A( 2, 1 ), LDA )
-          CALL DLASET( 'L', L-M-1, N, 0.0D0, 0.0D0, B( 2, 1 ), LDB )
+             CALL DLASET( 'L', M - 1, N, 0.0D0, 0.0D0, A( 2, 1 ), LDA )
+             CALL DLASET( 'L', L-M-1, N, 0.0D0, 0.0D0, B( 2, 1 ), LDB )
+         END IF
       END IF
 *
 *     Explicitly form Q1 so that we can compute the CS decomposition
 *
-      CALL DORGQR( M + P, L, L, G, LDG, THETA,
+      CALL DORGQR( M + P, L, L, G, LDG, ALPHA,
      $             WORK( Z + 1 ), LWORK - Z, INFO )
       IF ( INFO.NE.0 ) THEN
          RETURN
@@ -540,84 +585,92 @@ SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     DEBUG
 *
-      THETA(1:N) = NAN
+      ALPHA( 1:N ) = NAN
+      BETA( 1:N ) = NAN
 *
 *     Compute the CS decomposition of Q1( :, 1:L )
 *
-      CALL DORCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, L,
-     $                 G( 1, 1 ), LDG, G( P + 1, 1 ), LDG, THETA,
-     $                 U2, LDU2, U1, LDU1, QT, LDQT,
-     $                 WORK( Z + 1 ), LWORK - Z, IWORK( N + 1 ), INFO )
+      K = MIN( M, P, L, M + P - L )
+      K1 = MAX( L - P, 0 )
+      CALL DORCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, L,
+     $                 G( 1, 1 ), LDG, G( M + 1, 1 ), LDG,
+     $                 ALPHA,
+     $                 U1, LDU1, U2, LDU2, VT, LDVT,
+     $                 WORK( Z + LDVT*N + 1 ), LWORK - Z - LDVT*N,
+     $                 IWORK( N + 1 ), INFO )
       IF( INFO.NE.0 ) THEN
          RETURN
       END IF
 *
 *     DEBUG
 *
-      WORK(1:LWORK) = NAN
-*
-*     Copy V^T from QT to G
-*
-      CALL DLACPY( 'A', L, L, QT, LDQT, G, LDG )
-*
-*     DEBUG
-*
-      CALL DLASET( 'A', N, N, NAN, NAN, QT, LDQT )
-*
-*     Compute V^T R1( 1:L, : ) in the last L rows of QT
-*
-      IF ( L.LE.M ) THEN
-         CALL DGEMM( 'N', 'N', L, N, L, 1.0D0, G, LDG,
-     $               A, LDA, 0.0D0, QT( N-L+1, 1 ), LDQT )
-      ELSE
-         CALL DGEMM( 'N', 'N', L, N, M, 1.0D0, G( 1, 1 ), LDG,
-     $               A, LDA, 0.0D0, QT( N-L+1, 1 ), LDQT )
-         CALL DGEMM( 'N', 'N', L, N - M, L - M, 1.0D0,
-     $               G( 1, M + 1 ), LDG, B, LDB,
-     $               1.0D0, QT( N-L+1, M+1 ), LDQT )
-      END IF
-*
-*     DEBUG
-*
-      CALL DLASET( 'A', M, N, NAN, NAN, A, LDA )
-      CALL DLASET( 'A', P, N, NAN, NAN, B, LDB )
-      WORK(1:LWORK) = NAN
-*
-*     Compute the RQ decomposition of V^T R1( 1:L, : )
-*
-      CALL DGERQF( L, N, QT( N-L+1, 1 ), LDQT, WORK,
-     $             WORK( L + 1 ), LWORK - L, INFO )
-      IF ( INFO.NE.0 ) THEN
-         RETURN
-      END IF
-*
-*     Copy matrix L from QT( N-L+1:N, N-L+1:N ) to A, B
-*
-      IF ( L.LE.M ) THEN
-         CALL DLACPY( 'U', L, L, QT( N-L+1, N-L+1 ), LDQT, A, LDA )
-      ELSE
-         CALL DLACPY( 'U', M,     L, QT( N-L+1, N-L+1 ), LDQT, A, LDA )
-         CALL DLACPY( 'U', L - M, L - M, QT( N-L+M+1, N-L+M+1 ), LDQT,
-     $                B, LDB )
+      WORK( 1:LDG*N ) = NAN
+*
+*     Compute X = V^T R1( 1:L, : ) and adjust for matrix scaling
+*
+      IF( WANTX ) THEN
+         LDX = L
+         IF ( L.LE.M ) THEN
+            CALL DGEMM( 'N', 'N', L, N, L,
+     $                  1.0D0, VT, LDVT, A, LDA,
+     $                  0.0D0, WORK( 2 ), LDX )
+         ELSE
+            CALL DGEMM( 'N', 'N', L, N, M,
+     $                  1.0D0, VT( 1, 1 ), LDVT, A, LDA,
+     $                  0.0D0, WORK( 2 ), LDX )
+            CALL DGEMM( 'N', 'N', L, N - M, L - M,
+     $                  1.0D0, VT( 1, M + 1 ), LDVT, B, LDB,
+     $                  1.0D0, WORK( L*M + 2 ), LDX )
+         END IF
+*        Revert column permutation Π by permuting the columns of X
+         CALL DLAPMT( .FALSE., L, N, WORK( 2 ), LDX, IWORK )
       END IF
 *
-*     DEBUG
-*
-      CALL DLASET( 'U', L, L, NAN, NAN, QT( 1, N-L+1 ), LDQT )
-      WORK( L+1:LWORK ) = NAN
-*
-*     Explicitly form Q^T
-*
-      IF( WANTQT ) THEN
-         CALL DORGRQ( N, N, L, QT, LDQT, WORK,
-     $                WORK( L + 1 ), LWORK - L, INFO )
-         IF ( INFO.NE.0 ) THEN
-            RETURN
+*     Adjust generalized singular values for matrix scaling
+*     Compute sine, cosine values
+*     Prepare row scaling of X
+*
+      DO I = 1, K
+         THETA = ALPHA( I )
+*        Do not adjust singular value if THETA is greater
+*        than pi/2 (infinite singular values won't change)
+         IF( COS( THETA ).LE.0.0D0 ) THEN
+            ALPHA( I ) = 0.0D0
+            BETA( I ) = 1.0D0
+            IF( WANTX ) THEN
+               WORK( Z + I + 1 ) = 1.0D0
+            END IF
+         ELSE
+*           iota comes in the greek alphabet after theta
+            IOTA = ATAN( W * TAN( THETA ) )
+*           ensure sine, cosine divisor is far away from zero
+*           w is a power of two and will cause no trouble
+            IF( SIN( IOTA ) .GE. COS( IOTA ) ) THEN
+               ALPHA( I ) =  ( SIN( IOTA ) / TAN( THETA ) ) / W
+               BETA( I ) = SIN( IOTA )
+               IF( WANTX ) THEN
+                  WORK( Z + I + 1 ) = SIN( THETA ) / SIN( IOTA )
+               END IF
+            ELSE
+               ALPHA( I ) = COS( IOTA )
+               BETA( I ) = SIN( IOTA )
+               IF( WANTX ) THEN
+                  WORK( Z + I + 1 ) = COS( THETA ) / COS( IOTA ) / W
+               END IF
+            END IF
          END IF
-*
-*     Revert column permutation Π by permuting the rows of Q^T
-*
-         CALL DLAPMT( .FALSE., N, N, QT, LDQT, IWORK )
+      END DO
+*     Adjust rows of X for matrix scaling
+      IF( WANTX ) THEN
+         DO J = 0, N-1
+            DO I = 1, K1
+               WORK( LDX*J + I + 1 ) = WORK( LDX*J + I + 1 ) / W
+            END DO
+            DO I = 1, K
+               WORK( LDX*J + I + K1 + 1 ) =
+     $         WORK( LDX*J + I + K1 + 1 ) * WORK( Z + I + 1 )
+            END DO
+         END DO
       END IF
 *
       WORK( 1 ) = DBLE( LWKOPT )

From ea6b97557454b3f9024c57c34c8237fcc44e8bde Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Thu, 7 May 2020 21:15:42 +0200
Subject: [PATCH 071/101] DGGQRCS: fix typo

---
 SRC/dggqrcs.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 69662c7943..e7c30097fd 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -411,7 +411,7 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          NORMA = DLANGE( 'F', M, N, A, LDA, WORK )
          NORMB = DLANGE( 'F', P, N, B, LDB, WORK )
 *
-         IF( NORMA.GT.SQRT( 2.0E0 ) * NORMB ) THEN
+         IF( NORMA.GT.SQRT( 2.0D0 ) * NORMB ) THEN
             CALL DGGQRCS( JOBU2, JOBU1, JOBX, P, N, M, L,
      $                    SWAPPED,
      $                    B, LDB, A, LDA,

From dbae8056dedd27d84347f41f5a040bbbd34ef121 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Thu, 7 May 2020 21:16:33 +0200
Subject: [PATCH 072/101] CGGQRCS: update implementation

Port the most recent SGGQRCS code to single-precision complex.
---
 SRC/cggqrcs.f | 574 +++++++++++++++++++++++++++-----------------------
 1 file changed, 309 insertions(+), 265 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index afc43eb0bf..2c17a9fcf0 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -18,22 +18,22 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
+*       SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, SWAPPED,
 *                           A, LDA, B, LDB,
-*                           THETA, U1, LDU1, U2, LDU2, QT, LDQT,
+*                           ALPHA, BETA,
+*                           U1, LDU1, U2, LDU2
 *                           WORK, LWORK, RWORK, LRWORK, IWORK, INFO )
 *
 *       .. Scalar Arguments ..
-*       CHARACTER          JOBU1, JOB2, JOBQT
-*       INTEGER            INFO, LDA, LDB, LDU1, LDU2, LDQT,
-*      $                   M, N, P, L, LWORK, LRWORK
-*       REAL               W
+*       CHARACTER          JOBU1, JOB2, JOBX
+*       INTEGER            INFO, LDA, LDB, LDU1, LDU2, M, N, P, L,
+*                          LWORK, LRWORK
 *       ..
 *       .. Array Arguments ..
 *       INTEGER            IWORK( * )
-*       REAL               THETA( * ), RWORK( * )
+*       REAL               ALPHA( N ), BETA( N ), RWORK( * )
 *       COMPLEX            A( LDA, * ), B( LDB, * ),
-*      $                   U1( LDU1, * ), U2( LDU2, * ), QT( LDQ, * ),
+*      $                   U1( LDU1, * ), U2( LDU2, * ),
 *      $                   WORK( * )
 *       ..
 *
@@ -46,60 +46,78 @@
 *> CGGQRCS computes the generalized singular value decomposition (GSVD)
 *> of an M-by-N complex matrix A and P-by-N complex matrix B:
 *>
-*>       U1**T*A*Q = D1*( 0 R ),    U2**T*B*Q = D2*( 0 R )
+*>       A = U1 * D1 * X,           B = U2 * D2 * X
 *>
-*> where U1, U2, and Q are orthogonal matrices. CGGQRCS uses the QR
+*> where U1 and U2 are unitary matrices. CGGQRCS uses the QR
 *> factorization with column pivoting and the 2-by-1 CS decomposition to
 *> compute the GSVD.
 *>
-*> Let L be the effective numerical rank of the matrix (A**T,B**T)**T,
-*> then R is an L-by-L nonsingular upper triangular matrix, D1 and
-*> D2 are M-by-L and P-by-L "diagonal" matrices and of the
-*> following structures, respectively:
+*> Let L be the effective numerical rank of the matrix (A**H,B**H)**H,
+*> then X is a L-by-N nonsingular matrix, D1 and D2 are M-by-L and
+*> P-by-L "diagonal" matrices. If SWAPPED is false, then D1 and D2 are
+*> of the of the following structures, respectively:
 *>
-*>                        K   K1
-*>        D1 =     (  0   0   0 )
-*>              K  (  0   S   0 )
-*>              K1 (  0   0   I )
+*>                 K1  K
+*>            K1 [ I   0   0 ]
+*>       D1 = K  [ 0   C   0 ]
+*>               [ 0   0   0 ]
 *>
-*>                    K2  K
-*>        D2 =  K2 (  I   0   0 )
-*>              K  (  0   C   0 )
-*>                 (  0   0   0 )
-*>
-*>                 N-L  L
-*>   ( 0 R ) = L (  0   R )
+*>                     K   K2
+*>               [ 0   0   0 ]
+*>       D2 = K  [ 0   S   0 ]
+*>            K2 [ 0   0   I ]
 *>
 *> where
 *>
 *>   K  = MIN(M, P, L, M + P - L),
 *>   K1 = MAX(L - P, 0),
 *>   K2 = MAX(L - M, 0),
-*>   C  = diag( COS(THETA(1)), ..., COS(THETA(K)) ),
-*>   S  = diag( SIN(THETA(1)), ..., SIN(THETA(K)) ), and
+*>   C  = diag( ALPHA(1), ..., ALPHA(K) ),
+*>   S  = diag( BETA(1), ..., BETA(K) ), and
+*>   C^2 + S^2 = I.
+*>
+*> If SWAPPED is true, then D1 and D2 are of the of the following
+*> structures, respectively:
+*>
+*>                     K   K1
+*>               [ 0   0   0 ]
+*>       D1 = K  [ 0   S   0 ]
+*>            K1 [ 0   0   I ]
+*>
+*>                 K2  K
+*>            K2 [ I   0   0 ]
+*>       D2 = K  [ 0   C   0 ]
+*>               [ 0   0   0 ]
+*>
+*> where
+*>
+*>   S  = diag( ALPHA(1), ..., ALPHA(K) ),
+*>   C  = diag( BETA(1), ..., BETA(K) ), and
 *>   C^2 + S^2 = I.
 *>
-*> The routine computes C, S, R, and optionally the orthogonal
-*> transformation matrices U, V and Q. If L <= M, then R is stored in
-*> A(1:L, 1:L) on exit. Otherwise, the first M rows of R are stored in
-*> A(:, 1:L) and R( M+1:, M+1: ) is stored in B(1:L-M, 1:L-M). In both
-*> cases, only the upper triangular part is stored.
+*> The routine computes C, S and optionally the matrices U1, U2, and X.
+*> On exit, X is stored in WORK( 2:L*N+1 ).
 *>
-*> In particular, if B is an N-by-N nonsingular matrix, then the GSVD of
-*> A and B implicitly gives the SVD of A*inv(B):
-*>                      A*inv(B) = U1*(D1*inv(D2))*U2**T.
-*> If (A**T,B**T)**T  has orthonormal columns, then the GSVD of A and B
+*> If B is an N-by-N nonsingular matrix, then the GSVD of the matrix
+*> pair (A, B) implicitly gives the SVD of A*inv(B):
+*>
+*>       A*inv(B) = U1*(D1*inv(D2))*U2**H.
+*>
+*> If (A**H,B**H)**H  has orthonormal columns, then the GSVD of A and B
 *> is also equal to the CS decomposition of A and B. Furthermore, the
 *> GSVD can be used to derive the solution of the eigenvalue problem:
-*>                      A**T*A x = lambda * B**T*B x.
+*>
+*>       A**H*A x = lambda * B**H*B x.
+*>
 *> In some literature, the GSVD of A and B is presented in the form
-*>                  U1**T*A*X = ( 0 D1 ),   U2**T*B*X = ( 0 D2 )
-*> where U1 and U2 are orthogonal and X is nonsingular, D1 and D2 are
-*> ``diagonal''.  The former GSVD form can be converted to the latter
-*> form by taking the nonsingular matrix X as
 *>
-*>                      X = Q*( I   0    )
-*>                            ( 0 inv(R) ).
+*>       A = U1*D1*( 0 R )*Q**H,    B = U2*D2*( 0 R )*Q**H
+*>
+*> where U1, U2, and Q are unitary matrices. This latter GSVD form is
+*> computed directly by DGGSVD3. It is possible to convert between the
+*> two representations by calculating the RQ decomposition of X but this
+*> is not recommended for reasons of numerical stability.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -119,11 +137,11 @@
 *>          = 'N':  U2 is not computed.
 *> \endverbatim
 *>
-*> \param[in] JOBQT
+*> \param[in] JOBX
 *> \verbatim
-*>          JOBQT is CHARACTER*1
-*>          = 'Y':  Orthogonal matrix Q is computed;
-*>          = 'N':  Q is not computed.
+*>          JOBX is CHARACTER*1
+*>          = 'Y':  Matrix X is computed;
+*>          = 'N':  X is not computed.
 *> \endverbatim
 *>
 *> \param[in] M
@@ -144,28 +162,25 @@
 *>          The number of rows of the matrix B.  P >= 1.
 *> \endverbatim
 *>
-*> \param[out] W
-*> \verbatim
-*>          W is REAL
-*>
-*>          On exit, W is a radix power chosen such that the Frobenius
-*>          norm of A and W*B are within sqrt(radix) and 1/sqrt(radix)
-*>          of each other.
-*> \endverbatim
-*>
 *> \param[out] L
 *> \verbatim
 *>          L is INTEGER
 *>          On exit, the effective numerical rank of the matrix
-*>          (A**T, B**T)**T.
+*>          (A**H, B**H)**H.
+*> \endverbatim
+*>
+*> \param[out] SWAPPED
+*> \verbatim
+*>          L is LOGICAL
+*>          On exit, SWAPPED is true if CGGQRCS swapped the input
+*>          matrices A, B and computed the GSVD of (B, A); false
+*>          otherwise.
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX array, dimension (LDA,N)
 *>          On entry, the M-by-N matrix A.
-*>          On exit, A contains the triangular matrix R or the first M
-*>          rows of R, respectively. See Purpose for details.
 *> \endverbatim
 *>
 *> \param[in] LDA
@@ -178,8 +193,6 @@
 *> \verbatim
 *>          B is COMPLEX array, dimension (LDB,N)
 *>          On entry, the P-by-N matrix B.
-*>          On exit, if L > M, then B contains the last L - M rows of
-*>          the triangular matrix R. See Purpose for details.
 *> \endverbatim
 *>
 *> \param[in] LDB
@@ -188,18 +201,23 @@
 *>          The leading dimension of the array B. LDB >= max(1,P).
 *> \endverbatim
 *>
-*> \param[out] THETA
+*> \param[out] ALPHA
 *> \verbatim
-*>          THETA is REAL array, dimension (N)
+*>          ALPHA is REAL array, dimension (N)
+*> \endverbatim
+*>
+*> \param[out] BETA
+*> \verbatim
+*>          BETA is REAL array, dimension (N)
 *>
-*>          On exit, THETA contains K = MIN(M, P, L, M + P - L) values
-*>          in radians in ascending order.
+*>          On exit, ALPHA and BETA contain the K generalized singular
+*>          value pairs of A and B.
 *> \endverbatim
 *>
 *> \param[out] U1
 *> \verbatim
 *>          U1 is COMPLEX array, dimension (LDU1,M)
-*>          If JOBU1 = 'Y', U1 contains the M-by-M orthogonal matrix U1.
+*>          If JOBU1 = 'Y', U1 contains the M-by-M unitary matrix U1.
 *>          If JOBU1 = 'N', U1 is not referenced.
 *> \endverbatim
 *>
@@ -213,7 +231,7 @@
 *> \param[out] U2
 *> \verbatim
 *>          U2 is COMPLEX array, dimension (LDU2,P)
-*>          If JOBU2 = 'Y', U2 contains the P-by-P orthogonal matrix U2.
+*>          If JOBU2 = 'Y', U2 contains the P-by-P unitary matrix U2.
 *>          If JOBU2 = 'N', U2 is not referenced.
 *> \endverbatim
 *>
@@ -224,20 +242,6 @@
 *>          JOBU2 = 'Y'; LDU2 >= 1 otherwise.
 *> \endverbatim
 *>
-*> \param[out] QT
-*> \verbatim
-*>          QT is COMPLEX array, dimension (LDQT,N)
-*>          If JOBQT = 'Y', QT contains the N-by-N orthogonal matrix
-*>          Q**T.
-*> \endverbatim
-*>
-*> \param[in] LDQT
-*> \verbatim
-*>          LDQT is INTEGER
-*>          The leading dimension of the array QT. LDQT >= max(1,N) if
-*>          JOBQT = 'Y'; LDQT >= 1 otherwise.
-*> \endverbatim
-*>
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
@@ -288,11 +292,19 @@
 *> \par Internal Parameters:
 *  =========================
 *>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL
+*>          W is a radix power chosen such that the Frobenius norm of A
+*>          and W*B are with SQRT(RADIX) and 1/SQRT(RADIX) of each
+*>          other.
+*> \endverbatim
+*>
 *> \verbatim
 *>  TOL     REAL
-*>          Let G = (A**T,B**T)**T. TOL is the threshold to determine
+*>          Let G = (A**H,B**H)**H. TOL is the threshold to determine
 *>          the effective rank of G. Generally, it is set to
-*>                   TOL = MAX(M,P,N) * norm(G) * MACHEPS,
+*>                   TOL = MAX( M + P, N ) * norm(G) * MACHEPS,
 *>          where norm(G) is the Frobenius norm of G.
 *>          The size of TOL may affect the size of backward error of the
 *>          decomposition.
@@ -303,9 +315,9 @@
 *
 *> \author Christoph Conrads (https://christoph-conrads.name)
 *
-*> \date October 2019
+*> \date October 2019, May 2020
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup realGEsing
 *
 *> \par Contributors:
 *  ==================
@@ -316,60 +328,69 @@
 *> \par Further Details:
 *  =====================
 *>
-*>  CGGQRCS should be significantly faster than CGGSVD and CGGSVD3 for
-*>  large matrices because the matrices A and B are reduced to a pair of
+*>  CGGQRCS should be significantly faster than DGGSVD3 for large
+*>  matrices because the matrices A and B are reduced to a pair of
 *>  well-conditioned bidiagonal matrices instead of pairs of upper
 *>  triangular matrices. On the downside, CGGQRCS requires a much larger
-*>  workspace whose dimension must be queried at run-time.
+*>  workspace whose dimension must be queried at run-time. CGGQRCS also
+*>  offers no guarantees which of the two possible diagonal matrices
+*>  is used for the matrix factorization.
 *>
 *  =====================================================================
-      SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
-     $                    A, LDA, B, LDB,
-     $                    THETA, U1, LDU1, U2, LDU2, QT, LDQT,
-     $                    WORK, LWORK, RWORK, LRWORK, IWORK, INFO )
-*
-*  -- LAPACK driver routine (version 3.X.0) --
+      RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
+     $                              SWAPPED,
+     $                              A, LDA, B, LDB,
+     $                              ALPHA, BETA,
+     $                              U1, LDU1, U2, LDU2,
+     $                              WORK, LWORK, RWORK, LRWORK, IWORK,
+     $                              INFO )
+*
+*  -- LAPACK driver routine (version 3.7.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     <DATE>
+*     September 2016
 *
       IMPLICIT NONE
 *     .. Scalar Arguments ..
-      CHARACTER          JOBU1, JOBU2, JOBQT
-      INTEGER            INFO, LDA, LDB, LDU1, LDU2, LDQT,
-     $                   L, M, N, P, LWORK, LRWORK
-      REAL               W
+      LOGICAL            SWAPPED
+      CHARACTER          JOBU1, JOBU2, JOBX
+      INTEGER            INFO, LDA, LDB, LDU1, LDU2, L, M, N, P, LWORK,
+     $                   LRWKOPT, LRWORK, LRWORK2BY1
 *     ..
 *     .. Array Arguments ..
       INTEGER            IWORK( * )
-      REAL               THETA( * ), RWORK( * )
+      REAL               ALPHA( N ), BETA( N ), RWORK( * )
       COMPLEX            A( LDA, * ), B( LDB, * ),
-     $                   U1( LDU1, * ), U2( LDU2, * ), QT( LDQT, * ),
+     $                   U1( LDU1, * ), U2( LDU2, * ),
      $                   WORK( * )
 *     ..
 *
 *  =====================================================================
 *
+*     .. Parameters ..
+      COMPLEX            CONE, CZERO
+      PARAMETER          ( CONE = ( 1.0E0, 0.0E0 ),
+     $                   CZERO = ( 0.0E0, 0.0E0 ) )
 *     .. Local Scalars ..
-      LOGICAL            WANTU1, WANTU2, WANTQT, LQUERY
-      INTEGER            I, J, LMAX, Z, LDG, LWKOPT, LRWKOPT,
-     $                   LRWORK2BY1
-      REAL               GNORM, TOL, ULP, UNFL, NORMA, NORMB, BASE, NAN
-      COMPLEX            ZERO, ONE, CNAN
+      LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
+      INTEGER            I, J, K, K1, LMAX, Z, LDG, LDX, LDVT, LWKOPT
+      REAL               BASE, NAN, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
+     $                   THETA, IOTA, W
+      COMPLEX            CNAN
 *     .. Local Arrays ..
-      COMPLEX            G( M + P, N )
+      COMPLEX            G( M + P, N ), VT( N, N )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, CLANGE
+      COMPLEX            SLAMCH, CLANGE
       EXTERNAL           LSAME, SLAMCH, CLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CGEMM, CGEQP3, CGERQF, CLACPY, CLAPMT, CLASCL,
-     $                   CLASET, CUNGQR, CUNGRQ, CUNCSD2BY1, XERBLA
+      EXTERNAL           CGEMM, CGEQP3, CLACPY, CLAPMT, CLASCL,
+     $                   CLASET, CUNGQR, CUNCSD2BY1, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX, MIN
+      INTRINSIC          COS, MAX, MIN, SIN, SQRT
 *     ..
 *     .. Executable Statements ..
 *
@@ -377,28 +398,9 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
       WANTU1 = LSAME( JOBU1, 'Y' )
       WANTU2 = LSAME( JOBU2, 'Y' )
-      WANTQT = LSAME( JOBQT, 'Y' )
-      LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 )
+      WANTX = LSAME( JOBX, 'Y' )
+      LQUERY = LWORK.EQ.-1 .OR. LRWORK.EQ.-1
       LWKOPT = 1
-      LRWKOPT = 2*N
-*
-*     Initialize variables
-*
-      L = 0
-      LMAX = MIN( M + P, N )
-      Z = ( M + P ) * N
-      IF ( LQUERY ) THEN
-         G = 0
-      ELSE
-         G = WORK( 1 )
-      END IF
-      LDG = M + P
-      ZERO = (0.0E0, 0.0E0)
-      ONE = (1.0E0, 0.0E0)
-*     Computing 0.0 / 0.0 directly causes compiler errors
-      NAN = 1.0E0
-      NAN = 0.0 / (NAN - 1.0E0)
-      CNAN = CMPLX(NAN,NAN)
 *
 *     Test the input arguments
 *
@@ -407,7 +409,7 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          INFO = -1
       ELSE IF( .NOT.( WANTU2 .OR. LSAME( JOBU2, 'N' ) ) ) THEN
          INFO = -2
-      ELSE IF( .NOT.( WANTQT .OR. LSAME( JOBQT, 'N' ) ) ) THEN
+      ELSE IF( .NOT.( WANTX .OR. LSAME( JOBX, 'N' ) ) ) THEN
          INFO = -3
       ELSE IF( M.LT.1 ) THEN
          INFO = -4
@@ -416,39 +418,86 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       ELSE IF( P.LT.1 ) THEN
          INFO = -6
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
-         INFO = -10
+         INFO = -9
       ELSE IF( LDB.LT.MAX( 1, P ) ) THEN
-         INFO = -12
+         INFO = -11
       ELSE IF( LDU1.LT.1 .OR. ( WANTU1 .AND. LDU1.LT.M ) ) THEN
          INFO = -15
       ELSE IF( LDU2.LT.1 .OR. ( WANTU2 .AND. LDU2.LT.P ) ) THEN
          INFO = -17
-      ELSE IF( LDQT.LT.1 .OR. ( WANTQT .AND. LDQT.LT.N ) ) THEN
-         INFO = -19
       ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
-         INFO = -23
-      ELSE IF( LRWORK.LT.2*N .AND. .NOT.LQUERY ) THEN
-         INFO = -25
+         INFO = -19
       END IF
 *
-*     Compute optimal workspace size
+*     Make sure A is the matrix smaller in norm
 *
       IF( INFO.EQ.0 ) THEN
-*        CGEQP3, CUNGQR read/store LMAX scalar factors
-         CALL CGEQP3( M+P, N, G, LDG, IWORK, WORK,
-     $                WORK, -1, RWORK, INFO )
-         LWKOPT = INT( WORK( 1 ) ) + LMAX
-
+         NORMA = CLANGE( 'F', M, N, A, LDA, RWORK )
+         NORMB = CLANGE( 'F', P, N, B, LDB, RWORK )
+*
+         IF( NORMA.GT.SQRT( 2.0E0 ) * NORMB ) THEN
+            CALL CGGQRCS( JOBU2, JOBU1, JOBX, P, N, M, L,
+     $                    SWAPPED,
+     $                    B, LDB, A, LDA,
+     $                    BETA, ALPHA,
+     $                    U2, LDU2, U1, LDU1,
+     $                    WORK, LWORK, RWORK, LRWORK, IWORK, INFO )
+            SWAPPED = .TRUE.
+            RETURN
+         ENDIF
+*
+*     Past this point, we know that
+*     * NORMA <= NORMB (almost)
+*     * W >= 1
+*     * ALPHA will contain cosine values at the end
+*     * BETA will contain sine values at the end
+*
+      END IF
+*
+*     Initialize variables
+*
+*     Computing 0.0 / 0.0 directly causes compiler errors
+      NAN = 1.0E0
+      NAN = 0.0 / (NAN - 1.0E0)
+      CNAN = CMPLX( NAN, NAN )
+*
+      SWAPPED = .FALSE.
+      L = 0
+      LMAX = MIN( M + P, N )
+      Z = ( M + P ) * N
+      G = WORK( 1 )
+      LDG = M + P
+      VT = 0
+      LDVT = N
+      THETA = NAN
+      IOTA = NAN
+      W = NAN
+*
+*     Compute workspace
+*
+      IF( INFO.EQ.0 ) THEN
+         LWKOPT = 0
+*
+         CALL CGEQP3( M + P, N, G, LDG, IWORK, WORK, WORK, -1, RWORK,
+     $                INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
+         LWKOPT = INT( WORK( 1 ) )
+*
          CALL CUNGQR( M + P, LMAX, LMAX, G, LDG, WORK, WORK, -1, INFO )
-         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
-
-         CALL CUNCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, LMAX,
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
+*
+         CALL CUNCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, LMAX,
      $                    G, LDG, G, LDG,
-     $                    THETA, U2, LDU2, U1, LDU1, QT, LDQT,
+     $                    ALPHA,
+     $                    U1, LDU1, U2, LDU2, VT, LDVT,
      $                    WORK, -1, RWORK, LRWORK, IWORK, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
-*        The matrix (A, B) must be stored sequentially for xUNCSD2BY1
-         LWKOPT = Z + LWKOPT
+*        The matrix (A, B) must be stored sequentially for CUNGQR
+         LWKOPT = LWKOPT + Z
+*        2-by-1 CSD matrix V1 must be stored
+         IF( WANTX ) THEN
+            LWKOPT = LWKOPT + LDVT*N
+         END IF
 *        Adjust CUNCSD2BY1 LRWORK for case with maximum memory
 *        consumption
          LRWORK2BY1 = INT( RWORK(1) )
@@ -459,14 +508,7 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
      $                - 8 * MAX( 0, MIN( M, P, N, M+P-N ) )
      $                + 8 * MIN( M, P, N )
          LRWKOPT = MAX( 2*N, LRWORK2BY1 )
-
-*        CGERQF, CUNGRQ read/store up to LMAX scalar factors
-         CALL CGERQF( LMAX, N, QT, LDQT, WORK, WORK, -1, INFO )
-         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
-
-         CALL CUNGRQ( N, N, LMAX, QT, LDQT, WORK, WORK, -1, INFO )
-         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
-
+*
          WORK( 1 ) = CMPLX( REAL( LWKOPT ), 0.0E0 )
          RWORK( 1 ) = REAL( LRWKOPT )
       END IF
@@ -478,32 +520,29 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       IF( LQUERY ) THEN
          RETURN
       ENDIF
+*     Finish initialization
+      IF( WANTX ) THEN
+         VT = WORK( Z + 1 )
+      END IF
 *
-*     DEBUG
-*
-      IWORK( 1:M+N+P ) = -1
-*
-*     Scale matrix B such that norm(A) \approx norm(B)
-*
-      NORMA = CLANGE( 'F', M, N, A, LDA, RWORK )
-      NORMB = CLANGE( 'F', P, N, B, LDB, RWORK )
+*     Scale matrix A such that norm(A) \approx norm(B)
 *
-      IF ( NORMB.EQ.0 ) THEN
+      IF( NORMA.EQ.0.0E0 ) THEN
          W = 1.0E0
       ELSE
          BASE = SLAMCH( 'B' )
-         W = BASE ** INT( LOG( NORMA / NORMB ) / LOG( BASE ) )
+         W = BASE ** INT( LOG( NORMB / NORMA ) / LOG( BASE ) )
 *
-         CALL CLASCL( 'G', -1, -1, 1.0E0, W, P, N, B, LDB, INFO )
+         CALL CLASCL( 'G', -1, -1, 1.0E0, W, M, N, A, LDA, INFO )
          IF ( INFO.NE.0 ) THEN
             RETURN
          END IF
       END IF
 *
-*     Copy matrices A, B into the (M+P) x n matrix G
+*     Copy matrices A, B into the (M+P) x N matrix G
 *
-      CALL CLACPY( 'A', M, N, A, LDA, G( P + 1, 1 ), LDG )
-      CALL CLACPY( 'A', P, N, B, LDB, G( 1, 1 ), LDG )
+      CALL CLACPY( 'A', M, N, A, LDA, G( 1, 1 ), LDG )
+      CALL CLACPY( 'A', P, N, B, LDB, G( M + 1, 1 ), LDG )
 *
 *     DEBUG
 *
@@ -512,14 +551,14 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Compute the Frobenius norm of matrix G
 *
-      GNORM = CLANGE( 'F', M + P, N, G, LDG, WORK( Z + 1 ) )
+      NORMG = NORMB * SQRT( 1.0E0 + ( ( W * NORMA ) / NORMB )**2 )
 *
 *     Get machine precision and set up threshold for determining
 *     the effective numerical rank of the matrix G.
 *
       ULP = SLAMCH( 'Precision' )
       UNFL = SLAMCH( 'Safe Minimum' )
-      TOL = MAX( M + P, N ) * MAX( GNORM, UNFL ) * ULP
+      TOL = MAX( M + P, N ) * MAX( NORMG, UNFL ) * ULP
 *
 *     IWORK stores the column permutations computed by CGEQP3.
 *     Columns J where IWORK( J ) is non-zero are permuted to the front
@@ -537,42 +576,41 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Determine the rank of G
 *
-      DO 20 I = 1, LMAX
+      DO I = 1, MIN( M + P, N )
          IF( ABS( G( I, I ) ).LE.TOL ) THEN
             EXIT
          END IF
          L = L + 1
-   20 CONTINUE
+      END DO
 *
 *     Handle rank=0 case
 *
       IF( L.EQ.0 ) THEN
          IF( WANTU1 ) THEN
-            CALL CLASET( 'A', M, M, ZERO, ONE, U1, LDU1 )
+            CALL CLASET( 'A', M, M, CZERO, CONE, U1, LDU1 )
          END IF
          IF( WANTU2 ) THEN
-            CALL CLASET( 'A', P, P, ZERO, ONE, U2, LDU2 )
-         END IF
-         IF( WANTQT ) THEN
-            CALL CLASET( 'A', N, N, ZERO, ONE, QT, LDQT )
+            CALL CLASET( 'A', P, P, CZERO, CONE, U2, LDU2 )
          END IF
 *
-         WORK( 1 ) = CMPLX( REAL(LWKOPT), 0.0E0 )
-         RWORK( 1 ) = REAL(LRWKOPT)
+         WORK( 1 ) = CMPLX( REAL ( LWKOPT ), 0.0E0 )
+         RWORK( 1 ) = REAL( LRWKOPT )
          RETURN
       END IF
 *
 *     Copy R1( 1:L, : ) into A, B and set lower triangular part to zero
 *
-      IF( L.LE.M ) THEN
-          CALL CLACPY( 'U', L, N, G, LDG, A, LDA )
-          CALL CLASET( 'L', L - 1, N, ZERO, ZERO, A( 2, 1 ), LDA )
-      ELSE
-          CALL CLACPY( 'U', M, N, G, LDG, A, LDA )
-          CALL CLACPY( 'U', L - M, N - M, G( M+1, M+1 ), LDG, B, LDB )
+      IF( WANTX ) THEN
+         IF( L.LE.M ) THEN
+             CALL CLACPY( 'U', L, N, G, LDG, A, LDA )
+             CALL CLASET( 'L', L - 1, N, CZERO, CZERO, A( 2, 1 ), LDA )
+         ELSE
+             CALL CLACPY( 'U', M, N, G, LDG, A, LDA )
+             CALL CLACPY( 'U', L - M, N - M, G( M+1,M+1 ), LDG, B, LDB )
 *
-          CALL CLASET( 'L', M - 1, N, ZERO, ZERO, A( 2, 1 ), LDA )
-          CALL CLASET( 'L', L-M-1, N, ZERO, ZERO, B( 2, 1 ), LDB )
+             CALL CLASET( 'L', M - 1, N, CZERO, CZERO, A( 2, 1 ), LDA )
+             CALL CLASET( 'L', L-M-1, N, CZERO, CZERO, B( 2, 1 ), LDB )
+         END IF
       END IF
 *
 *     Explicitly form Q1 so that we can compute the CS decomposition
@@ -585,92 +623,98 @@ SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     DEBUG
 *
-      RWORK( 1:LRWORK ) = NAN
-      WORK( Z+1:LWORK ) = CNAN
+      ALPHA( 1:N ) = CNAN
+      BETA( 1:N ) = CNAN
 *
 *     Compute the CS decomposition of Q1( :, 1:L )
 *
-      CALL CUNCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, L,
-     $                 G( 1, 1 ), LDG, G( P + 1, 1 ), LDG, THETA,
-     $                 U2, LDU2, U1, LDU1, QT, LDQT,
-     $                 WORK( Z + 1 ), LWORK - Z,
-     $                 RWORK, LRWORK, IWORK( N + 1 ), INFO )
+      K = MIN( M, P, L, M + P - L )
+      K1 = MAX( L - P, 0 )
+      CALL CUNCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, L,
+     $                 G( 1, 1 ), LDG, G( M + 1, 1 ), LDG,
+     $                 ALPHA,
+     $                 U1, LDU1, U2, LDU2, VT, LDVT,
+     $                 WORK( Z + LDVT*N + 1 ), LWORK - Z - LDVT*N,
+     $                 RWORK, LRWORK,
+     $                 IWORK( N + 1 ), INFO )
       IF( INFO.NE.0 ) THEN
          RETURN
       END IF
 *
 *     DEBUG
 *
-      WORK( 1:LWORK ) = CNAN
-      RWORK( 1:LRWORK ) = NAN
-*
-*     Copy V^T from QT to G
-*
-      CALL CLACPY( 'A', L, L, QT, LDQT, G, LDG )
-*
-*     DEBUG
-*
-      CALL CLASET( 'A', N, N, CNAN, CNAN, QT, LDQT )
-*
-*     Compute V^T R1( 1:L, : ) in the last L rows of QT
-*
-      IF ( L.LE.M ) THEN
-         CALL CGEMM( 'N', 'N', L, N, L, ONE, G, LDG,
-     $               A, LDA, ZERO, QT( N-L+1, 1 ), LDQT )
-      ELSE
-         CALL CGEMM( 'N', 'N', L, N, M, ONE, G( 1, 1 ), LDG,
-     $               A, LDA, ZERO, QT( N-L+1, 1 ), LDQT )
-         CALL CGEMM( 'N', 'N', L, N - M, L - M, ONE,
-     $               G( 1, M + 1 ), LDG, B, LDB,
-     $               ONE, QT( N-L+1, M+1 ), LDQT )
-      END IF
-*
-*     DEBUG
-*
-      CALL CLASET( 'A', M, N, CNAN, CNAN, A, LDA )
-      CALL CLASET( 'A', P, N, CNAN, CNAN, B, LDB )
-      WORK(1:LWORK) = CNAN
-*
-*     Compute the RQ decomposition of V^T R1( 1:L, : )
-*
-      CALL CGERQF( L, N, QT( N-L+1, 1 ), LDQT, WORK( 1 ),
-     $             WORK( L + 1 ), LWORK - L, INFO )
-      IF ( INFO.NE.0 ) THEN
-         RETURN
-      END IF
-*
-*     Copy matrix L from QT( N-L+1:N, N-L+1:N ) to A, B
-*
-      IF ( L.LE.M ) THEN
-         CALL CLACPY( 'U', L, L, QT( N-L+1, N-L+1 ), LDQT, A, LDA )
-      ELSE
-         CALL CLACPY( 'U', M,     L, QT( N-L+1, N-L+1 ), LDQT, A, LDA )
-         CALL CLACPY( 'U', L - M, L - M, QT( N-L+M+1, N-L+M+1 ), LDQT,
-     $                B, LDB )
+      WORK( 1:LDG*N ) = CNAN
+      RWORK( 1:2*N ) = NAN
+*
+*     Compute X = V^T R1( 1:L, : ) and adjust for matrix scaling
+*
+      IF( WANTX ) THEN
+         LDX = L
+         IF ( L.LE.M ) THEN
+            CALL CGEMM( 'N', 'N', L, N, L,
+     $                  CONE, VT, LDVT, A, LDA,
+     $                  CZERO, WORK( 2 ), LDX )
+         ELSE
+            CALL CGEMM( 'N', 'N', L, N, M,
+     $                  CONE, VT( 1, 1 ), LDVT, A, LDA,
+     $                  CZERO, WORK( 2 ), LDX )
+            CALL CGEMM( 'N', 'N', L, N - M, L - M,
+     $                  CONE, VT( 1, M + 1 ), LDVT, B, LDB,
+     $                  CONE, WORK( L*M + 2 ), LDX )
+         END IF
+*        Revert column permutation Π by permuting the columns of X
+         CALL CLAPMT( .FALSE., L, N, WORK( 2 ), LDX, IWORK )
       END IF
 *
-*     DEBUG
-*
-      CALL CLASET( 'U', L, L, CNAN, CNAN, QT( 1, N-L+1 ), LDQT )
-      WORK( L+1:LWORK ) = CNAN
-*
-*     Explicitly form Q^T
-*
-      IF( WANTQT ) THEN
-         CALL CUNGRQ( N, N, L, QT, LDQT, WORK,
-     $                WORK( L + 1 ), LWORK - L, INFO )
-         IF ( INFO.NE.0 ) THEN
-            RETURN
+*     Adjust generalized singular values for matrix scaling
+*     Compute sine, cosine values
+*     Prepare row scaling of X
+*
+      DO I = 1, K
+         THETA = ALPHA( I )
+*        Do not adjust singular value if THETA is greater
+*        than pi/2 (infinite singular values won't change)
+         IF( COS( THETA ).LE.0.0E0 ) THEN
+            ALPHA( I ) = 0.0E0
+            BETA( I ) = 1.0E0
+            IF( WANTX ) THEN
+               RWORK( I ) = 1.0E0
+            END IF
+         ELSE
+*           iota comes in the greek alphabet after theta
+            IOTA = ATAN( W * TAN( THETA ) )
+*           ensure sine, cosine divisor is far away from zero
+*           w is a power of two and will cause no trouble
+            IF( SIN( IOTA ) .GE. COS( IOTA ) ) THEN
+               ALPHA( I ) =  ( SIN( IOTA ) / TAN( THETA ) ) / W
+               BETA( I ) = SIN( IOTA )
+               IF( WANTX ) THEN
+                  RWORK( I ) = SIN( THETA ) / SIN( IOTA )
+               END IF
+            ELSE
+               ALPHA( I ) = COS( IOTA )
+               BETA( I ) = SIN( IOTA )
+               IF( WANTX ) THEN
+                  RWORK( I ) = COS( THETA ) / COS( IOTA ) / W
+               END IF
+            END IF
          END IF
-*
-*     Revert column permutation Π by permuting the rows of Q^T
-*
-         CALL CLAPMT( .FALSE., N, N, QT, LDQT, IWORK )
+      END DO
+*     Adjust rows of X for matrix scaling
+      IF( WANTX ) THEN
+         DO J = 0, N-1
+            DO I = 1, K1
+               WORK( LDX*J + I + 1 ) = WORK( LDX*J + I + 1 ) / W
+            END DO
+            DO I = 1, K
+               WORK( LDX*J + I + K1 + 1 ) =
+     $         WORK( LDX*J + I + K1 + 1 ) * RWORK( I )
+            END DO
+         END DO
       END IF
 *
-      WORK( 1 ) = CMPLX( REAL(LWKOPT), 0.0E0 )
-      RWORK( 1 ) = REAL(LRWKOPT)
-
+      WORK( 1 ) = CMPLX( REAL( LWKOPT ), 0.0E0 )
+      RWORK( 1 ) = REAL( LRWKOPT )
       RETURN
 *
 *     End of CGGQRCS

From 02cc14d492f37e065a00aad8374109e35064a3e6 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Thu, 7 May 2020 21:52:33 +0200
Subject: [PATCH 073/101] ZGGQRCS: update implementation

Port the most recent CGGQRCS code to double-precision complex.
---
 SRC/zggqrcs.f | 580 +++++++++++++++++++++++++++-----------------------
 1 file changed, 312 insertions(+), 268 deletions(-)

diff --git a/SRC/zggqrcs.f b/SRC/zggqrcs.f
index 8bee9e7542..3214722c01 100644
--- a/SRC/zggqrcs.f
+++ b/SRC/zggqrcs.f
@@ -18,22 +18,22 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
+*       SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L, SWAPPED,
 *                           A, LDA, B, LDB,
-*                           THETA, U1, LDU1, U2, LDU2, QT, LDQT,
+*                           ALPHA, BETA,
+*                           U1, LDU1, U2, LDU2
 *                           WORK, LWORK, RWORK, LRWORK, IWORK, INFO )
 *
 *       .. Scalar Arguments ..
-*       CHARACTER          JOBU1, JOB2, JOBQT
-*       INTEGER            INFO, LDA, LDB, LDU1, LDU2, LDQT,
-*      $                   M, N, P, L, LWORK, LRWORK
-*       DOUBLE PRECISION   W
+*       CHARACTER          JOBU1, JOB2, JOBX
+*       INTEGER            INFO, LDA, LDB, LDU1, LDU2, M, N, P, L,
+*                          LWORK, LRWORK
 *       ..
 *       .. Array Arguments ..
 *       INTEGER            IWORK( * )
-*       DOUBLE PRECISION   THETA( * ), RWORK( * )
+*       DOUBLE PRECISION   ALPHA( N ), BETA( N ), RWORK( * )
 *       COMPLEX*16         A( LDA, * ), B( LDB, * ),
-*      $                   U1( LDU1, * ), U2( LDU2, * ), QT( LDQ, * ),
+*      $                   U1( LDU1, * ), U2( LDU2, * ),
 *      $                   WORK( * )
 *       ..
 *
@@ -46,60 +46,78 @@
 *> ZGGQRCS computes the generalized singular value decomposition (GSVD)
 *> of an M-by-N complex matrix A and P-by-N complex matrix B:
 *>
-*>       U1**T*A*Q = D1*( 0 R ),    U2**T*B*Q = D2*( 0 R )
+*>       A = U1 * D1 * X,           B = U2 * D2 * X
 *>
-*> where U1, U2, and Q are orthogonal matrices. ZGGQRCS uses the QR
+*> where U1 and U2 are unitary matrices. ZGGQRCS uses the QR
 *> factorization with column pivoting and the 2-by-1 CS decomposition to
 *> compute the GSVD.
 *>
-*> Let L be the effective numerical rank of the matrix (A**T,B**T)**T,
-*> then R is an L-by-L nonsingular upper triangular matrix, D1 and
-*> D2 are M-by-L and P-by-L "diagonal" matrices and of the
-*> following structures, respectively:
+*> Let L be the effective numerical rank of the matrix (A**H,B**H)**H,
+*> then X is a L-by-N nonsingular matrix, D1 and D2 are M-by-L and
+*> P-by-L "diagonal" matrices. If SWAPPED is false, then D1 and D2 are
+*> of the of the following structures, respectively:
 *>
-*>                        K   K1
-*>        D1 =     (  0   0   0 )
-*>              K  (  0   S   0 )
-*>              K1 (  0   0   I )
+*>                 K1  K
+*>            K1 [ I   0   0 ]
+*>       D1 = K  [ 0   C   0 ]
+*>               [ 0   0   0 ]
 *>
-*>                    K2  K
-*>        D2 =  K2 (  I   0   0 )
-*>              K  (  0   C   0 )
-*>                 (  0   0   0 )
-*>
-*>                 N-L  L
-*>   ( 0 R ) = L (  0   R )
+*>                     K   K2
+*>               [ 0   0   0 ]
+*>       D2 = K  [ 0   S   0 ]
+*>            K2 [ 0   0   I ]
 *>
 *> where
 *>
 *>   K  = MIN(M, P, L, M + P - L),
 *>   K1 = MAX(L - P, 0),
 *>   K2 = MAX(L - M, 0),
-*>   C  = diag( COS(THETA(1)), ..., COS(THETA(K)) ),
-*>   S  = diag( SIN(THETA(1)), ..., SIN(THETA(K)) ), and
+*>   C  = diag( ALPHA(1), ..., ALPHA(K) ),
+*>   S  = diag( BETA(1), ..., BETA(K) ), and
+*>   C^2 + S^2 = I.
+*>
+*> If SWAPPED is true, then D1 and D2 are of the of the following
+*> structures, respectively:
+*>
+*>                     K   K1
+*>               [ 0   0   0 ]
+*>       D1 = K  [ 0   S   0 ]
+*>            K1 [ 0   0   I ]
+*>
+*>                 K2  K
+*>            K2 [ I   0   0 ]
+*>       D2 = K  [ 0   C   0 ]
+*>               [ 0   0   0 ]
+*>
+*> where
+*>
+*>   S  = diag( ALPHA(1), ..., ALPHA(K) ),
+*>   C  = diag( BETA(1), ..., BETA(K) ), and
 *>   C^2 + S^2 = I.
 *>
-*> The routine computes C, S, R, and optionally the orthogonal
-*> transformation matrices U, V and Q. If L <= M, then R is stored in
-*> A(1:L, 1:L) on exit. Otherwise, the first M rows of R are stored in
-*> A(:, 1:L) and R( M+1:, M+1: ) is stored in B(1:L-M, 1:L-M). In both
-*> cases, only the upper triangular part is stored.
+*> The routine computes C, S and optionally the matrices U1, U2, and X.
+*> On exit, X is stored in WORK( 2:L*N+1 ).
 *>
-*> In particular, if B is an N-by-N nonsingular matrix, then the GSVD of
-*> A and B implicitly gives the SVD of A*inv(B):
-*>                      A*inv(B) = U1*(D1*inv(D2))*U2**T.
-*> If (A**T,B**T)**T  has orthonormal columns, then the GSVD of A and B
+*> If B is an N-by-N nonsingular matrix, then the GSVD of the matrix
+*> pair (A, B) implicitly gives the SVD of A*inv(B):
+*>
+*>       A*inv(B) = U1*(D1*inv(D2))*U2**H.
+*>
+*> If (A**H,B**H)**H  has orthonormal columns, then the GSVD of A and B
 *> is also equal to the CS decomposition of A and B. Furthermore, the
 *> GSVD can be used to derive the solution of the eigenvalue problem:
-*>                      A**T*A x = lambda * B**T*B x.
+*>
+*>       A**H*A x = lambda * B**H*B x.
+*>
 *> In some literature, the GSVD of A and B is presented in the form
-*>                  U1**T*A*X = ( 0 D1 ),   U2**T*B*X = ( 0 D2 )
-*> where U1 and U2 are orthogonal and X is nonsingular, D1 and D2 are
-*> ``diagonal''.  The former GSVD form can be converted to the latter
-*> form by taking the nonsingular matrix X as
 *>
-*>                      X = Q*( I   0    )
-*>                            ( 0 inv(R) ).
+*>       A = U1*D1*( 0 R )*Q**H,    B = U2*D2*( 0 R )*Q**H
+*>
+*> where U1, U2, and Q are unitary matrices. This latter GSVD form is
+*> computed directly by DGGSVD3. It is possible to convert between the
+*> two representations by calculating the RQ decomposition of X but this
+*> is not recommended for reasons of numerical stability.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -119,11 +137,11 @@
 *>          = 'N':  U2 is not computed.
 *> \endverbatim
 *>
-*> \param[in] JOBQT
+*> \param[in] JOBX
 *> \verbatim
-*>          JOBQT is CHARACTER*1
-*>          = 'Y':  Orthogonal matrix Q is computed;
-*>          = 'N':  Q is not computed.
+*>          JOBX is CHARACTER*1
+*>          = 'Y':  Matrix X is computed;
+*>          = 'N':  X is not computed.
 *> \endverbatim
 *>
 *> \param[in] M
@@ -144,28 +162,25 @@
 *>          The number of rows of the matrix B.  P >= 1.
 *> \endverbatim
 *>
-*> \param[out] W
-*> \verbatim
-*>          W is DOUBLE PRECISION
-*>
-*>          On exit, W is a radix power chosen such that the Frobenius
-*>          norm of A and W*B are within sqrt(radix) and 1/sqrt(radix)
-*>          of each other.
-*> \endverbatim
-*>
 *> \param[out] L
 *> \verbatim
 *>          L is INTEGER
 *>          On exit, the effective numerical rank of the matrix
-*>          (A**T, B**T)**T.
+*>          (A**H, B**H)**H.
+*> \endverbatim
+*>
+*> \param[out] SWAPPED
+*> \verbatim
+*>          L is LOGICAL
+*>          On exit, SWAPPED is true if ZGGQRCS swapped the input
+*>          matrices A, B and computed the GSVD of (B, A); false
+*>          otherwise.
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX*16 array, dimension (LDA,N)
 *>          On entry, the M-by-N matrix A.
-*>          On exit, A contains the triangular matrix R or the first M
-*>          rows of R, respectively. See Purpose for details.
 *> \endverbatim
 *>
 *> \param[in] LDA
@@ -178,8 +193,6 @@
 *> \verbatim
 *>          B is COMPLEX*16 array, dimension (LDB,N)
 *>          On entry, the P-by-N matrix B.
-*>          On exit, if L > M, then B contains the last L - M rows of
-*>          the triangular matrix R. See Purpose for details.
 *> \endverbatim
 *>
 *> \param[in] LDB
@@ -188,18 +201,23 @@
 *>          The leading dimension of the array B. LDB >= max(1,P).
 *> \endverbatim
 *>
-*> \param[out] THETA
+*> \param[out] ALPHA
 *> \verbatim
-*>          THETA is DOUBLE PRECISION array, dimension (N)
+*>          ALPHA is DOUBLE PRECISIONarray, dimension (N)
+*> \endverbatim
+*>
+*> \param[out] BETA
+*> \verbatim
+*>          BETA is DOUBLE PRECISIONarray, dimension (N)
 *>
-*>          On exit, THETA contains K = MIN(M, P, L, M + P - L) values
-*>          in radians in ascending order.
+*>          On exit, ALPHA and BETA contain the K generalized singular
+*>          value pairs of A and B.
 *> \endverbatim
 *>
 *> \param[out] U1
 *> \verbatim
 *>          U1 is COMPLEX*16 array, dimension (LDU1,M)
-*>          If JOBU1 = 'Y', U1 contains the M-by-M orthogonal matrix U1.
+*>          If JOBU1 = 'Y', U1 contains the M-by-M unitary matrix U1.
 *>          If JOBU1 = 'N', U1 is not referenced.
 *> \endverbatim
 *>
@@ -213,7 +231,7 @@
 *> \param[out] U2
 *> \verbatim
 *>          U2 is COMPLEX*16 array, dimension (LDU2,P)
-*>          If JOBU2 = 'Y', U2 contains the P-by-P orthogonal matrix U2.
+*>          If JOBU2 = 'Y', U2 contains the P-by-P unitary matrix U2.
 *>          If JOBU2 = 'N', U2 is not referenced.
 *> \endverbatim
 *>
@@ -224,20 +242,6 @@
 *>          JOBU2 = 'Y'; LDU2 >= 1 otherwise.
 *> \endverbatim
 *>
-*> \param[out] QT
-*> \verbatim
-*>          QT is COMPLEX*16 array, dimension (LDQT,N)
-*>          If JOBQT = 'Y', QT contains the N-by-N orthogonal matrix
-*>          Q**T.
-*> \endverbatim
-*>
-*> \param[in] LDQT
-*> \verbatim
-*>          LDQT is INTEGER
-*>          The leading dimension of the array QT. LDQT >= max(1,N) if
-*>          JOBQT = 'Y'; LDQT >= 1 otherwise.
-*> \endverbatim
-*>
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
@@ -257,7 +261,7 @@
 *>
 *> \param[out] RWORK
 *> \verbatim
-*>          RWORK is DOUBLE PRECISION array, dimension (MAX(1,LRWORK))
+*>          RWORK is DOUBLE PRECISIONarray, dimension (MAX(1,LRWORK))
 *> \endverbatim
 *>
 *> \param[in] LRWORK
@@ -281,18 +285,26 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit.
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value.
-*>          > 0:  ZBBCSD did not converge. For further details, see
-*>                subroutine ZUNCSDBY1.
+*>          > 0:  CBBCSD did not converge. For further details, see
+*>                subroutine CUNCSDBY1.
 *> \endverbatim
 *
 *> \par Internal Parameters:
 *  =========================
 *>
+*> \param[out] W
+*> \verbatim
+*>          W is DOUBLE PRECISION
+*>          W is a radix power chosen such that the Frobenius norm of A
+*>          and W*B are with SQRT(RADIX) and 1/SQRT(RADIX) of each
+*>          other.
+*> \endverbatim
+*>
 *> \verbatim
 *>  TOL     DOUBLE PRECISION
-*>          Let G = (A**T,B**T)**T. TOL is the threshold to determine
+*>          Let G = (A**H,B**H)**H. TOL is the threshold to determine
 *>          the effective rank of G. Generally, it is set to
-*>                   TOL = MAX(M,P,N) * norm(G) * MACHEPS,
+*>                   TOL = MAX( M + P, N ) * norm(G) * MACHEPS,
 *>          where norm(G) is the Frobenius norm of G.
 *>          The size of TOL may affect the size of backward error of the
 *>          decomposition.
@@ -303,9 +315,9 @@
 *
 *> \author Christoph Conrads (https://christoph-conrads.name)
 *
-*> \date April 2020
+*> \date October 2019, May 2020
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup realGEsing
 *
 *> \par Contributors:
 *  ==================
@@ -316,60 +328,69 @@
 *> \par Further Details:
 *  =====================
 *>
-*>  ZGGQRCS should be significantly faster than ZGGSVD and ZGGSVD3 for
-*>  large matrices because the matrices A and B are reduced to a pair of
+*>  ZGGQRCS should be significantly faster than DGGSVD3 for large
+*>  matrices because the matrices A and B are reduced to a pair of
 *>  well-conditioned bidiagonal matrices instead of pairs of upper
 *>  triangular matrices. On the downside, ZGGQRCS requires a much larger
-*>  workspace whose dimension must be queried at run-time.
+*>  workspace whose dimension must be queried at run-time. ZGGQRCS also
+*>  offers no guarantees which of the two possible diagonal matrices
+*>  is used for the matrix factorization.
 *>
 *  =====================================================================
-      SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
-     $                    A, LDA, B, LDB,
-     $                    THETA, U1, LDU1, U2, LDU2, QT, LDQT,
-     $                    WORK, LWORK, RWORK, LRWORK, IWORK, INFO )
-*
-*  -- LAPACK driver routine (version 3.X.0) --
+      RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
+     $                              SWAPPED,
+     $                              A, LDA, B, LDB,
+     $                              ALPHA, BETA,
+     $                              U1, LDU1, U2, LDU2,
+     $                              WORK, LWORK, RWORK, LRWORK, IWORK,
+     $                              INFO )
+*
+*  -- LAPACK driver routine (version 3.7.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     <DATE>
+*     September 2016
 *
       IMPLICIT NONE
 *     .. Scalar Arguments ..
-      CHARACTER          JOBU1, JOBU2, JOBQT
-      INTEGER            INFO, LDA, LDB, LDU1, LDU2, LDQT,
-     $                   L, M, N, P, LWORK, LRWORK
-      DOUBLE PRECISION   W
+      LOGICAL            SWAPPED
+      CHARACTER          JOBU1, JOBU2, JOBX
+      INTEGER            INFO, LDA, LDB, LDU1, LDU2, L, M, N, P, LWORK,
+     $                   LRWKOPT, LRWORK, LRWORK2BY1
 *     ..
 *     .. Array Arguments ..
       INTEGER            IWORK( * )
-      DOUBLE PRECISION   THETA( * ), RWORK( * )
+      DOUBLE PRECISION   ALPHA( N ), BETA( N ), RWORK( * )
       COMPLEX*16         A( LDA, * ), B( LDB, * ),
-     $                   U1( LDU1, * ), U2( LDU2, * ), QT( LDQT, * ),
+     $                   U1( LDU1, * ), U2( LDU2, * ),
      $                   WORK( * )
 *     ..
 *
 *  =====================================================================
 *
+*     .. Parameters ..
+      COMPLEX*16         CONE, CZERO
+      PARAMETER          ( CONE = ( 1.0D0, 0.0D0 ),
+     $                   CZERO = ( 0.0D0, 0.0D0 ) )
 *     .. Local Scalars ..
-      LOGICAL            WANTU1, WANTU2, WANTQT, LQUERY
-      INTEGER            I, J, LMAX, Z, LDG, LWKOPT, LRWKOPT,
-     $                   LRWORK2BY1
-      DOUBLE PRECISION   GNORM, TOL, ULP, UNFL, NORMA, NORMB, BASE, NAN
-      COMPLEX*16         ZERO, ONE, ZNAN
+      LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
+      INTEGER            I, J, K, K1, LMAX, Z, LDG, LDX, LDVT, LWKOPT
+      DOUBLE PRECISION   BASE, NAN, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
+     $                   THETA, IOTA, W
+      COMPLEX*16         ZNAN
 *     .. Local Arrays ..
-      COMPLEX*16         G( M + P, N )
+      COMPLEX*16         G( M + P, N ), VT( N, N )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      DOUBLE PRECISION   DLAMCH, ZLANGE
+      COMPLEX*16         DLAMCH, ZLANGE
       EXTERNAL           LSAME, DLAMCH, ZLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ZGEMM, ZGEQP3, ZGERQF, ZLACPY, ZLAPMT, ZLASCL,
-     $                   ZLASET, ZUNGQR, ZUNGRQ, ZUNCSD2BY1, XERBLA
+      EXTERNAL           ZGEMM, ZGEQP3, ZLACPY, ZLAPMT, ZLASCL,
+     $                   ZLASET, ZUNGQR, ZUNCSD2BY1, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX, MIN
+      INTRINSIC          COS, MAX, MIN, SIN, SQRT
 *     ..
 *     .. Executable Statements ..
 *
@@ -377,28 +398,9 @@ SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
       WANTU1 = LSAME( JOBU1, 'Y' )
       WANTU2 = LSAME( JOBU2, 'Y' )
-      WANTQT = LSAME( JOBQT, 'Y' )
-      LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 )
+      WANTX = LSAME( JOBX, 'Y' )
+      LQUERY = LWORK.EQ.-1 .OR. LRWORK.EQ.-1
       LWKOPT = 1
-      LRWKOPT = 2*N
-*
-*     Initialize variables
-*
-      L = 0
-      LMAX = MIN( M + P, N )
-      Z = ( M + P ) * N
-      IF ( LQUERY ) THEN
-         G = 0
-      ELSE
-         G = WORK( 1 )
-      END IF
-      LDG = M + P
-      ZERO = (0.0D0, 0.0D0)
-      ONE = (1.0D0, 0.0D0)
-*     Computing 0.0 / 0.0 directly causes compiler errors
-      NAN = 1.0D0
-      NAN = 0.0 / (NAN - 1.0D0)
-      ZNAN = DCMPLX(NAN,NAN)
 *
 *     Test the input arguments
 *
@@ -407,7 +409,7 @@ SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
          INFO = -1
       ELSE IF( .NOT.( WANTU2 .OR. LSAME( JOBU2, 'N' ) ) ) THEN
          INFO = -2
-      ELSE IF( .NOT.( WANTQT .OR. LSAME( JOBQT, 'N' ) ) ) THEN
+      ELSE IF( .NOT.( WANTX .OR. LSAME( JOBX, 'N' ) ) ) THEN
          INFO = -3
       ELSE IF( M.LT.1 ) THEN
          INFO = -4
@@ -416,39 +418,86 @@ SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       ELSE IF( P.LT.1 ) THEN
          INFO = -6
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
-         INFO = -10
+         INFO = -9
       ELSE IF( LDB.LT.MAX( 1, P ) ) THEN
-         INFO = -12
+         INFO = -11
       ELSE IF( LDU1.LT.1 .OR. ( WANTU1 .AND. LDU1.LT.M ) ) THEN
          INFO = -15
       ELSE IF( LDU2.LT.1 .OR. ( WANTU2 .AND. LDU2.LT.P ) ) THEN
          INFO = -17
-      ELSE IF( LDQT.LT.1 .OR. ( WANTQT .AND. LDQT.LT.N ) ) THEN
-         INFO = -19
       ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
-         INFO = -23
-      ELSE IF( LRWORK.LT.2*N .AND. .NOT.LQUERY ) THEN
-         INFO = -25
+         INFO = -19
+      END IF
+*
+*     Make sure A is the matrix smaller in norm
+*
+      IF( INFO.EQ.0 ) THEN
+         NORMA = ZLANGE( 'F', M, N, A, LDA, RWORK )
+         NORMB = ZLANGE( 'F', P, N, B, LDB, RWORK )
+*
+         IF( NORMA.GT.SQRT( 2.0D0 ) * NORMB ) THEN
+            CALL ZGGQRCS( JOBU2, JOBU1, JOBX, P, N, M, L,
+     $                    SWAPPED,
+     $                    B, LDB, A, LDA,
+     $                    BETA, ALPHA,
+     $                    U2, LDU2, U1, LDU1,
+     $                    WORK, LWORK, RWORK, LRWORK, IWORK, INFO )
+            SWAPPED = .TRUE.
+            RETURN
+         ENDIF
+*
+*     Past this point, we know that
+*     * NORMA <= NORMB (almost)
+*     * W >= 1
+*     * ALPHA will contain cosine values at the end
+*     * BETA will contain sine values at the end
+*
       END IF
 *
-*     Compute optimal workspace size
+*     Initialize variables
+*
+*     Computing 0.0 / 0.0 directly causes compiler errors
+      NAN = 1.0D0
+      NAN = 0.0 / (NAN - 1.0D0)
+      ZNAN = DCMPLX( NAN, NAN )
+*
+      SWAPPED = .FALSE.
+      L = 0
+      LMAX = MIN( M + P, N )
+      Z = ( M + P ) * N
+      G = WORK( 1 )
+      LDG = M + P
+      VT = 0
+      LDVT = N
+      THETA = NAN
+      IOTA = NAN
+      W = NAN
+*
+*     Compute workspace
 *
       IF( INFO.EQ.0 ) THEN
-*        ZGEQP3, ZUNGQR read/store LMAX scalar factors
-         CALL ZGEQP3( M+P, N, G, LDG, IWORK, WORK,
-     $                WORK, -1, RWORK, INFO )
-         LWKOPT = INT( WORK( 1 ) ) + LMAX
-
+         LWKOPT = 0
+*
+         CALL ZGEQP3( M + P, N, G, LDG, IWORK, WORK, WORK, -1, RWORK,
+     $                INFO )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
+         LWKOPT = INT( WORK( 1 ) )
+*
          CALL ZUNGQR( M + P, LMAX, LMAX, G, LDG, WORK, WORK, -1, INFO )
-         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
-
-         CALL ZUNCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, LMAX,
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
+*
+         CALL ZUNCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, LMAX,
      $                    G, LDG, G, LDG,
-     $                    THETA, U2, LDU2, U1, LDU1, QT, LDQT,
+     $                    ALPHA,
+     $                    U1, LDU1, U2, LDU2, VT, LDVT,
      $                    WORK, -1, RWORK, LRWORK, IWORK, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
-*        The matrix (A, B) must be stored sequentially for xUNCSD2BY1
-         LWKOPT = Z + LWKOPT
+*        The matrix (A, B) must be stored sequentially for ZUNGQR
+         LWKOPT = LWKOPT + Z
+*        2-by-1 CSD matrix V1 must be stored
+         IF( WANTX ) THEN
+            LWKOPT = LWKOPT + LDVT*N
+         END IF
 *        Adjust ZUNCSD2BY1 LRWORK for case with maximum memory
 *        consumption
          LRWORK2BY1 = INT( RWORK(1) )
@@ -459,14 +508,7 @@ SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
      $                - 8 * MAX( 0, MIN( M, P, N, M+P-N ) )
      $                + 8 * MIN( M, P, N )
          LRWKOPT = MAX( 2*N, LRWORK2BY1 )
-
-*        ZGERQF, ZUNGRQ read/store up to LMAX scalar factors
-         CALL ZGERQF( LMAX, N, QT, LDQT, WORK, WORK, -1, INFO )
-         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
-
-         CALL ZUNGRQ( N, N, LMAX, QT, LDQT, WORK, WORK, -1, INFO )
-         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) + LMAX )
-
+*
          WORK( 1 ) = DCMPLX( DBLE( LWKOPT ), 0.0D0 )
          RWORK( 1 ) = DBLE( LRWKOPT )
       END IF
@@ -478,32 +520,29 @@ SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
       IF( LQUERY ) THEN
          RETURN
       ENDIF
+*     Finish initialization
+      IF( WANTX ) THEN
+         VT = WORK( Z + 1 )
+      END IF
 *
-*     DEBUG
-*
-      IWORK( 1:M+N+P ) = -1
-*
-*     Scale matrix B such that norm(A) \approx norm(B)
-*
-      NORMA = ZLANGE( 'F', M, N, A, LDA, RWORK )
-      NORMB = ZLANGE( 'F', P, N, B, LDB, RWORK )
+*     Scale matrix A such that norm(A) \approx norm(B)
 *
-      IF ( NORMB.EQ.0 ) THEN
+      IF( NORMA.EQ.0.0D0 ) THEN
          W = 1.0D0
       ELSE
          BASE = DLAMCH( 'B' )
-         W = BASE ** INT( LOG( NORMA / NORMB ) / LOG( BASE ) )
+         W = BASE ** INT( LOG( NORMB / NORMA ) / LOG( BASE ) )
 *
-         CALL ZLASCL( 'G', -1, -1, 1.0D0, W, P, N, B, LDB, INFO )
+         CALL ZLASCL( 'G', -1, -1, 1.0D0, W, M, N, A, LDA, INFO )
          IF ( INFO.NE.0 ) THEN
             RETURN
          END IF
       END IF
 *
-*     Copy matrices A, B into the (M+P) x n matrix G
+*     Copy matrices A, B into the (M+P) x N matrix G
 *
-      CALL ZLACPY( 'A', M, N, A, LDA, G( P + 1, 1 ), LDG )
-      CALL ZLACPY( 'A', P, N, B, LDB, G( 1, 1 ), LDG )
+      CALL ZLACPY( 'A', M, N, A, LDA, G( 1, 1 ), LDG )
+      CALL ZLACPY( 'A', P, N, B, LDB, G( M + 1, 1 ), LDG )
 *
 *     DEBUG
 *
@@ -512,14 +551,14 @@ SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Compute the Frobenius norm of matrix G
 *
-      GNORM = ZLANGE( 'F', M + P, N, G, LDG, WORK( Z + 1 ) )
+      NORMG = NORMB * SQRT( 1.0D0 + ( ( W * NORMA ) / NORMB )**2 )
 *
 *     Get machine precision and set up threshold for determining
 *     the effective numerical rank of the matrix G.
 *
       ULP = DLAMCH( 'Precision' )
       UNFL = DLAMCH( 'Safe Minimum' )
-      TOL = MAX( M + P, N ) * MAX( GNORM, UNFL ) * ULP
+      TOL = MAX( M + P, N ) * MAX( NORMG, UNFL ) * ULP
 *
 *     IWORK stores the column permutations computed by ZGEQP3.
 *     Columns J where IWORK( J ) is non-zero are permuted to the front
@@ -537,42 +576,41 @@ SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     Determine the rank of G
 *
-      DO 20 I = 1, LMAX
+      DO I = 1, MIN( M + P, N )
          IF( ABS( G( I, I ) ).LE.TOL ) THEN
             EXIT
          END IF
          L = L + 1
-   20 CONTINUE
+      END DO
 *
 *     Handle rank=0 case
 *
       IF( L.EQ.0 ) THEN
          IF( WANTU1 ) THEN
-            CALL ZLASET( 'A', M, M, ZERO, ONE, U1, LDU1 )
+            CALL ZLASET( 'A', M, M, CZERO, CONE, U1, LDU1 )
          END IF
          IF( WANTU2 ) THEN
-            CALL ZLASET( 'A', P, P, ZERO, ONE, U2, LDU2 )
-         END IF
-         IF( WANTQT ) THEN
-            CALL ZLASET( 'A', N, N, ZERO, ONE, QT, LDQT )
+            CALL ZLASET( 'A', P, P, CZERO, CONE, U2, LDU2 )
          END IF
 *
-         WORK( 1 ) = DCMPLX( DBLE(LWKOPT), 0.0D0 )
-         RWORK( 1 ) = DBLE(LRWKOPT)
+         WORK( 1 ) = DCMPLX( DBLE( LWKOPT ), 0.0D0 )
+         RWORK( 1 ) = DBLE( LRWKOPT )
          RETURN
       END IF
 *
 *     Copy R1( 1:L, : ) into A, B and set lower triangular part to zero
 *
-      IF( L.LE.M ) THEN
-          CALL ZLACPY( 'U', L, N, G, LDG, A, LDA )
-          CALL ZLASET( 'L', L - 1, N, ZERO, ZERO, A( 2, 1 ), LDA )
-      ELSE
-          CALL ZLACPY( 'U', M, N, G, LDG, A, LDA )
-          CALL ZLACPY( 'U', L - M, N - M, G( M+1, M+1 ), LDG, B, LDB )
+      IF( WANTX ) THEN
+         IF( L.LE.M ) THEN
+             CALL ZLACPY( 'U', L, N, G, LDG, A, LDA )
+             CALL ZLASET( 'L', L - 1, N, CZERO, CZERO, A( 2, 1 ), LDA )
+         ELSE
+             CALL ZLACPY( 'U', M, N, G, LDG, A, LDA )
+             CALL ZLACPY( 'U', L - M, N - M, G( M+1,M+1 ), LDG, B, LDB )
 *
-          CALL ZLASET( 'L', M - 1, N, ZERO, ZERO, A( 2, 1 ), LDA )
-          CALL ZLASET( 'L', L-M-1, N, ZERO, ZERO, B( 2, 1 ), LDB )
+             CALL ZLASET( 'L', M - 1, N, CZERO, CZERO, A( 2, 1 ), LDA )
+             CALL ZLASET( 'L', L-M-1, N, CZERO, CZERO, B( 2, 1 ), LDB )
+         END IF
       END IF
 *
 *     Explicitly form Q1 so that we can compute the CS decomposition
@@ -585,92 +623,98 @@ SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBQT, M, N, P, W, L,
 *
 *     DEBUG
 *
-      RWORK( 1:LRWORK ) = NAN
-      WORK( Z+1:LWORK ) = ZNAN
+      ALPHA( 1:N ) = ZNAN
+      BETA( 1:N ) = ZNAN
 *
 *     Compute the CS decomposition of Q1( :, 1:L )
 *
-      CALL ZUNCSD2BY1( JOBU2, JOBU1, 'Y', M + P, P, L,
-     $                 G( 1, 1 ), LDG, G( P + 1, 1 ), LDG, THETA,
-     $                 U2, LDU2, U1, LDU1, QT, LDQT,
-     $                 WORK( Z + 1 ), LWORK - Z,
-     $                 RWORK, LRWORK, IWORK( N + 1 ), INFO )
+      K = MIN( M, P, L, M + P - L )
+      K1 = MAX( L - P, 0 )
+      CALL ZUNCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, L,
+     $                 G( 1, 1 ), LDG, G( M + 1, 1 ), LDG,
+     $                 ALPHA,
+     $                 U1, LDU1, U2, LDU2, VT, LDVT,
+     $                 WORK( Z + LDVT*N + 1 ), LWORK - Z - LDVT*N,
+     $                 RWORK, LRWORK,
+     $                 IWORK( N + 1 ), INFO )
       IF( INFO.NE.0 ) THEN
          RETURN
       END IF
 *
 *     DEBUG
 *
-      WORK( 1:LWORK ) = ZNAN
-      RWORK( 1:LRWORK ) = NAN
-*
-*     Copy V^T from QT to G
-*
-      CALL ZLACPY( 'A', L, L, QT, LDQT, G, LDG )
-*
-*     DEBUG
-*
-      CALL ZLASET( 'A', N, N, ZNAN, ZNAN, QT, LDQT )
-*
-*     Compute V^T R1( 1:L, : ) in the last L rows of QT
-*
-      IF ( L.LE.M ) THEN
-         CALL ZGEMM( 'N', 'N', L, N, L, ONE, G, LDG,
-     $               A, LDA, ZERO, QT( N-L+1, 1 ), LDQT )
-      ELSE
-         CALL ZGEMM( 'N', 'N', L, N, M, ONE, G( 1, 1 ), LDG,
-     $               A, LDA, ZERO, QT( N-L+1, 1 ), LDQT )
-         CALL ZGEMM( 'N', 'N', L, N - M, L - M, ONE,
-     $               G( 1, M + 1 ), LDG, B, LDB,
-     $               ONE, QT( N-L+1, M+1 ), LDQT )
-      END IF
-*
-*     DEBUG
-*
-      CALL ZLASET( 'A', M, N, ZNAN, ZNAN, A, LDA )
-      CALL ZLASET( 'A', P, N, ZNAN, ZNAN, B, LDB )
-      WORK(1:LWORK) = ZNAN
-*
-*     Compute the RQ decomposition of V^T R1( 1:L, : )
-*
-      CALL ZGERQF( L, N, QT( N-L+1, 1 ), LDQT, WORK( 1 ),
-     $             WORK( L + 1 ), LWORK - L, INFO )
-      IF ( INFO.NE.0 ) THEN
-         RETURN
-      END IF
-*
-*     Copy matrix L from QT( N-L+1:N, N-L+1:N ) to A, B
-*
-      IF ( L.LE.M ) THEN
-         CALL ZLACPY( 'U', L, L, QT( N-L+1, N-L+1 ), LDQT, A, LDA )
-      ELSE
-         CALL ZLACPY( 'U', M,     L, QT( N-L+1, N-L+1 ), LDQT, A, LDA )
-         CALL ZLACPY( 'U', L - M, L - M, QT( N-L+M+1, N-L+M+1 ), LDQT,
-     $                B, LDB )
+      WORK( 1:LDG*N ) = ZNAN
+      RWORK( 1:2*N ) = NAN
+*
+*     Compute X = V^T R1( 1:L, : ) and adjust for matrix scaling
+*
+      IF( WANTX ) THEN
+         LDX = L
+         IF ( L.LE.M ) THEN
+            CALL ZGEMM( 'N', 'N', L, N, L,
+     $                  CONE, VT, LDVT, A, LDA,
+     $                  CZERO, WORK( 2 ), LDX )
+         ELSE
+            CALL ZGEMM( 'N', 'N', L, N, M,
+     $                  CONE, VT( 1, 1 ), LDVT, A, LDA,
+     $                  CZERO, WORK( 2 ), LDX )
+            CALL ZGEMM( 'N', 'N', L, N - M, L - M,
+     $                  CONE, VT( 1, M + 1 ), LDVT, B, LDB,
+     $                  CONE, WORK( L*M + 2 ), LDX )
+         END IF
+*        Revert column permutation Π by permuting the columns of X
+         CALL ZLAPMT( .FALSE., L, N, WORK( 2 ), LDX, IWORK )
       END IF
 *
-*     DEBUG
-*
-      CALL ZLASET( 'U', L, L, ZNAN, ZNAN, QT( 1, N-L+1 ), LDQT )
-      WORK( L+1:LWORK ) = ZNAN
-*
-*     Explicitly form Q^T
-*
-      IF( WANTQT ) THEN
-         CALL ZUNGRQ( N, N, L, QT, LDQT, WORK,
-     $                WORK( L + 1 ), LWORK - L, INFO )
-         IF ( INFO.NE.0 ) THEN
-            RETURN
+*     Adjust generalized singular values for matrix scaling
+*     Compute sine, cosine values
+*     Prepare row scaling of X
+*
+      DO I = 1, K
+         THETA = ALPHA( I )
+*        Do not adjust singular value if THETA is greater
+*        than pi/2 (infinite singular values won't change)
+         IF( COS( THETA ).LE.0.0D0 ) THEN
+            ALPHA( I ) = 0.0D0
+            BETA( I ) = 1.0D0
+            IF( WANTX ) THEN
+               RWORK( I ) = 1.0D0
+            END IF
+         ELSE
+*           iota comes in the greek alphabet after theta
+            IOTA = ATAN( W * TAN( THETA ) )
+*           ensure sine, cosine divisor is far away from zero
+*           w is a power of two and will cause no trouble
+            IF( SIN( IOTA ) .GE. COS( IOTA ) ) THEN
+               ALPHA( I ) =  ( SIN( IOTA ) / TAN( THETA ) ) / W
+               BETA( I ) = SIN( IOTA )
+               IF( WANTX ) THEN
+                  RWORK( I ) = SIN( THETA ) / SIN( IOTA )
+               END IF
+            ELSE
+               ALPHA( I ) = COS( IOTA )
+               BETA( I ) = SIN( IOTA )
+               IF( WANTX ) THEN
+                  RWORK( I ) = COS( THETA ) / COS( IOTA ) / W
+               END IF
+            END IF
          END IF
-*
-*     Revert column permutation Π by permuting the rows of Q^T
-*
-         CALL ZLAPMT( .FALSE., N, N, QT, LDQT, IWORK )
+      END DO
+*     Adjust rows of X for matrix scaling
+      IF( WANTX ) THEN
+         DO J = 0, N-1
+            DO I = 1, K1
+               WORK( LDX*J + I + 1 ) = WORK( LDX*J + I + 1 ) / W
+            END DO
+            DO I = 1, K
+               WORK( LDX*J + I + K1 + 1 ) =
+     $         WORK( LDX*J + I + K1 + 1 ) * RWORK( I )
+            END DO
+         END DO
       END IF
 *
-      WORK( 1 ) = DCMPLX( DBLE(LWKOPT), 0.0D0 )
-      RWORK( 1 ) = DBLE(LRWKOPT)
-
+      WORK( 1 ) = DCMPLX( DBLE( LWKOPT ), 0.0D0 )
+      RWORK( 1 ) = DBLE( LRWKOPT )
       RETURN
 *
 *     End of ZGGQRCS

From 3b0f2ac15ba5f9978a627e2ad0352a2863e8b180 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Thu, 7 May 2020 21:58:34 +0200
Subject: [PATCH 074/101] {s,d}GGQRCS: fix a formula in documentation

---
 SRC/dggqrcs.f | 2 +-
 SRC/sggqrcs.f | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index e7c30097fd..de4c63456c 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -287,7 +287,7 @@
 *>  TOL     DOUBLE PRECISION
 *>          Let G = (A**T,B**T)**T. TOL is the threshold to determine
 *>          the effective rank of G. Generally, it is set to
-*>                   TOL = MAX(M,P,N) * norm(G) * MACHEPS,
+*>                   TOL = MAX( M + P, N ) * norm(G) * MACHEPS,
 *>          where norm(G) is the Frobenius norm of G.
 *>          The size of TOL may affect the size of backward error of the
 *>          decomposition.
diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index cd71591cbc..e84551a90a 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -287,7 +287,7 @@
 *>  TOL     REAL
 *>          Let G = (A**T,B**T)**T. TOL is the threshold to determine
 *>          the effective rank of G. Generally, it is set to
-*>                   TOL = MAX(M,P,N) * norm(G) * MACHEPS,
+*>                   TOL = MAX( M + P, N ) * norm(G) * MACHEPS,
 *>          where norm(G) is the Frobenius norm of G.
 *>          The size of TOL may affect the size of backward error of the
 *>          decomposition.

From 762ef542972847a366ab29aade34041c02ae942d Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Sun, 10 May 2020 13:00:31 +0200
Subject: [PATCH 075/101] SGGQRCS: try speeding up matrix multiplication

For tall matrices, xGGSVD3 is faster than xGGQRCS. This commit tries to
improve performance by speeding up the matrix-matrix multiplication
`V^* R( 1:L, N )`, where `R` is upper triangular with, with `xTRMM()`.
---
 SRC/sggqrcs.f | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index e84551a90a..9ec1fac42f 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -363,7 +363,7 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMM, SGEQP3, SLACPY, SLAPMT, SLASCL,
-     $                   SLASET, SORGQR, SORCSD2BY1, XERBLA
+     $                   SLASET, STRMM, SORGQR, SORCSD2BY1, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          COS, MAX, MIN, SIN, SQRT
@@ -611,9 +611,13 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       IF( WANTX ) THEN
          LDX = L
          IF ( L.LE.M ) THEN
-            CALL SGEMM( 'N', 'N', L, N, L,
-     $                  1.0E0, VT, LDVT, A, LDA,
-     $                  0.0E0, WORK( 2 ), LDX )
+      CALL SLACPY( 'A', M, N, A, LDA, G( 1, 1 ), LDG )
+            CALL SLACPY( 'A', L, L, VT, LDVT, WORK( 2 ), LDX )
+            CALL STRMM( 'R', 'U', 'N', 'N', L, L,
+     $                  1.0E0, A, LDA, WORK( 2 ), LDX )
+            CALL SGEMM( 'N', 'N', L, N - L, L,
+     $                  1.0E0, VT, LDVT, A( 1, L + 1 ), LDA,
+     $                  0.0E0, WORK( LDX*L + 2 ), LDX )
          ELSE
             CALL SGEMM( 'N', 'N', L, N, M,
      $                  1.0E0, VT( 1, 1 ), LDVT, A, LDA,

From 1593ed712894ddee0112852dcb33e634725cce2d Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Mon, 11 May 2020 11:55:49 +0200
Subject: [PATCH 076/101] Revert "SGGQRCS: try speeding up matrix
 multiplication"

Revert because
* the commit was incomplete and forgot to remove the superfluous SLASCL
  call zeroing the lower triangular matrix part; with this called
  removed...
* there is hardly any difference for matrices with m, n, p <= 256,
* if there is a difference, it may be in favor of xGEMM.
---
 SRC/sggqrcs.f | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 9ec1fac42f..e84551a90a 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -363,7 +363,7 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMM, SGEQP3, SLACPY, SLAPMT, SLASCL,
-     $                   SLASET, STRMM, SORGQR, SORCSD2BY1, XERBLA
+     $                   SLASET, SORGQR, SORCSD2BY1, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          COS, MAX, MIN, SIN, SQRT
@@ -611,13 +611,9 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       IF( WANTX ) THEN
          LDX = L
          IF ( L.LE.M ) THEN
-      CALL SLACPY( 'A', M, N, A, LDA, G( 1, 1 ), LDG )
-            CALL SLACPY( 'A', L, L, VT, LDVT, WORK( 2 ), LDX )
-            CALL STRMM( 'R', 'U', 'N', 'N', L, L,
-     $                  1.0E0, A, LDA, WORK( 2 ), LDX )
-            CALL SGEMM( 'N', 'N', L, N - L, L,
-     $                  1.0E0, VT, LDVT, A( 1, L + 1 ), LDA,
-     $                  0.0E0, WORK( LDX*L + 2 ), LDX )
+            CALL SGEMM( 'N', 'N', L, N, L,
+     $                  1.0E0, VT, LDVT, A, LDA,
+     $                  0.0E0, WORK( 2 ), LDX )
          ELSE
             CALL SGEMM( 'N', 'N', L, N, M,
      $                  1.0E0, VT( 1, 1 ), LDVT, A, LDA,

From baf0f4120971749723b205e124d96b145fdcf118 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Mon, 11 May 2020 20:36:33 +0200
Subject: [PATCH 077/101] xGGQRCS: fix out-of-bounds access

Fix out-of-bounds access when computing only the singular values.
---
 SRC/cggqrcs.f | 2 ++
 SRC/dggqrcs.f | 2 ++
 SRC/sggqrcs.f | 2 ++
 SRC/zggqrcs.f | 2 ++
 4 files changed, 8 insertions(+)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index 2c17a9fcf0..bc89d91dd1 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -523,6 +523,8 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     Finish initialization
       IF( WANTX ) THEN
          VT = WORK( Z + 1 )
+      ELSE
+         LDVT = 0
       END IF
 *
 *     Scale matrix A such that norm(A) \approx norm(B)
diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index de4c63456c..d3cb981619 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -486,6 +486,8 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     Finish initialization
       IF( WANTX ) THEN
          VT = WORK( Z + 1 )
+      ELSE
+         LDVT = 0
       END IF
 *
 *     Scale matrix A such that norm(A) \approx norm(B)
diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index e84551a90a..27752ddad7 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -486,6 +486,8 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     Finish initialization
       IF( WANTX ) THEN
          VT = WORK( Z + 1 )
+      ELSE
+         LDVT = 0
       END IF
 *
 *     Scale matrix A such that norm(A) \approx norm(B)
diff --git a/SRC/zggqrcs.f b/SRC/zggqrcs.f
index 3214722c01..e0704b79c3 100644
--- a/SRC/zggqrcs.f
+++ b/SRC/zggqrcs.f
@@ -523,6 +523,8 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     Finish initialization
       IF( WANTX ) THEN
          VT = WORK( Z + 1 )
+      ELSE
+         LDVT = 0
       END IF
 *
 *     Scale matrix A such that norm(A) \approx norm(B)

From b9ca9b67594770837c0738161028e365eda42121 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Sat, 16 Jan 2021 17:42:04 +0000
Subject: [PATCH 078/101] xLASRTI: improve documentation wording

---
 SRC/dlasrti.f | 5 ++---
 SRC/slasrti.f | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/SRC/dlasrti.f b/SRC/dlasrti.f
index 794748770e..195cd97980 100644
--- a/SRC/dlasrti.f
+++ b/SRC/dlasrti.f
@@ -66,10 +66,9 @@
 *>
 *> \param[in,out] INDICES
 *> \verbatim
-*>          X is INTEGER array, dimension (N)
+*>          INDICES is INTEGER array, dimension (N)
 *>          On entry, the indices of values in X to be sorted.
-*>          On exit, X has been sorted into
-*>          increasing order such that
+*>          On exit, the indices have been sorted such that
 *>          X( INDICES(1) ) <= ... <= X( INDICES(N) )
 *>          or decreasing order such that
 *>          X( INDICES(1) ) >= ... >= X( INDICES(N) )
diff --git a/SRC/slasrti.f b/SRC/slasrti.f
index cddeabc347..462e41192e 100644
--- a/SRC/slasrti.f
+++ b/SRC/slasrti.f
@@ -66,10 +66,9 @@
 *>
 *> \param[in,out] INDICES
 *> \verbatim
-*>          X is INTEGER array, dimension (N)
+*>          INDICES is INTEGER array, dimension (N)
 *>          On entry, the indices of values in X to be sorted.
-*>          On exit, X has been sorted into
-*>          increasing order such that
+*>          On exit, the indices have been sorted such that
 *>          X( INDICES(1) ) <= ... <= X( INDICES(N) )
 *>          or decreasing order such that
 *>          X( INDICES(1) ) >= ... >= X( INDICES(N) )

From 0a61fa43eceaf6500d5073d44e51f3893f48145d Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Sat, 30 Jan 2021 17:26:58 +0000
Subject: [PATCH 079/101] xGGQRCS: fix external functions return value type

---
 SRC/cggqrcs.f | 2 +-
 SRC/zggqrcs.f | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index bc89d91dd1..3253e2d896 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -382,7 +382,7 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      COMPLEX            SLAMCH, CLANGE
+      REAL               SLAMCH, CLANGE
       EXTERNAL           LSAME, SLAMCH, CLANGE
 *     ..
 *     .. External Subroutines ..
diff --git a/SRC/zggqrcs.f b/SRC/zggqrcs.f
index e0704b79c3..61308ec8f1 100644
--- a/SRC/zggqrcs.f
+++ b/SRC/zggqrcs.f
@@ -382,7 +382,7 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      COMPLEX*16         DLAMCH, ZLANGE
+      DOUBLE PRECISION   DLAMCH, ZLANGE
       EXTERNAL           LSAME, DLAMCH, ZLANGE
 *     ..
 *     .. External Subroutines ..

From 58f2a5bb517a02366551c1cdd49a11d0da6ee4dd Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Sat, 30 Jan 2021 19:49:43 +0000
Subject: [PATCH 080/101] CGGQRCS: fix accidental memory allocation

---
 SRC/cggqrcs.f | 66 ++++++++++++++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 32 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index 3253e2d896..397f62b284 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -373,12 +373,11 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
      $                   CZERO = ( 0.0E0, 0.0E0 ) )
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
-      INTEGER            I, J, K, K1, LMAX, Z, LDG, LDX, LDVT, LWKOPT
+      INTEGER            I, J, K, K1, LMAX, IG, IG11, IG21, IG22,
+     $                   IVT, IVT12, LDG, LDX, LDVT, LWKOPT
       REAL               BASE, NAN, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
      $                   THETA, IOTA, W
       COMPLEX            CNAN
-*     .. Local Arrays ..
-      COMPLEX            G( M + P, N ), VT( N, N )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -463,12 +462,15 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
       SWAPPED = .FALSE.
       L = 0
-      LMAX = MIN( M + P, N )
-      Z = ( M + P ) * N
-      G = WORK( 1 )
       LDG = M + P
-      VT = 0
       LDVT = N
+      LMAX = MIN( M + P, N )
+      IG = 1
+      IG11 = 1
+      IG21 = M + 1
+      IG22 = LDG * M + M + 1
+      IVT = LDG * N + 1
+      IVT12 = IVT + LDVT * M
       THETA = NAN
       IOTA = NAN
       W = NAN
@@ -478,22 +480,23 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       IF( INFO.EQ.0 ) THEN
          LWKOPT = 0
 *
-         CALL CGEQP3( M + P, N, G, LDG, IWORK, WORK, WORK, -1, RWORK,
-     $                INFO )
+         CALL CGEQP3( M + P, N, WORK( IG ), LDG, IWORK, WORK, WORK, -1,
+     $                RWORK, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
          LWKOPT = INT( WORK( 1 ) )
 *
-         CALL CUNGQR( M + P, LMAX, LMAX, G, LDG, WORK, WORK, -1, INFO )
+         CALL CUNGQR( M + P, LMAX, LMAX, WORK( IG ), LDG, WORK, WORK,
+     $                -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *
          CALL CUNCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, LMAX,
-     $                    G, LDG, G, LDG,
+     $                    WORK( IG ), LDG, WORK( IG ), LDG,
      $                    ALPHA,
-     $                    U1, LDU1, U2, LDU2, VT, LDVT,
+     $                    U1, LDU1, U2, LDU2, WORK( IVT ), LDVT,
      $                    WORK, -1, RWORK, LRWORK, IWORK, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *        The matrix (A, B) must be stored sequentially for CUNGQR
-         LWKOPT = LWKOPT + Z
+         LWKOPT = LWKOPT + IVT
 *        2-by-1 CSD matrix V1 must be stored
          IF( WANTX ) THEN
             LWKOPT = LWKOPT + LDVT*N
@@ -521,9 +524,7 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          RETURN
       ENDIF
 *     Finish initialization
-      IF( WANTX ) THEN
-         VT = WORK( Z + 1 )
-      ELSE
+      IF( .NOT.WANTX ) THEN
          LDVT = 0
       END IF
 *
@@ -543,8 +544,8 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     Copy matrices A, B into the (M+P) x N matrix G
 *
-      CALL CLACPY( 'A', M, N, A, LDA, G( 1, 1 ), LDG )
-      CALL CLACPY( 'A', P, N, B, LDB, G( M + 1, 1 ), LDG )
+      CALL CLACPY( 'A', M, N, A, LDA, WORK( IG11 ), LDG )
+      CALL CLACPY( 'A', P, N, B, LDB, WORK( IG21 ), LDG )
 *
 *     DEBUG
 *
@@ -570,8 +571,9 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     Compute the QR factorization with column pivoting GΠ = Q1 R1
 *
-      CALL CGEQP3( M + P, N, G, LDG, IWORK, WORK( Z + 1 ),
-     $             WORK( Z + LMAX + 1 ), LWORK - Z - LMAX, RWORK, INFO )
+      CALL CGEQP3( M + P, N, WORK( IG ), LDG, IWORK, WORK( IVT ),
+     $             WORK( IVT + LMAX ), LWORK - IVT - LMAX + 1, RWORK,
+     $             INFO )
       IF( INFO.NE.0 ) THEN
          RETURN
       END IF
@@ -579,7 +581,7 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     Determine the rank of G
 *
       DO I = 1, MIN( M + P, N )
-         IF( ABS( G( I, I ) ).LE.TOL ) THEN
+         IF( ABS( WORK( (I-1) * LDG + I ) ).LE.TOL ) THEN
             EXIT
          END IF
          L = L + 1
@@ -604,11 +606,11 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
       IF( WANTX ) THEN
          IF( L.LE.M ) THEN
-             CALL CLACPY( 'U', L, N, G, LDG, A, LDA )
+             CALL CLACPY( 'U', L, N, WORK( IG ), LDG, A, LDA )
              CALL CLASET( 'L', L - 1, N, CZERO, CZERO, A( 2, 1 ), LDA )
          ELSE
-             CALL CLACPY( 'U', M, N, G, LDG, A, LDA )
-             CALL CLACPY( 'U', L - M, N - M, G( M+1,M+1 ), LDG, B, LDB )
+             CALL CLACPY( 'U', M, N, WORK( IG ), LDG, A, LDA )
+             CALL CLACPY( 'U', L - M, N - M, WORK( IG22 ), LDG, B, LDB )
 *
              CALL CLASET( 'L', M - 1, N, CZERO, CZERO, A( 2, 1 ), LDA )
              CALL CLASET( 'L', L-M-1, N, CZERO, CZERO, B( 2, 1 ), LDB )
@@ -617,8 +619,8 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     Explicitly form Q1 so that we can compute the CS decomposition
 *
-      CALL CUNGQR( M + P, L, L, G, LDG, WORK( Z + 1 ),
-     $             WORK( Z + L + 1 ), LWORK - Z - L, INFO )
+      CALL CUNGQR( M + P, L, L, WORK( IG ), LDG, WORK( IVT ),
+     $             WORK( IVT + L ), LWORK - IVT - L + 1, INFO )
       IF ( INFO.NE.0 ) THEN
          RETURN
       END IF
@@ -633,10 +635,10 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       K = MIN( M, P, L, M + P - L )
       K1 = MAX( L - P, 0 )
       CALL CUNCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, L,
-     $                 G( 1, 1 ), LDG, G( M + 1, 1 ), LDG,
+     $                 WORK( IG11 ), LDG, WORK( IG21 ), LDG,
      $                 ALPHA,
-     $                 U1, LDU1, U2, LDU2, VT, LDVT,
-     $                 WORK( Z + LDVT*N + 1 ), LWORK - Z - LDVT*N,
+     $                 U1, LDU1, U2, LDU2, WORK( IVT ), LDVT,
+     $                 WORK( IVT + LDVT*N ), LWORK - IVT - LDVT*N + 1,
      $                 RWORK, LRWORK,
      $                 IWORK( N + 1 ), INFO )
       IF( INFO.NE.0 ) THEN
@@ -654,14 +656,14 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          LDX = L
          IF ( L.LE.M ) THEN
             CALL CGEMM( 'N', 'N', L, N, L,
-     $                  CONE, VT, LDVT, A, LDA,
+     $                  CONE, WORK( IVT ), LDVT, A, LDA,
      $                  CZERO, WORK( 2 ), LDX )
          ELSE
             CALL CGEMM( 'N', 'N', L, N, M,
-     $                  CONE, VT( 1, 1 ), LDVT, A, LDA,
+     $                  CONE, WORK( IVT ), LDVT, A, LDA,
      $                  CZERO, WORK( 2 ), LDX )
             CALL CGEMM( 'N', 'N', L, N - M, L - M,
-     $                  CONE, VT( 1, M + 1 ), LDVT, B, LDB,
+     $                  CONE, WORK( IVT12 ), LDVT, B, LDB,
      $                  CONE, WORK( L*M + 2 ), LDX )
          END IF
 *        Revert column permutation Π by permuting the columns of X

From dda1d0fc303c3161fcdf27aa3ed9205b04900e9f Mon Sep 17 00:00:00 2001
From: Christoph Conrads <gitlab@christoph-conrads.name>
Date: Sat, 30 Jan 2021 19:50:51 +0000
Subject: [PATCH 081/101] CGGQRCS: fix off-by-one bug

The last element of the matrix X overlapped with the first element of
the matrix V^* because WORK(1) cannot be used by X.
---
 SRC/cggqrcs.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index 397f62b284..6986365e99 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -469,7 +469,7 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       IG11 = 1
       IG21 = M + 1
       IG22 = LDG * M + M + 1
-      IVT = LDG * N + 1
+      IVT = LDG * N + 2
       IVT12 = IVT + LDVT * M
       THETA = NAN
       IOTA = NAN

From 702991a1d79d9c4b526c168bb9074015b7dadcbf Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Fri, 12 Feb 2021 15:17:04 +0000
Subject: [PATCH 082/101] ZGGQRCS: fix accidental memory allocation

This commit is a port of recent changes mades to CGGQRCS.
---
 SRC/zggqrcs.f | 66 ++++++++++++++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 32 deletions(-)

diff --git a/SRC/zggqrcs.f b/SRC/zggqrcs.f
index 61308ec8f1..1790b84ca4 100644
--- a/SRC/zggqrcs.f
+++ b/SRC/zggqrcs.f
@@ -373,12 +373,11 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
      $                   CZERO = ( 0.0D0, 0.0D0 ) )
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
-      INTEGER            I, J, K, K1, LMAX, Z, LDG, LDX, LDVT, LWKOPT
+      INTEGER            I, J, K, K1, LMAX, IG, IG11, IG21, IG22,
+     $                   IVT, IVT12, LDG, LDX, LDVT, LWKOPT
       DOUBLE PRECISION   BASE, NAN, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
      $                   THETA, IOTA, W
       COMPLEX*16         ZNAN
-*     .. Local Arrays ..
-      COMPLEX*16         G( M + P, N ), VT( N, N )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -463,12 +462,15 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
       SWAPPED = .FALSE.
       L = 0
-      LMAX = MIN( M + P, N )
-      Z = ( M + P ) * N
-      G = WORK( 1 )
       LDG = M + P
-      VT = 0
       LDVT = N
+      LMAX = MIN( M + P, N )
+      IG = 1
+      IG11 = 1
+      IG21 = M + 1
+      IG22 = LDG * M + M + 1
+      IVT = LDG * N + 2
+      IVT12 = IVT + LDVT * M
       THETA = NAN
       IOTA = NAN
       W = NAN
@@ -478,22 +480,23 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       IF( INFO.EQ.0 ) THEN
          LWKOPT = 0
 *
-         CALL ZGEQP3( M + P, N, G, LDG, IWORK, WORK, WORK, -1, RWORK,
-     $                INFO )
+         CALL ZGEQP3( M + P, N, WORK( IG ), LDG, IWORK, WORK, WORK, -1,
+     $                RWORK, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
          LWKOPT = INT( WORK( 1 ) )
 *
-         CALL ZUNGQR( M + P, LMAX, LMAX, G, LDG, WORK, WORK, -1, INFO )
+         CALL ZUNGQR( M + P, LMAX, LMAX, WORK( IG ), LDG, WORK, WORK,
+     $                -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *
          CALL ZUNCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, LMAX,
-     $                    G, LDG, G, LDG,
+     $                    WORK( IG ), LDG, WORK( IG ), LDG,
      $                    ALPHA,
-     $                    U1, LDU1, U2, LDU2, VT, LDVT,
+     $                    U1, LDU1, U2, LDU2, WORK( IVT ), LDVT,
      $                    WORK, -1, RWORK, LRWORK, IWORK, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *        The matrix (A, B) must be stored sequentially for ZUNGQR
-         LWKOPT = LWKOPT + Z
+         LWKOPT = LWKOPT + IVT
 *        2-by-1 CSD matrix V1 must be stored
          IF( WANTX ) THEN
             LWKOPT = LWKOPT + LDVT*N
@@ -521,9 +524,7 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          RETURN
       ENDIF
 *     Finish initialization
-      IF( WANTX ) THEN
-         VT = WORK( Z + 1 )
-      ELSE
+      IF( .NOT.WANTX ) THEN
          LDVT = 0
       END IF
 *
@@ -543,8 +544,8 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     Copy matrices A, B into the (M+P) x N matrix G
 *
-      CALL ZLACPY( 'A', M, N, A, LDA, G( 1, 1 ), LDG )
-      CALL ZLACPY( 'A', P, N, B, LDB, G( M + 1, 1 ), LDG )
+      CALL ZLACPY( 'A', M, N, A, LDA, WORK( IG11 ), LDG )
+      CALL ZLACPY( 'A', P, N, B, LDB, WORK( IG21 ), LDG )
 *
 *     DEBUG
 *
@@ -570,8 +571,9 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     Compute the QR factorization with column pivoting GΠ = Q1 R1
 *
-      CALL ZGEQP3( M + P, N, G, LDG, IWORK, WORK( Z + 1 ),
-     $             WORK( Z + LMAX + 1 ), LWORK - Z - LMAX, RWORK, INFO )
+      CALL ZGEQP3( M + P, N, WORK( IG ), LDG, IWORK, WORK( IVT ),
+     $             WORK( IVT + LMAX ), LWORK - IVT - LMAX + 1, RWORK,
+     $             INFO )
       IF( INFO.NE.0 ) THEN
          RETURN
       END IF
@@ -579,7 +581,7 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     Determine the rank of G
 *
       DO I = 1, MIN( M + P, N )
-         IF( ABS( G( I, I ) ).LE.TOL ) THEN
+         IF( ABS( WORK( (I-1) * LDG + I ) ).LE.TOL ) THEN
             EXIT
          END IF
          L = L + 1
@@ -604,11 +606,11 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
       IF( WANTX ) THEN
          IF( L.LE.M ) THEN
-             CALL ZLACPY( 'U', L, N, G, LDG, A, LDA )
+             CALL ZLACPY( 'U', L, N, WORK( IG ), LDG, A, LDA )
              CALL ZLASET( 'L', L - 1, N, CZERO, CZERO, A( 2, 1 ), LDA )
          ELSE
-             CALL ZLACPY( 'U', M, N, G, LDG, A, LDA )
-             CALL ZLACPY( 'U', L - M, N - M, G( M+1,M+1 ), LDG, B, LDB )
+             CALL ZLACPY( 'U', M, N, WORK( IG ), LDG, A, LDA )
+             CALL ZLACPY( 'U', L - M, N - M, WORK( IG22 ), LDG, B, LDB )
 *
              CALL ZLASET( 'L', M - 1, N, CZERO, CZERO, A( 2, 1 ), LDA )
              CALL ZLASET( 'L', L-M-1, N, CZERO, CZERO, B( 2, 1 ), LDB )
@@ -617,8 +619,8 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     Explicitly form Q1 so that we can compute the CS decomposition
 *
-      CALL ZUNGQR( M + P, L, L, G, LDG, WORK( Z + 1 ),
-     $             WORK( Z + L + 1 ), LWORK - Z - L, INFO )
+      CALL ZUNGQR( M + P, L, L, WORK( IG ), LDG, WORK( IVT ),
+     $             WORK( IVT + L ), LWORK - IVT - L + 1, INFO )
       IF ( INFO.NE.0 ) THEN
          RETURN
       END IF
@@ -633,10 +635,10 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       K = MIN( M, P, L, M + P - L )
       K1 = MAX( L - P, 0 )
       CALL ZUNCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, L,
-     $                 G( 1, 1 ), LDG, G( M + 1, 1 ), LDG,
+     $                 WORK( IG11 ), LDG, WORK( IG21 ), LDG,
      $                 ALPHA,
-     $                 U1, LDU1, U2, LDU2, VT, LDVT,
-     $                 WORK( Z + LDVT*N + 1 ), LWORK - Z - LDVT*N,
+     $                 U1, LDU1, U2, LDU2, WORK( IVT ), LDVT,
+     $                 WORK( IVT + LDVT*N ), LWORK - IVT - LDVT*N + 1,
      $                 RWORK, LRWORK,
      $                 IWORK( N + 1 ), INFO )
       IF( INFO.NE.0 ) THEN
@@ -654,14 +656,14 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          LDX = L
          IF ( L.LE.M ) THEN
             CALL ZGEMM( 'N', 'N', L, N, L,
-     $                  CONE, VT, LDVT, A, LDA,
+     $                  CONE, WORK( IVT ), LDVT, A, LDA,
      $                  CZERO, WORK( 2 ), LDX )
          ELSE
             CALL ZGEMM( 'N', 'N', L, N, M,
-     $                  CONE, VT( 1, 1 ), LDVT, A, LDA,
+     $                  CONE, WORK( IVT ), LDVT, A, LDA,
      $                  CZERO, WORK( 2 ), LDX )
             CALL ZGEMM( 'N', 'N', L, N - M, L - M,
-     $                  CONE, VT( 1, M + 1 ), LDVT, B, LDB,
+     $                  CONE, WORK( IVT12 ), LDVT, B, LDB,
      $                  CONE, WORK( L*M + 2 ), LDX )
          END IF
 *        Revert column permutation Π by permuting the columns of X

From 4a6a0f886c102cdcffefd99dcc7c429f8f9a08c7 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Fri, 12 Feb 2021 17:27:21 +0000
Subject: [PATCH 083/101] SGGQRCS: fix documentation typos

---
 SRC/sggqrcs.f | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 27752ddad7..bbf47b6218 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -268,8 +268,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit.
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value.
-*>          > 0:  DBBCSD did not converge. For further details, see
-*>                subroutine DORCSDBY1.
+*>          > 0:  SBBCSD did not converge. For further details, see
+*>                subroutine SORCSDBY1.
 *> \endverbatim
 *
 *> \par Internal Parameters:

From 332b4e6242cf3b50e60bdcda519e0bef6f2c81c7 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Fri, 12 Feb 2021 17:27:32 +0000
Subject: [PATCH 084/101] xGGQRCS: fix accidental memory allocation

---
 SRC/dggqrcs.f | 72 ++++++++++++++++++++++++++-------------------------
 SRC/sggqrcs.f | 72 ++++++++++++++++++++++++++-------------------------
 2 files changed, 74 insertions(+), 70 deletions(-)

diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index d3cb981619..f71fd0c4bb 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -350,11 +350,10 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
-      INTEGER            I, J, K, K1, LMAX, Z, LDG, LDX, LDVT, LWKOPT
+      INTEGER            I, J, K, K1, LMAX, IG, IG11, IG21, IG22,
+     $                   IVT, IVT12, LDG, LDX, LDVT, LWKOPT
       DOUBLE PRECISION   BASE, NAN, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
      $                   THETA, IOTA, W
-*     .. Local Arrays ..
-      DOUBLE PRECISION   G( M + P, N ), VT( N, N )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -438,12 +437,15 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
       SWAPPED = .FALSE.
       L = 0
-      LMAX = MIN( M + P, N )
-      Z = ( M + P ) * N
-      G = WORK( 1 )
       LDG = M + P
-      VT = 0
       LDVT = N
+      LMAX = MIN( M + P, N )
+      IG = 1
+      IG11 = 1
+      IG21 = M + 1
+      IG22 = LDG * M + M + 1
+      IVT = LDG * N + 2
+      IVT12 = IVT + LDVT * M
       THETA = NAN
       IOTA = NAN
       W = NAN
@@ -453,21 +455,23 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       IF( INFO.EQ.0 ) THEN
          LWKOPT = 0
 *
-         CALL DGEQP3( M+P, N, G, LDG, IWORK, ALPHA, WORK, -1, INFO )
+         CALL DGEQP3( M + P, N, WORK( IG ), LDG, IWORK, ALPHA, WORK, -1,
+     $                INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
          LWKOPT = INT( WORK( 1 ) )
 *
-         CALL DORGQR( M + P, LMAX, LMAX, G, LDG, ALPHA, WORK, -1, INFO )
+         CALL DORGQR( M + P, LMAX, LMAX, WORK( IG ), LDG, ALPHA, WORK,
+     $                -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *
          CALL DORCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, LMAX,
-     $                    G, LDG, G, LDG,
+     $                    WORK( IG ), LDG, WORK( IG ), LDG,
      $                    ALPHA,
-     $                    U1, LDU1, U2, LDU2, VT, LDVT,
+     $                    U1, LDU1, U2, LDU2, WORK( IVT ), LDVT,
      $                    WORK, -1, IWORK, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *        The matrix (A, B) must be stored sequentially for DORGQR
-         LWKOPT = LWKOPT + Z
+         LWKOPT = LWKOPT + IVT
 *        2-by-1 CSD matrix V1 must be stored
          IF( WANTX ) THEN
             LWKOPT = LWKOPT + LDVT*N
@@ -484,9 +488,7 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          RETURN
       ENDIF
 *     Finish initialization
-      IF( WANTX ) THEN
-         VT = WORK( Z + 1 )
-      ELSE
+      IF( .NOT.WANTX ) THEN
          LDVT = 0
       END IF
 *
@@ -506,8 +508,8 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     Copy matrices A, B into the (M+P) x N matrix G
 *
-      CALL DLACPY( 'A', M, N, A, LDA, G( 1, 1 ), LDG )
-      CALL DLACPY( 'A', P, N, B, LDB, G( M + 1, 1 ), LDG )
+      CALL DLACPY( 'A', M, N, A, LDA, WORK( IG11 ), LDG )
+      CALL DLACPY( 'A', P, N, B, LDB, WORK( IG21 ), LDG )
 *
 *     DEBUG
 *
@@ -533,8 +535,8 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     Compute the QR factorization with column pivoting GΠ = Q1 R1
 *
-      CALL DGEQP3( M + P, N, G, LDG, IWORK, ALPHA,
-     $             WORK( Z + 1 ), LWORK - Z, INFO )
+      CALL DGEQP3( M + P, N, WORK( IG ), LDG, IWORK, ALPHA,
+     $             WORK( IVT ), LWORK - IVT + 1, INFO )
       IF( INFO.NE.0 ) THEN
          RETURN
       END IF
@@ -542,7 +544,7 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     Determine the rank of G
 *
       DO I = 1, MIN( M + P, N )
-         IF( ABS( G( I, I ) ).LE.TOL ) THEN
+         IF( ABS( WORK( (I-1) * LDG + I ) ).LE.TOL ) THEN
             EXIT
          END IF
          L = L + 1
@@ -566,11 +568,11 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
       IF( WANTX ) THEN
          IF( L.LE.M ) THEN
-             CALL DLACPY( 'U', L, N, G, LDG, A, LDA )
+             CALL DLACPY( 'U', L, N, WORK( IG ), LDG, A, LDA )
              CALL DLASET( 'L', L - 1, N, 0.0D0, 0.0D0, A( 2, 1 ), LDA )
          ELSE
-             CALL DLACPY( 'U', M, N, G, LDG, A, LDA )
-             CALL DLACPY( 'U', L - M, N - M, G( M+1,M+1 ), LDG, B, LDB )
+             CALL DLACPY( 'U', M, N, WORK( IG ), LDG, A, LDA )
+             CALL DLACPY( 'U', L - M, N - M, WORK( IG22 ), LDG, B, LDB )
 *
              CALL DLASET( 'L', M - 1, N, 0.0D0, 0.0D0, A( 2, 1 ), LDA )
              CALL DLASET( 'L', L-M-1, N, 0.0D0, 0.0D0, B( 2, 1 ), LDB )
@@ -579,8 +581,8 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     Explicitly form Q1 so that we can compute the CS decomposition
 *
-      CALL DORGQR( M + P, L, L, G, LDG, ALPHA,
-     $             WORK( Z + 1 ), LWORK - Z, INFO )
+      CALL DORGQR( M + P, L, L, WORK( IG ), LDG, ALPHA,
+     $             WORK( IVT ), LWORK - IVT + 1, INFO )
       IF ( INFO.NE.0 ) THEN
          RETURN
       END IF
@@ -595,10 +597,10 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       K = MIN( M, P, L, M + P - L )
       K1 = MAX( L - P, 0 )
       CALL DORCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, L,
-     $                 G( 1, 1 ), LDG, G( M + 1, 1 ), LDG,
+     $                 WORK( IG11 ), LDG, WORK( IG21 ), LDG,
      $                 ALPHA,
-     $                 U1, LDU1, U2, LDU2, VT, LDVT,
-     $                 WORK( Z + LDVT*N + 1 ), LWORK - Z - LDVT*N,
+     $                 U1, LDU1, U2, LDU2, WORK( IVT ), LDVT,
+     $                 WORK( IVT + LDVT*N ), LWORK - IVT - LDVT*N + 1,
      $                 IWORK( N + 1 ), INFO )
       IF( INFO.NE.0 ) THEN
          RETURN
@@ -614,14 +616,14 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          LDX = L
          IF ( L.LE.M ) THEN
             CALL DGEMM( 'N', 'N', L, N, L,
-     $                  1.0D0, VT, LDVT, A, LDA,
+     $                  1.0D0, WORK( IVT ), LDVT, A, LDA,
      $                  0.0D0, WORK( 2 ), LDX )
          ELSE
             CALL DGEMM( 'N', 'N', L, N, M,
-     $                  1.0D0, VT( 1, 1 ), LDVT, A, LDA,
+     $                  1.0D0, WORK( IVT ), LDVT, A, LDA,
      $                  0.0D0, WORK( 2 ), LDX )
             CALL DGEMM( 'N', 'N', L, N - M, L - M,
-     $                  1.0D0, VT( 1, M + 1 ), LDVT, B, LDB,
+     $                  1.0D0, WORK( IVT12 ), LDVT, B, LDB,
      $                  1.0D0, WORK( L*M + 2 ), LDX )
          END IF
 *        Revert column permutation Π by permuting the columns of X
@@ -640,7 +642,7 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
             ALPHA( I ) = 0.0D0
             BETA( I ) = 1.0D0
             IF( WANTX ) THEN
-               WORK( Z + I + 1 ) = 1.0D0
+               WORK( IVT + I ) = 1.0D0
             END IF
          ELSE
 *           iota comes in the greek alphabet after theta
@@ -651,13 +653,13 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
                ALPHA( I ) =  ( SIN( IOTA ) / TAN( THETA ) ) / W
                BETA( I ) = SIN( IOTA )
                IF( WANTX ) THEN
-                  WORK( Z + I + 1 ) = SIN( THETA ) / SIN( IOTA )
+                  WORK( IVT + I ) = SIN( THETA ) / SIN( IOTA )
                END IF
             ELSE
                ALPHA( I ) = COS( IOTA )
                BETA( I ) = SIN( IOTA )
                IF( WANTX ) THEN
-                  WORK( Z + I + 1 ) = COS( THETA ) / COS( IOTA ) / W
+                  WORK( IVT + I ) = COS( THETA ) / COS( IOTA ) / W
                END IF
             END IF
          END IF
@@ -670,7 +672,7 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
             END DO
             DO I = 1, K
                WORK( LDX*J + I + K1 + 1 ) =
-     $         WORK( LDX*J + I + K1 + 1 ) * WORK( Z + I + 1 )
+     $         WORK( LDX*J + I + K1 + 1 ) * WORK( IVT + I )
             END DO
          END DO
       END IF
diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index bbf47b6218..74f41d06d2 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -350,11 +350,10 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
-      INTEGER            I, J, K, K1, LMAX, Z, LDG, LDX, LDVT, LWKOPT
+      INTEGER            I, J, K, K1, LMAX, IG, IG11, IG21, IG22,
+     $                   IVT, IVT12, LDG, LDX, LDVT, LWKOPT
       REAL               BASE, NAN, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
      $                   THETA, IOTA, W
-*     .. Local Arrays ..
-      REAL               G( M + P, N ), VT( N, N )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -438,12 +437,15 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
       SWAPPED = .FALSE.
       L = 0
-      LMAX = MIN( M + P, N )
-      Z = ( M + P ) * N
-      G = WORK( 1 )
       LDG = M + P
-      VT = 0
       LDVT = N
+      LMAX = MIN( M + P, N )
+      IG = 1
+      IG11 = 1
+      IG21 = M + 1
+      IG22 = LDG * M + M + 1
+      IVT = LDG * N + 2
+      IVT12 = IVT + LDVT * M
       THETA = NAN
       IOTA = NAN
       W = NAN
@@ -453,21 +455,23 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       IF( INFO.EQ.0 ) THEN
          LWKOPT = 0
 *
-         CALL SGEQP3( M+P, N, G, LDG, IWORK, ALPHA, WORK, -1, INFO )
+         CALL SGEQP3( M + P, N, WORK( IG ), LDG, IWORK, ALPHA, WORK, -1,
+     $                INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
          LWKOPT = INT( WORK( 1 ) )
 *
-         CALL SORGQR( M + P, LMAX, LMAX, G, LDG, ALPHA, WORK, -1, INFO )
+         CALL SORGQR( M + P, LMAX, LMAX, WORK( IG ), LDG, ALPHA, WORK,
+     $                -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *
          CALL SORCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, LMAX,
-     $                    G, LDG, G, LDG,
+     $                    WORK( IG ), LDG, WORK( IG ), LDG,
      $                    ALPHA,
-     $                    U1, LDU1, U2, LDU2, VT, LDVT,
+     $                    U1, LDU1, U2, LDU2, WORK( IVT ), LDVT,
      $                    WORK, -1, IWORK, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *        The matrix (A, B) must be stored sequentially for SORGQR
-         LWKOPT = LWKOPT + Z
+         LWKOPT = LWKOPT + IVT
 *        2-by-1 CSD matrix V1 must be stored
          IF( WANTX ) THEN
             LWKOPT = LWKOPT + LDVT*N
@@ -484,9 +488,7 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          RETURN
       ENDIF
 *     Finish initialization
-      IF( WANTX ) THEN
-         VT = WORK( Z + 1 )
-      ELSE
+      IF( .NOT.WANTX ) THEN
          LDVT = 0
       END IF
 *
@@ -506,8 +508,8 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     Copy matrices A, B into the (M+P) x N matrix G
 *
-      CALL SLACPY( 'A', M, N, A, LDA, G( 1, 1 ), LDG )
-      CALL SLACPY( 'A', P, N, B, LDB, G( M + 1, 1 ), LDG )
+      CALL SLACPY( 'A', M, N, A, LDA, WORK( IG11 ), LDG )
+      CALL SLACPY( 'A', P, N, B, LDB, WORK( IG21 ), LDG )
 *
 *     DEBUG
 *
@@ -533,8 +535,8 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     Compute the QR factorization with column pivoting GΠ = Q1 R1
 *
-      CALL SGEQP3( M + P, N, G, LDG, IWORK, ALPHA,
-     $             WORK( Z + 1 ), LWORK - Z, INFO )
+      CALL SGEQP3( M + P, N, WORK( IG ), LDG, IWORK, ALPHA,
+     $             WORK( IVT ), LWORK - IVT + 1, INFO )
       IF( INFO.NE.0 ) THEN
          RETURN
       END IF
@@ -542,7 +544,7 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     Determine the rank of G
 *
       DO I = 1, MIN( M + P, N )
-         IF( ABS( G( I, I ) ).LE.TOL ) THEN
+         IF( ABS( WORK( (I-1) * LDG + I ) ).LE.TOL ) THEN
             EXIT
          END IF
          L = L + 1
@@ -566,11 +568,11 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
       IF( WANTX ) THEN
          IF( L.LE.M ) THEN
-             CALL SLACPY( 'U', L, N, G, LDG, A, LDA )
+             CALL SLACPY( 'U', L, N, WORK( IG ), LDG, A, LDA )
              CALL SLASET( 'L', L - 1, N, 0.0E0, 0.0E0, A( 2, 1 ), LDA )
          ELSE
-             CALL SLACPY( 'U', M, N, G, LDG, A, LDA )
-             CALL SLACPY( 'U', L - M, N - M, G( M+1,M+1 ), LDG, B, LDB )
+             CALL SLACPY( 'U', M, N, WORK( IG ), LDG, A, LDA )
+             CALL SLACPY( 'U', L - M, N - M, WORK( IG22 ), LDG, B, LDB )
 *
              CALL SLASET( 'L', M - 1, N, 0.0E0, 0.0E0, A( 2, 1 ), LDA )
              CALL SLASET( 'L', L-M-1, N, 0.0E0, 0.0E0, B( 2, 1 ), LDB )
@@ -579,8 +581,8 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     Explicitly form Q1 so that we can compute the CS decomposition
 *
-      CALL SORGQR( M + P, L, L, G, LDG, ALPHA,
-     $             WORK( Z + 1 ), LWORK - Z, INFO )
+      CALL SORGQR( M + P, L, L, WORK( IG ), LDG, ALPHA,
+     $             WORK( IVT ), LWORK - IVT + 1, INFO )
       IF ( INFO.NE.0 ) THEN
          RETURN
       END IF
@@ -595,10 +597,10 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       K = MIN( M, P, L, M + P - L )
       K1 = MAX( L - P, 0 )
       CALL SORCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, L,
-     $                 G( 1, 1 ), LDG, G( M + 1, 1 ), LDG,
+     $                 WORK( IG11 ), LDG, WORK( IG21 ), LDG,
      $                 ALPHA,
-     $                 U1, LDU1, U2, LDU2, VT, LDVT,
-     $                 WORK( Z + LDVT*N + 1 ), LWORK - Z - LDVT*N,
+     $                 U1, LDU1, U2, LDU2, WORK( IVT ), LDVT,
+     $                 WORK( IVT + LDVT*N ), LWORK - IVT - LDVT*N + 1,
      $                 IWORK( N + 1 ), INFO )
       IF( INFO.NE.0 ) THEN
          RETURN
@@ -614,14 +616,14 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          LDX = L
          IF ( L.LE.M ) THEN
             CALL SGEMM( 'N', 'N', L, N, L,
-     $                  1.0E0, VT, LDVT, A, LDA,
+     $                  1.0E0, WORK( IVT ), LDVT, A, LDA,
      $                  0.0E0, WORK( 2 ), LDX )
          ELSE
             CALL SGEMM( 'N', 'N', L, N, M,
-     $                  1.0E0, VT( 1, 1 ), LDVT, A, LDA,
+     $                  1.0E0, WORK( IVT ), LDVT, A, LDA,
      $                  0.0E0, WORK( 2 ), LDX )
             CALL SGEMM( 'N', 'N', L, N - M, L - M,
-     $                  1.0E0, VT( 1, M + 1 ), LDVT, B, LDB,
+     $                  1.0E0, WORK( IVT12 ), LDVT, B, LDB,
      $                  1.0E0, WORK( L*M + 2 ), LDX )
          END IF
 *        Revert column permutation Π by permuting the columns of X
@@ -640,7 +642,7 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
             ALPHA( I ) = 0.0E0
             BETA( I ) = 1.0E0
             IF( WANTX ) THEN
-               WORK( Z + I + 1 ) = 1.0E0
+               WORK( IVT + I ) = 1.0E0
             END IF
          ELSE
 *           iota comes in the greek alphabet after theta
@@ -651,13 +653,13 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
                ALPHA( I ) =  ( SIN( IOTA ) / TAN( THETA ) ) / W
                BETA( I ) = SIN( IOTA )
                IF( WANTX ) THEN
-                  WORK( Z + I + 1 ) = SIN( THETA ) / SIN( IOTA )
+                  WORK( IVT + I ) = SIN( THETA ) / SIN( IOTA )
                END IF
             ELSE
                ALPHA( I ) = COS( IOTA )
                BETA( I ) = SIN( IOTA )
                IF( WANTX ) THEN
-                  WORK( Z + I + 1 ) = COS( THETA ) / COS( IOTA ) / W
+                  WORK( IVT + I ) = COS( THETA ) / COS( IOTA ) / W
                END IF
             END IF
          END IF
@@ -670,7 +672,7 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
             END DO
             DO I = 1, K
                WORK( LDX*J + I + K1 + 1 ) =
-     $         WORK( LDX*J + I + K1 + 1 ) * WORK( Z + I + 1 )
+     $         WORK( LDX*J + I + K1 + 1 ) * WORK( IVT + I )
             END DO
          END DO
       END IF

From 2652e1fc6b026ad5bc8ebd41b74ae7ae1856ab02 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Fri, 12 Feb 2021 18:29:50 +0000
Subject: [PATCH 085/101] xGGQRCS: remove dead assignments to LWKOPT

---
 SRC/cggqrcs.f | 2 --
 SRC/dggqrcs.f | 2 --
 SRC/sggqrcs.f | 2 --
 SRC/zggqrcs.f | 2 --
 4 files changed, 8 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index 6986365e99..b2fdb27fba 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -399,7 +399,6 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       WANTU2 = LSAME( JOBU2, 'Y' )
       WANTX = LSAME( JOBX, 'Y' )
       LQUERY = LWORK.EQ.-1 .OR. LRWORK.EQ.-1
-      LWKOPT = 1
 *
 *     Test the input arguments
 *
@@ -483,7 +482,6 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          CALL CGEQP3( M + P, N, WORK( IG ), LDG, IWORK, WORK, WORK, -1,
      $                RWORK, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
-         LWKOPT = INT( WORK( 1 ) )
 *
          CALL CUNGQR( M + P, LMAX, LMAX, WORK( IG ), LDG, WORK, WORK,
      $                -1, INFO )
diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index f71fd0c4bb..27f544e5c2 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -375,7 +375,6 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       WANTU2 = LSAME( JOBU2, 'Y' )
       WANTX = LSAME( JOBX, 'Y' )
       LQUERY = ( LWORK.EQ.-1 )
-      LWKOPT = 1
 *
 *     Test the input arguments
 *
@@ -458,7 +457,6 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          CALL DGEQP3( M + P, N, WORK( IG ), LDG, IWORK, ALPHA, WORK, -1,
      $                INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
-         LWKOPT = INT( WORK( 1 ) )
 *
          CALL DORGQR( M + P, LMAX, LMAX, WORK( IG ), LDG, ALPHA, WORK,
      $                -1, INFO )
diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 74f41d06d2..ea4b746550 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -375,7 +375,6 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       WANTU2 = LSAME( JOBU2, 'Y' )
       WANTX = LSAME( JOBX, 'Y' )
       LQUERY = ( LWORK.EQ.-1 )
-      LWKOPT = 1
 *
 *     Test the input arguments
 *
@@ -458,7 +457,6 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          CALL SGEQP3( M + P, N, WORK( IG ), LDG, IWORK, ALPHA, WORK, -1,
      $                INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
-         LWKOPT = INT( WORK( 1 ) )
 *
          CALL SORGQR( M + P, LMAX, LMAX, WORK( IG ), LDG, ALPHA, WORK,
      $                -1, INFO )
diff --git a/SRC/zggqrcs.f b/SRC/zggqrcs.f
index 1790b84ca4..c7965b347d 100644
--- a/SRC/zggqrcs.f
+++ b/SRC/zggqrcs.f
@@ -399,7 +399,6 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       WANTU2 = LSAME( JOBU2, 'Y' )
       WANTX = LSAME( JOBX, 'Y' )
       LQUERY = LWORK.EQ.-1 .OR. LRWORK.EQ.-1
-      LWKOPT = 1
 *
 *     Test the input arguments
 *
@@ -483,7 +482,6 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          CALL ZGEQP3( M + P, N, WORK( IG ), LDG, IWORK, WORK, WORK, -1,
      $                RWORK, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
-         LWKOPT = INT( WORK( 1 ) )
 *
          CALL ZUNGQR( M + P, LMAX, LMAX, WORK( IG ), LDG, WORK, WORK,
      $                -1, INFO )

From 42f99104c52824dd1b163a46ab3fa8293181b681 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Fri, 12 Feb 2021 19:37:06 +0000
Subject: [PATCH 086/101] xGGQRCS: fix incorrect info values on error

---
 SRC/cggqrcs.f | 10 +++++-----
 SRC/dggqrcs.f | 10 +++++-----
 SRC/sggqrcs.f | 10 +++++-----
 SRC/zggqrcs.f | 10 +++++-----
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index b2fdb27fba..f80f9b604d 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -416,15 +416,15 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       ELSE IF( P.LT.1 ) THEN
          INFO = -6
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
-         INFO = -9
+         INFO = -10
       ELSE IF( LDB.LT.MAX( 1, P ) ) THEN
-         INFO = -11
+         INFO = -12
       ELSE IF( LDU1.LT.1 .OR. ( WANTU1 .AND. LDU1.LT.M ) ) THEN
-         INFO = -15
+         INFO = -16
       ELSE IF( LDU2.LT.1 .OR. ( WANTU2 .AND. LDU2.LT.P ) ) THEN
-         INFO = -17
+         INFO = -18
       ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
-         INFO = -19
+         INFO = -20
       END IF
 *
 *     Make sure A is the matrix smaller in norm
diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 27f544e5c2..49c9341bfe 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -392,15 +392,15 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       ELSE IF( P.LT.1 ) THEN
          INFO = -6
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
-         INFO = -9
+         INFO = -10
       ELSE IF( LDB.LT.MAX( 1, P ) ) THEN
-         INFO = -11
+         INFO = -12
       ELSE IF( LDU1.LT.1 .OR. ( WANTU1 .AND. LDU1.LT.M ) ) THEN
-         INFO = -15
+         INFO = -16
       ELSE IF( LDU2.LT.1 .OR. ( WANTU2 .AND. LDU2.LT.P ) ) THEN
-         INFO = -17
+         INFO = -18
       ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
-         INFO = -19
+         INFO = -20
       END IF
 *
 *     Make sure A is the matrix smaller in norm
diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index ea4b746550..af794e7ec3 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -392,15 +392,15 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       ELSE IF( P.LT.1 ) THEN
          INFO = -6
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
-         INFO = -9
+         INFO = -10
       ELSE IF( LDB.LT.MAX( 1, P ) ) THEN
-         INFO = -11
+         INFO = -12
       ELSE IF( LDU1.LT.1 .OR. ( WANTU1 .AND. LDU1.LT.M ) ) THEN
-         INFO = -15
+         INFO = -16
       ELSE IF( LDU2.LT.1 .OR. ( WANTU2 .AND. LDU2.LT.P ) ) THEN
-         INFO = -17
+         INFO = -18
       ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
-         INFO = -19
+         INFO = -20
       END IF
 *
 *     Make sure A is the matrix smaller in norm
diff --git a/SRC/zggqrcs.f b/SRC/zggqrcs.f
index c7965b347d..e5b909c9f3 100644
--- a/SRC/zggqrcs.f
+++ b/SRC/zggqrcs.f
@@ -416,15 +416,15 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       ELSE IF( P.LT.1 ) THEN
          INFO = -6
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
-         INFO = -9
+         INFO = -10
       ELSE IF( LDB.LT.MAX( 1, P ) ) THEN
-         INFO = -11
+         INFO = -12
       ELSE IF( LDU1.LT.1 .OR. ( WANTU1 .AND. LDU1.LT.M ) ) THEN
-         INFO = -15
+         INFO = -16
       ELSE IF( LDU2.LT.1 .OR. ( WANTU2 .AND. LDU2.LT.P ) ) THEN
-         INFO = -17
+         INFO = -18
       ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
-         INFO = -19
+         INFO = -20
       END IF
 *
 *     Make sure A is the matrix smaller in norm

From f1358caef1aee627ea301e3cb510f355bdc7cc3e Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Fri, 12 Feb 2021 19:57:25 +0000
Subject: [PATCH 087/101] xGGQRCS: check for sufficiently large workspace

---
 SRC/cggqrcs.f | 15 ++++++++++++++-
 SRC/dggqrcs.f | 12 +++++++++++-
 SRC/sggqrcs.f | 12 +++++++++++-
 SRC/zggqrcs.f | 15 ++++++++++++++-
 4 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index f80f9b604d..93180d54a0 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -374,7 +374,7 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
       INTEGER            I, J, K, K1, LMAX, IG, IG11, IG21, IG22,
-     $                   IVT, IVT12, LDG, LDX, LDVT, LWKOPT
+     $                   IVT, IVT12, LDG, LDX, LDVT, LWKMIN, LWKOPT
       REAL               BASE, NAN, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
      $                   THETA, IOTA, W
       COMPLEX            CNAN
@@ -477,14 +477,17 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     Compute workspace
 *
       IF( INFO.EQ.0 ) THEN
+         LWKMIN = 0
          LWKOPT = 0
 *
          CALL CGEQP3( M + P, N, WORK( IG ), LDG, IWORK, WORK, WORK, -1,
      $                RWORK, INFO )
+         LWKMIN = MAX( LWKMIN, N + 1 )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *
          CALL CUNGQR( M + P, LMAX, LMAX, WORK( IG ), LDG, WORK, WORK,
      $                -1, INFO )
+         LWKMIN = MAX( LWKMIN, LMAX )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *
          CALL CUNCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, LMAX,
@@ -492,11 +495,14 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
      $                    ALPHA,
      $                    U1, LDU1, U2, LDU2, WORK( IVT ), LDVT,
      $                    WORK, -1, RWORK, LRWORK, IWORK, INFO )
+         LWKMIN = MAX( LWKMIN, INT( WORK( 1 ) ) )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *        The matrix (A, B) must be stored sequentially for CUNGQR
+         LWKMIN = LWKMIN + IVT
          LWKOPT = LWKOPT + IVT
 *        2-by-1 CSD matrix V1 must be stored
          IF( WANTX ) THEN
+            LWKMIN = LWKMIN + LDVT*N
             LWKOPT = LWKOPT + LDVT*N
          END IF
 *        Adjust CUNCSD2BY1 LRWORK for case with maximum memory
@@ -509,6 +515,13 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
      $                - 8 * MAX( 0, MIN( M, P, N, M+P-N ) )
      $                + 8 * MIN( M, P, N )
          LRWKOPT = MAX( 2*N, LRWORK2BY1 )
+*        Check workspace size
+         IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -20
+         END IF
+         IF( LRWORK.LT.LRWKOPT .AND. .NOT.LQUERY ) THEN
+            INFO = -22
+         END IF
 *
          WORK( 1 ) = CMPLX( REAL( LWKOPT ), 0.0E0 )
          RWORK( 1 ) = REAL( LRWKOPT )
diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 49c9341bfe..d3a28166b8 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -351,7 +351,7 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
       INTEGER            I, J, K, K1, LMAX, IG, IG11, IG21, IG22,
-     $                   IVT, IVT12, LDG, LDX, LDVT, LWKOPT
+     $                   IVT, IVT12, LDG, LDX, LDVT, LWKMIN, LWKOPT
       DOUBLE PRECISION   BASE, NAN, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
      $                   THETA, IOTA, W
 *     ..
@@ -452,14 +452,17 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     Compute workspace
 *
       IF( INFO.EQ.0 ) THEN
+         LWKMIN = 0
          LWKOPT = 0
 *
          CALL DGEQP3( M + P, N, WORK( IG ), LDG, IWORK, ALPHA, WORK, -1,
      $                INFO )
+         LWKMIN = MAX( LWKMIN, 3 * N + 1 )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *
          CALL DORGQR( M + P, LMAX, LMAX, WORK( IG ), LDG, ALPHA, WORK,
      $                -1, INFO )
+         LWKMIN = MAX( LWKMIN, LMAX )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *
          CALL DORCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, LMAX,
@@ -467,13 +470,20 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
      $                    ALPHA,
      $                    U1, LDU1, U2, LDU2, WORK( IVT ), LDVT,
      $                    WORK, -1, IWORK, INFO )
+         LWKMIN = MAX( LWKMIN, INT( WORK( 1 ) ) )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *        The matrix (A, B) must be stored sequentially for DORGQR
+         LWKMIN = LWKMIN + IVT
          LWKOPT = LWKOPT + IVT
 *        2-by-1 CSD matrix V1 must be stored
          IF( WANTX ) THEN
+            LWKMIN = LWKMIN + LDVT*N
             LWKOPT = LWKOPT + LDVT*N
          END IF
+*        Check for minimum workspace size
+         IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -20
+         END IF
 *
          WORK( 1 ) = DBLE( LWKOPT )
       END IF
diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index af794e7ec3..fa9cc60c87 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -351,7 +351,7 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
       INTEGER            I, J, K, K1, LMAX, IG, IG11, IG21, IG22,
-     $                   IVT, IVT12, LDG, LDX, LDVT, LWKOPT
+     $                   IVT, IVT12, LDG, LDX, LDVT, LWKMIN, LWKOPT
       REAL               BASE, NAN, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
      $                   THETA, IOTA, W
 *     ..
@@ -452,14 +452,17 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     Compute workspace
 *
       IF( INFO.EQ.0 ) THEN
+         LWKMIN = 0
          LWKOPT = 0
 *
          CALL SGEQP3( M + P, N, WORK( IG ), LDG, IWORK, ALPHA, WORK, -1,
      $                INFO )
+         LWKMIN = MAX( LWKMIN, 3 * N + 1 )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *
          CALL SORGQR( M + P, LMAX, LMAX, WORK( IG ), LDG, ALPHA, WORK,
      $                -1, INFO )
+         LWKMIN = MAX( LWKMIN, LMAX )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *
          CALL SORCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, LMAX,
@@ -467,13 +470,20 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
      $                    ALPHA,
      $                    U1, LDU1, U2, LDU2, WORK( IVT ), LDVT,
      $                    WORK, -1, IWORK, INFO )
+         LWKMIN = MAX( LWKMIN, INT( WORK( 1 ) ) )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *        The matrix (A, B) must be stored sequentially for SORGQR
+         LWKMIN = LWKMIN + IVT
          LWKOPT = LWKOPT + IVT
 *        2-by-1 CSD matrix V1 must be stored
          IF( WANTX ) THEN
+            LWKMIN = LWKMIN + LDVT*N
             LWKOPT = LWKOPT + LDVT*N
          END IF
+*        Check workspace size
+         IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -20
+         END IF
 *
          WORK( 1 ) = REAL( LWKOPT )
       END IF
diff --git a/SRC/zggqrcs.f b/SRC/zggqrcs.f
index e5b909c9f3..3046893aa3 100644
--- a/SRC/zggqrcs.f
+++ b/SRC/zggqrcs.f
@@ -374,7 +374,7 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     .. Local Scalars ..
       LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
       INTEGER            I, J, K, K1, LMAX, IG, IG11, IG21, IG22,
-     $                   IVT, IVT12, LDG, LDX, LDVT, LWKOPT
+     $                   IVT, IVT12, LDG, LDX, LDVT, LWKMIN, LWKOPT
       DOUBLE PRECISION   BASE, NAN, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
      $                   THETA, IOTA, W
       COMPLEX*16         ZNAN
@@ -477,14 +477,17 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *     Compute workspace
 *
       IF( INFO.EQ.0 ) THEN
+         LWKMIN = 0
          LWKOPT = 0
 *
          CALL ZGEQP3( M + P, N, WORK( IG ), LDG, IWORK, WORK, WORK, -1,
      $                RWORK, INFO )
+         LWKMIN = MAX( LWKMIN, N + 1 )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *
          CALL ZUNGQR( M + P, LMAX, LMAX, WORK( IG ), LDG, WORK, WORK,
      $                -1, INFO )
+         LWKMIN = MAX( LWKMIN, LMAX )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *
          CALL ZUNCSD2BY1( JOBU1, JOBU2, JOBX, M + P, M, LMAX,
@@ -492,11 +495,14 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
      $                    ALPHA,
      $                    U1, LDU1, U2, LDU2, WORK( IVT ), LDVT,
      $                    WORK, -1, RWORK, LRWORK, IWORK, INFO )
+         LWKMIN = MAX( LWKMIN, INT( WORK( 1 ) ) )
          LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
 *        The matrix (A, B) must be stored sequentially for ZUNGQR
+         LWKMIN = LWKMIN + IVT
          LWKOPT = LWKOPT + IVT
 *        2-by-1 CSD matrix V1 must be stored
          IF( WANTX ) THEN
+            LWKMIN = LWKMIN + LDVT*N
             LWKOPT = LWKOPT + LDVT*N
          END IF
 *        Adjust ZUNCSD2BY1 LRWORK for case with maximum memory
@@ -509,6 +515,13 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
      $                - 8 * MAX( 0, MIN( M, P, N, M+P-N ) )
      $                + 8 * MIN( M, P, N )
          LRWKOPT = MAX( 2*N, LRWORK2BY1 )
+*        Check workspace size
+         IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -20
+         END IF
+         IF( LRWORK.LT.LRWKOPT .AND. .NOT.LQUERY ) THEN
+            INFO = -22
+         END IF
 *
          WORK( 1 ) = DCMPLX( DBLE( LWKOPT ), 0.0D0 )
          RWORK( 1 ) = DBLE( LRWKOPT )

From bd8022807fa1f5afbdd2114652c1b0172cf59e3d Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Fri, 12 Feb 2021 19:57:58 +0000
Subject: [PATCH 088/101] ZGGQRCS: fix documentation typos

---
 SRC/zggqrcs.f | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/SRC/zggqrcs.f b/SRC/zggqrcs.f
index 3046893aa3..c30f5640b2 100644
--- a/SRC/zggqrcs.f
+++ b/SRC/zggqrcs.f
@@ -285,8 +285,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit.
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value.
-*>          > 0:  CBBCSD did not converge. For further details, see
-*>                subroutine CUNCSDBY1.
+*>          > 0:  ZBBCSD did not converge. For further details, see
+*>                subroutine ZUNCSDBY1.
 *> \endverbatim
 *
 *> \par Internal Parameters:

From 24fc861b1eba4c5b2d849f91b5f5799fc62ec52d Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Sat, 13 Feb 2021 14:17:59 +0000
Subject: [PATCH 089/101] xGGQRCS: update version number, release date

---
 SRC/cggqrcs.f | 6 +++---
 SRC/dggqrcs.f | 6 +++---
 SRC/sggqrcs.f | 6 +++---
 SRC/zggqrcs.f | 6 +++---
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index 93180d54a0..7a48929357 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -315,7 +315,7 @@
 *
 *> \author Christoph Conrads (https://christoph-conrads.name)
 *
-*> \date October 2019, May 2020
+*> \date October 2019, May 2020, February 2021
 *
 *> \ingroup realGEsing
 *
@@ -345,10 +345,10 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
      $                              WORK, LWORK, RWORK, LRWORK, IWORK,
      $                              INFO )
 *
-*  -- LAPACK driver routine (version 3.7.0) --
+*  -- LAPACK driver routine (version 3.10.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     September 2016
+*     February 2021
 *
       IMPLICIT NONE
 *     .. Scalar Arguments ..
diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index d3a28166b8..5b44daa562 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -298,7 +298,7 @@
 *
 *> \author Christoph Conrads (https://christoph-conrads.name)
 *
-*> \date October 2019, May 2020
+*> \date September 2016, May 2020, February 2021
 *
 *> \ingroup doubleGEsing
 *
@@ -327,10 +327,10 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
      $                              U1, LDU1, U2, LDU2,
      $                              WORK, LWORK, IWORK, INFO )
 *
-*  -- LAPACK driver routine (version 3.7.0) --
+*  -- LAPACK driver routine (version 3.10.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     September 2016
+*     Feburary 2021
 *
       IMPLICIT NONE
 *     .. Scalar Arguments ..
diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index fa9cc60c87..8804546657 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -298,7 +298,7 @@
 *
 *> \author Christoph Conrads (https://christoph-conrads.name)
 *
-*> \date October 2019, May 2020
+*> \date October 2019, May 2020, February 2021
 *
 *> \ingroup realGEsing
 *
@@ -327,10 +327,10 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
      $                              U1, LDU1, U2, LDU2,
      $                              WORK, LWORK, IWORK, INFO )
 *
-*  -- LAPACK driver routine (version 3.7.0) --
+*  -- LAPACK driver routine (version 3.10.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     September 2016
+*     February 2021
 *
       IMPLICIT NONE
 *     .. Scalar Arguments ..
diff --git a/SRC/zggqrcs.f b/SRC/zggqrcs.f
index c30f5640b2..6227418903 100644
--- a/SRC/zggqrcs.f
+++ b/SRC/zggqrcs.f
@@ -315,7 +315,7 @@
 *
 *> \author Christoph Conrads (https://christoph-conrads.name)
 *
-*> \date October 2019, May 2020
+*> \date October 2019, May 2020, February 2021
 *
 *> \ingroup realGEsing
 *
@@ -345,10 +345,10 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
      $                              WORK, LWORK, RWORK, LRWORK, IWORK,
      $                              INFO )
 *
-*  -- LAPACK driver routine (version 3.7.0) --
+*  -- LAPACK driver routine (version 3.10.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     September 2016
+*     February 2021
 *
       IMPLICIT NONE
 *     .. Scalar Arguments ..

From 3f267f113219b52afa75e7a1b76af5e9b58e0da5 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Sat, 13 Feb 2021 14:19:10 +0000
Subject: [PATCH 090/101] xLASRTI: update version number, release date

---
 SRC/dlasrti.f | 4 ++--
 SRC/slasrti.f | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/SRC/dlasrti.f b/SRC/dlasrti.f
index 195cd97980..b8ea95cb0a 100644
--- a/SRC/dlasrti.f
+++ b/SRC/dlasrti.f
@@ -98,10 +98,10 @@
 *  =====================================================================
       SUBROUTINE DLASRTI( ID, N, X, INDICES, INFO )
 *
-*  -- LAPACK computational routine (version TODO) --
+*  -- LAPACK computational routine (version 3.10.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     TODO
+*     February 2021
 *
 *     .. Scalar Arguments ..
       CHARACTER          ID
diff --git a/SRC/slasrti.f b/SRC/slasrti.f
index 462e41192e..1640befae2 100644
--- a/SRC/slasrti.f
+++ b/SRC/slasrti.f
@@ -98,10 +98,10 @@
 *  =====================================================================
       SUBROUTINE SLASRTI( ID, N, X, INDICES, INFO )
 *
-*  -- LAPACK computational routine (version TODO) --
+*  -- LAPACK computational routine (version 3.10.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     TODO
+*     February 2021
 *
 *     .. Scalar Arguments ..
       CHARACTER          ID

From 305afaf5e001f3bada1a8bf5de62d4668aaa9072 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Sat, 13 Feb 2021 14:20:13 +0000
Subject: [PATCH 091/101] xLASRTR: update version number, release date

---
 SRC/clasrtr.f | 4 ++--
 SRC/dlasrtr.f | 4 ++--
 SRC/slasrtr.f | 4 ++--
 SRC/zlasrtr.f | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/SRC/clasrtr.f b/SRC/clasrtr.f
index 5a2db59462..3c714061c1 100644
--- a/SRC/clasrtr.f
+++ b/SRC/clasrtr.f
@@ -107,10 +107,10 @@
 *  =====================================================================
       SUBROUTINE CLASRTR( ID, M, N, A, LDA, IPVT, RWORK, INFO )
 *
-*  -- LAPACK computational routine (version TODO) --
+*  -- LAPACK computational routine (version 3.10.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     TODO
+*     February 2021
 *
 *     .. Scalar Arguments ..
       CHARACTER          ID
diff --git a/SRC/dlasrtr.f b/SRC/dlasrtr.f
index 308c97176d..d5df943a28 100644
--- a/SRC/dlasrtr.f
+++ b/SRC/dlasrtr.f
@@ -106,10 +106,10 @@
 *  =====================================================================
       SUBROUTINE DLASRTR( ID, M, N, A, LDA, IPVT, WORK, INFO )
 *
-*  -- LAPACK computational routine (version TODO) --
+*  -- LAPACK computational routine (version 3.10.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     TODO
+*     February 2021
 *
 *     .. Scalar Arguments ..
       CHARACTER          ID
diff --git a/SRC/slasrtr.f b/SRC/slasrtr.f
index b32b7d967a..a969ecda07 100644
--- a/SRC/slasrtr.f
+++ b/SRC/slasrtr.f
@@ -106,10 +106,10 @@
 *  =====================================================================
       SUBROUTINE SLASRTR( ID, M, N, A, LDA, IPVT, WORK, INFO )
 *
-*  -- LAPACK computational routine (version TODO) --
+*  -- LAPACK computational routine (version 3.10.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     TODO
+*     February 2021
 *
 *     .. Scalar Arguments ..
       CHARACTER          ID
diff --git a/SRC/zlasrtr.f b/SRC/zlasrtr.f
index 22da41cf68..e9555d1170 100644
--- a/SRC/zlasrtr.f
+++ b/SRC/zlasrtr.f
@@ -107,10 +107,10 @@
 *  =====================================================================
       SUBROUTINE ZLASRTR( ID, M, N, A, LDA, IPVT, RWORK, INFO )
 *
-*  -- LAPACK computational routine (version TODO) --
+*  -- LAPACK computational routine (version 3.10.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     TODO
+*     February 2021
 *
 *     .. Scalar Arguments ..
       CHARACTER          ID

From 224c52728deac3f785de5542a6ce0310f362a310 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Sat, 13 Feb 2021 15:15:15 +0000
Subject: [PATCH 092/101] xGGQRCS: add Fortran files to Makefile

---
 SRC/Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/SRC/Makefile b/SRC/Makefile
index 527fb086db..ba1050ab7e 100644
--- a/SRC/Makefile
+++ b/SRC/Makefile
@@ -124,7 +124,7 @@ SLASRC = \
    sgetc2.o sgetf2.o sgetri.o \
    sggbak.o sggbal.o sgges.o  sgges3.o sggesx.o \
    sggev.o  sggev3.o sggevx.o \
-   sggglm.o sgghrd.o sgghd3.o sgglse.o sggqrf.o \
+   sggglm.o sgghrd.o sgghd3.o sgglse.o sggqrcs.o sggqrf.o \
    sggrqf.o sggsvd3.o sggsvp3.o sgtcon.o sgtrfs.o sgtsv.o \
    sgtsvx.o sgttrf.o sgttrs.o sgtts2.o shgeqz.o \
    slaqz0.o slaqz1.o slaqz2.o slaqz3.o slaqz4.o \
@@ -218,7 +218,7 @@ CLASRC = \
    cgesvx.o cgetc2.o cgetf2.o cgetri.o \
    cggbak.o cggbal.o cgges.o  cgges3.o cggesx.o \
    cggev.o  cggev3.o cggevx.o cggglm.o \
-   cgghrd.o cgghd3.o cgglse.o cggqrf.o cggrqf.o \
+   cgghrd.o cgghd3.o cgglse.o cggqrcs.o cggqrf.o cggrqf.o \
    cggsvd3.o cggsvp3.o \
    cgtcon.o cgtrfs.o cgtsv.o  cgtsvx.o cgttrf.o cgttrs.o cgtts2.o chbev.o \
    chbevd.o chbevx.o chbgst.o chbgv.o  chbgvd.o chbgvx.o chbtrd.o \
@@ -326,7 +326,7 @@ DLASRC = \
    dgetc2.o dgetf2.o dgetrf.o dgetri.o \
    dgetrs.o dggbak.o dggbal.o dgges.o  dgges3.o dggesx.o \
    dggev.o  dggev3.o dggevx.o \
-   dggglm.o dgghrd.o dgghd3.o dgglse.o dggqrf.o \
+   dggglm.o dgghrd.o dgghd3.o dgglse.o dggqrcs.o dggqrf.o \
    dggrqf.o dggsvd3.o dggsvp3.o dgtcon.o dgtrfs.o dgtsv.o \
    dgtsvx.o dgttrf.o dgttrs.o dgtts2.o dhgeqz.o \
    dlaqz0.o dlaqz1.o dlaqz2.o dlaqz3.o dlaqz4.o \
@@ -420,7 +420,7 @@ ZLASRC = \
    zgetri.o zgetrs.o \
    zggbak.o zggbal.o zgges.o  zgges3.o zggesx.o \
    zggev.o  zggev3.o zggevx.o zggglm.o \
-   zgghrd.o zgghd3.o zgglse.o zggqrf.o zggrqf.o \
+   zgghrd.o zgghd3.o zgglse.o zggqrcs.o zggqrf.o zggrqf.o \
    zggsvd3.o zggsvp3.o \
    zgtcon.o zgtrfs.o zgtsv.o  zgtsvx.o zgttrf.o zgttrs.o zgtts2.o zhbev.o \
    zhbevd.o zhbevx.o zhbgst.o zhbgv.o  zhbgvd.o zhbgvx.o zhbtrd.o \

From 0298da077ac9f0f3882cc9219a947a1693c96bf5 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Sat, 13 Feb 2021 16:38:10 +0000
Subject: [PATCH 093/101] xGGRCS: documentation improvements

* fix typos
* remove spurious direction value for an internal variable
* fix formatting of internal variables section
---
 SRC/cggqrcs.f | 7 ++-----
 SRC/dggqrcs.f | 5 +----
 SRC/sggqrcs.f | 7 ++-----
 SRC/zggqrcs.f | 7 ++-----
 4 files changed, 7 insertions(+), 19 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index 7a48929357..ea5f84618f 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -292,15 +292,12 @@
 *> \par Internal Parameters:
 *  =========================
 *>
-*> \param[out] W
 *> \verbatim
-*>          W is REAL
+*>  W       REAL
 *>          W is a radix power chosen such that the Frobenius norm of A
 *>          and W*B are with SQRT(RADIX) and 1/SQRT(RADIX) of each
 *>          other.
-*> \endverbatim
 *>
-*> \verbatim
 *>  TOL     REAL
 *>          Let G = (A**H,B**H)**H. TOL is the threshold to determine
 *>          the effective rank of G. Generally, it is set to
@@ -328,7 +325,7 @@
 *> \par Further Details:
 *  =====================
 *>
-*>  CGGQRCS should be significantly faster than DGGSVD3 for large
+*>  CGGQRCS should be significantly faster than CGGSVD3 for large
 *>  matrices because the matrices A and B are reduced to a pair of
 *>  well-conditioned bidiagonal matrices instead of pairs of upper
 *>  triangular matrices. On the downside, CGGQRCS requires a much larger
diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 5b44daa562..1d8f764516 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -275,15 +275,12 @@
 *> \par Internal Parameters:
 *  =========================
 *>
-*> \param[out] W
 *> \verbatim
-*>          W is DOUBLE PRECISION
+*>  W       DOUBLE PRECISION
 *>          W is a radix power chosen such that the Frobenius norm of A
 *>          and W*B are with SQRT(RADIX) and 1/SQRT(RADIX) of each
 *>          other.
-*> \endverbatim
 *>
-*> \verbatim
 *>  TOL     DOUBLE PRECISION
 *>          Let G = (A**T,B**T)**T. TOL is the threshold to determine
 *>          the effective rank of G. Generally, it is set to
diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 8804546657..bf7725e5cd 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -275,15 +275,12 @@
 *> \par Internal Parameters:
 *  =========================
 *>
-*> \param[out] W
 *> \verbatim
-*>          W is REAL
+*>  W       REAL
 *>          W is a radix power chosen such that the Frobenius norm of A
 *>          and W*B are with SQRT(RADIX) and 1/SQRT(RADIX) of each
 *>          other.
-*> \endverbatim
 *>
-*> \verbatim
 *>  TOL     REAL
 *>          Let G = (A**T,B**T)**T. TOL is the threshold to determine
 *>          the effective rank of G. Generally, it is set to
@@ -311,7 +308,7 @@
 *> \par Further Details:
 *  =====================
 *>
-*>  SGGQRCS should be significantly faster than DGGSVD3 for large
+*>  SGGQRCS should be significantly faster than SGGSVD3 for large
 *>  matrices because the matrices A and B are reduced to a pair of
 *>  well-conditioned bidiagonal matrices instead of pairs of upper
 *>  triangular matrices. On the downside, SGGQRCS requires a much larger
diff --git a/SRC/zggqrcs.f b/SRC/zggqrcs.f
index 6227418903..faaa2f28ab 100644
--- a/SRC/zggqrcs.f
+++ b/SRC/zggqrcs.f
@@ -292,15 +292,12 @@
 *> \par Internal Parameters:
 *  =========================
 *>
-*> \param[out] W
 *> \verbatim
-*>          W is DOUBLE PRECISION
+*>  W       DOUBLE PRECISION
 *>          W is a radix power chosen such that the Frobenius norm of A
 *>          and W*B are with SQRT(RADIX) and 1/SQRT(RADIX) of each
 *>          other.
-*> \endverbatim
 *>
-*> \verbatim
 *>  TOL     DOUBLE PRECISION
 *>          Let G = (A**H,B**H)**H. TOL is the threshold to determine
 *>          the effective rank of G. Generally, it is set to
@@ -328,7 +325,7 @@
 *> \par Further Details:
 *  =====================
 *>
-*>  ZGGQRCS should be significantly faster than DGGSVD3 for large
+*>  ZGGQRCS should be significantly faster than ZGGSVD3 for large
 *>  matrices because the matrices A and B are reduced to a pair of
 *>  well-conditioned bidiagonal matrices instead of pairs of upper
 *>  triangular matrices. On the downside, ZGGQRCS requires a much larger

From 3cb8736f9d8fdb47f067f609d8f695fccdf05aee Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Sat, 13 Feb 2021 18:23:48 +0000
Subject: [PATCH 094/101] xGGQRCS: remove debugging code

---
 SRC/cggqrcs.f | 28 ++++------------------------
 SRC/dggqrcs.f | 26 ++++----------------------
 SRC/sggqrcs.f | 26 ++++----------------------
 SRC/zggqrcs.f | 29 ++++-------------------------
 4 files changed, 16 insertions(+), 93 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index ea5f84618f..b79a787f23 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -372,9 +372,8 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
       INTEGER            I, J, K, K1, LMAX, IG, IG11, IG21, IG22,
      $                   IVT, IVT12, LDG, LDX, LDVT, LWKMIN, LWKOPT
-      REAL               BASE, NAN, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
+      REAL               BASE, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
      $                   THETA, IOTA, W
-      COMPLEX            CNAN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -451,10 +450,6 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
 *     Initialize variables
 *
-*     Computing 0.0 / 0.0 directly causes compiler errors
-      NAN = 1.0E0
-      NAN = 0.0 / (NAN - 1.0E0)
-      CNAN = CMPLX( NAN, NAN )
 *
       SWAPPED = .FALSE.
       L = 0
@@ -467,9 +462,9 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       IG22 = LDG * M + M + 1
       IVT = LDG * N + 2
       IVT12 = IVT + LDVT * M
-      THETA = NAN
-      IOTA = NAN
-      W = NAN
+      THETA = -1
+      IOTA = -1
+      W = -1
 *
 *     Compute workspace
 *
@@ -555,11 +550,6 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       CALL CLACPY( 'A', M, N, A, LDA, WORK( IG11 ), LDG )
       CALL CLACPY( 'A', P, N, B, LDB, WORK( IG21 ), LDG )
 *
-*     DEBUG
-*
-      CALL CLASET( 'A', M, N, CNAN, CNAN, A, LDA )
-      CALL CLASET( 'A', P, N, CNAN, CNAN, B, LDB )
-*
 *     Compute the Frobenius norm of matrix G
 *
       NORMG = NORMB * SQRT( 1.0E0 + ( ( W * NORMA ) / NORMB )**2 )
@@ -633,11 +623,6 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          RETURN
       END IF
 *
-*     DEBUG
-*
-      ALPHA( 1:N ) = CNAN
-      BETA( 1:N ) = CNAN
-*
 *     Compute the CS decomposition of Q1( :, 1:L )
 *
       K = MIN( M, P, L, M + P - L )
@@ -653,11 +638,6 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          RETURN
       END IF
 *
-*     DEBUG
-*
-      WORK( 1:LDG*N ) = CNAN
-      RWORK( 1:2*N ) = NAN
-*
 *     Compute X = V^T R1( 1:L, : ) and adjust for matrix scaling
 *
       IF( WANTX ) THEN
diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 1d8f764516..6424c8e8f9 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -349,7 +349,7 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
       INTEGER            I, J, K, K1, LMAX, IG, IG11, IG21, IG22,
      $                   IVT, IVT12, LDG, LDX, LDVT, LWKMIN, LWKOPT
-      DOUBLE PRECISION   BASE, NAN, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
+      DOUBLE PRECISION   BASE, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
      $                   THETA, IOTA, W
 *     ..
 *     .. External Functions ..
@@ -426,10 +426,6 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       END IF
 *
 *     Initialize variables
-*
-*     Computing 0.0 / 0.0 directly causes compiler errors
-      NAN = 1.0D0
-      NAN = 0.0 / (NAN - 1.0D0)
 *
       SWAPPED = .FALSE.
       L = 0
@@ -442,9 +438,9 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       IG22 = LDG * M + M + 1
       IVT = LDG * N + 2
       IVT12 = IVT + LDVT * M
-      THETA = NAN
-      IOTA = NAN
-      W = NAN
+      THETA = -1
+      IOTA = -1
+      W = -1
 *
 *     Compute workspace
 *
@@ -516,11 +512,6 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       CALL DLACPY( 'A', M, N, A, LDA, WORK( IG11 ), LDG )
       CALL DLACPY( 'A', P, N, B, LDB, WORK( IG21 ), LDG )
 *
-*     DEBUG
-*
-      CALL DLASET( 'A', M, N, NAN, NAN, A, LDA )
-      CALL DLASET( 'A', P, N, NAN, NAN, B, LDB )
-*
 *     Compute the Frobenius norm of matrix G
 *
       NORMG = NORMB * SQRT( 1.0D0 + ( ( W * NORMA ) / NORMB )**2 )
@@ -592,11 +583,6 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          RETURN
       END IF
 *
-*     DEBUG
-*
-      ALPHA( 1:N ) = NAN
-      BETA( 1:N ) = NAN
-*
 *     Compute the CS decomposition of Q1( :, 1:L )
 *
       K = MIN( M, P, L, M + P - L )
@@ -611,10 +597,6 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          RETURN
       END IF
 *
-*     DEBUG
-*
-      WORK( 1:LDG*N ) = NAN
-*
 *     Compute X = V^T R1( 1:L, : ) and adjust for matrix scaling
 *
       IF( WANTX ) THEN
diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index bf7725e5cd..6f7a863a0e 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -349,7 +349,7 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
       INTEGER            I, J, K, K1, LMAX, IG, IG11, IG21, IG22,
      $                   IVT, IVT12, LDG, LDX, LDVT, LWKMIN, LWKOPT
-      REAL               BASE, NAN, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
+      REAL               BASE, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
      $                   THETA, IOTA, W
 *     ..
 *     .. External Functions ..
@@ -426,10 +426,6 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       END IF
 *
 *     Initialize variables
-*
-*     Computing 0.0 / 0.0 directly causes compiler errors
-      NAN = 1.0E0
-      NAN = 0.0 / (NAN - 1.0E0)
 *
       SWAPPED = .FALSE.
       L = 0
@@ -442,9 +438,9 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       IG22 = LDG * M + M + 1
       IVT = LDG * N + 2
       IVT12 = IVT + LDVT * M
-      THETA = NAN
-      IOTA = NAN
-      W = NAN
+      THETA = -1
+      IOTA = -1
+      W = -1
 *
 *     Compute workspace
 *
@@ -516,11 +512,6 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       CALL SLACPY( 'A', M, N, A, LDA, WORK( IG11 ), LDG )
       CALL SLACPY( 'A', P, N, B, LDB, WORK( IG21 ), LDG )
 *
-*     DEBUG
-*
-      CALL SLASET( 'A', M, N, NAN, NAN, A, LDA )
-      CALL SLASET( 'A', P, N, NAN, NAN, B, LDB )
-*
 *     Compute the Frobenius norm of matrix G
 *
       NORMG = NORMB * SQRT( 1.0E0 + ( ( W * NORMA ) / NORMB )**2 )
@@ -592,11 +583,6 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          RETURN
       END IF
 *
-*     DEBUG
-*
-      ALPHA( 1:N ) = NAN
-      BETA( 1:N ) = NAN
-*
 *     Compute the CS decomposition of Q1( :, 1:L )
 *
       K = MIN( M, P, L, M + P - L )
@@ -611,10 +597,6 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          RETURN
       END IF
 *
-*     DEBUG
-*
-      WORK( 1:LDG*N ) = NAN
-*
 *     Compute X = V^T R1( 1:L, : ) and adjust for matrix scaling
 *
       IF( WANTX ) THEN
diff --git a/SRC/zggqrcs.f b/SRC/zggqrcs.f
index faaa2f28ab..e672e08b0c 100644
--- a/SRC/zggqrcs.f
+++ b/SRC/zggqrcs.f
@@ -372,9 +372,8 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       LOGICAL            WANTU1, WANTU2, WANTX, LQUERY
       INTEGER            I, J, K, K1, LMAX, IG, IG11, IG21, IG22,
      $                   IVT, IVT12, LDG, LDX, LDVT, LWKMIN, LWKOPT
-      DOUBLE PRECISION   BASE, NAN, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
+      DOUBLE PRECISION   BASE, NORMA, NORMB, NORMG, TOL, ULP, UNFL,
      $                   THETA, IOTA, W
-      COMPLEX*16         ZNAN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -450,11 +449,6 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       END IF
 *
 *     Initialize variables
-*
-*     Computing 0.0 / 0.0 directly causes compiler errors
-      NAN = 1.0D0
-      NAN = 0.0 / (NAN - 1.0D0)
-      ZNAN = DCMPLX( NAN, NAN )
 *
       SWAPPED = .FALSE.
       L = 0
@@ -467,9 +461,9 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       IG22 = LDG * M + M + 1
       IVT = LDG * N + 2
       IVT12 = IVT + LDVT * M
-      THETA = NAN
-      IOTA = NAN
-      W = NAN
+      THETA = -1
+      IOTA = -1
+      W = -1
 *
 *     Compute workspace
 *
@@ -555,11 +549,6 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
       CALL ZLACPY( 'A', M, N, A, LDA, WORK( IG11 ), LDG )
       CALL ZLACPY( 'A', P, N, B, LDB, WORK( IG21 ), LDG )
 *
-*     DEBUG
-*
-      CALL ZLASET( 'A', M, N, ZNAN, ZNAN, A, LDA )
-      CALL ZLASET( 'A', P, N, ZNAN, ZNAN, B, LDB )
-*
 *     Compute the Frobenius norm of matrix G
 *
       NORMG = NORMB * SQRT( 1.0D0 + ( ( W * NORMA ) / NORMB )**2 )
@@ -633,11 +622,6 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          RETURN
       END IF
 *
-*     DEBUG
-*
-      ALPHA( 1:N ) = ZNAN
-      BETA( 1:N ) = ZNAN
-*
 *     Compute the CS decomposition of Q1( :, 1:L )
 *
       K = MIN( M, P, L, M + P - L )
@@ -653,11 +637,6 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          RETURN
       END IF
 *
-*     DEBUG
-*
-      WORK( 1:LDG*N ) = ZNAN
-      RWORK( 1:2*N ) = NAN
-*
 *     Compute X = V^T R1( 1:L, : ) and adjust for matrix scaling
 *
       IF( WANTX ) THEN

From 5b48881409bb938ceff0e03c87878ee380451be8 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Tue, 20 Apr 2021 16:04:39 +0000
Subject: [PATCH 095/101] xGGQRCS: remove version and date information

---
 SRC/cggqrcs.f | 5 +----
 SRC/dggqrcs.f | 5 +----
 SRC/sggqrcs.f | 5 +----
 SRC/zggqrcs.f | 5 +----
 4 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index b79a787f23..6cd851e56b 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -312,8 +312,6 @@
 *
 *> \author Christoph Conrads (https://christoph-conrads.name)
 *
-*> \date October 2019, May 2020, February 2021
-*
 *> \ingroup realGEsing
 *
 *> \par Contributors:
@@ -342,10 +340,9 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
      $                              WORK, LWORK, RWORK, LRWORK, IWORK,
      $                              INFO )
 *
-*  -- LAPACK driver routine (version 3.10.0) --
+*  -- LAPACK driver routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     February 2021
 *
       IMPLICIT NONE
 *     .. Scalar Arguments ..
diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 6424c8e8f9..fb15907e91 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -295,8 +295,6 @@
 *
 *> \author Christoph Conrads (https://christoph-conrads.name)
 *
-*> \date September 2016, May 2020, February 2021
-*
 *> \ingroup doubleGEsing
 *
 *> \par Contributors:
@@ -324,10 +322,9 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
      $                              U1, LDU1, U2, LDU2,
      $                              WORK, LWORK, IWORK, INFO )
 *
-*  -- LAPACK driver routine (version 3.10.0) --
+*  -- LAPACK driver routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     Feburary 2021
 *
       IMPLICIT NONE
 *     .. Scalar Arguments ..
diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 6f7a863a0e..eff4cd089c 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -295,8 +295,6 @@
 *
 *> \author Christoph Conrads (https://christoph-conrads.name)
 *
-*> \date October 2019, May 2020, February 2021
-*
 *> \ingroup realGEsing
 *
 *> \par Contributors:
@@ -324,10 +322,9 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
      $                              U1, LDU1, U2, LDU2,
      $                              WORK, LWORK, IWORK, INFO )
 *
-*  -- LAPACK driver routine (version 3.10.0) --
+*  -- LAPACK driver routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     February 2021
 *
       IMPLICIT NONE
 *     .. Scalar Arguments ..
diff --git a/SRC/zggqrcs.f b/SRC/zggqrcs.f
index e672e08b0c..00f94c9040 100644
--- a/SRC/zggqrcs.f
+++ b/SRC/zggqrcs.f
@@ -312,8 +312,6 @@
 *
 *> \author Christoph Conrads (https://christoph-conrads.name)
 *
-*> \date October 2019, May 2020, February 2021
-*
 *> \ingroup realGEsing
 *
 *> \par Contributors:
@@ -342,10 +340,9 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
      $                              WORK, LWORK, RWORK, LRWORK, IWORK,
      $                              INFO )
 *
-*  -- LAPACK driver routine (version 3.10.0) --
+*  -- LAPACK driver routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     February 2021
 *
       IMPLICIT NONE
 *     .. Scalar Arguments ..

From 9f23fbdacf96de94796c753d41813f4cef54316e Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Tue, 20 Apr 2021 16:29:40 +0000
Subject: [PATCH 096/101] xLASRTR: remove version and date information

---
 SRC/clasrtr.f | 5 +----
 SRC/dlasrtr.f | 5 +----
 SRC/slasrtr.f | 5 +----
 SRC/zlasrtr.f | 5 +----
 4 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/SRC/clasrtr.f b/SRC/clasrtr.f
index 3c714061c1..b2450965d9 100644
--- a/SRC/clasrtr.f
+++ b/SRC/clasrtr.f
@@ -100,17 +100,14 @@
 *
 *> \author Christoph Conrads (https://christoph-conrads.name)
 *
-*> \date April 2020
-*
 *> \ingroup auxOTHERcomputational
 *
 *  =====================================================================
       SUBROUTINE CLASRTR( ID, M, N, A, LDA, IPVT, RWORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.10.0) --
+*  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     February 2021
 *
 *     .. Scalar Arguments ..
       CHARACTER          ID
diff --git a/SRC/dlasrtr.f b/SRC/dlasrtr.f
index d5df943a28..91b1bf91f8 100644
--- a/SRC/dlasrtr.f
+++ b/SRC/dlasrtr.f
@@ -99,17 +99,14 @@
 *
 *> \author Christoph Conrads (https://christoph-conrads.name)
 *
-*> \date April 2020
-*
 *> \ingroup auxOTHERcomputational
 *
 *  =====================================================================
       SUBROUTINE DLASRTR( ID, M, N, A, LDA, IPVT, WORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.10.0) --
+*  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     February 2021
 *
 *     .. Scalar Arguments ..
       CHARACTER          ID
diff --git a/SRC/slasrtr.f b/SRC/slasrtr.f
index a969ecda07..9896f5134c 100644
--- a/SRC/slasrtr.f
+++ b/SRC/slasrtr.f
@@ -99,17 +99,14 @@
 *
 *> \author Christoph Conrads (https://christoph-conrads.name)
 *
-*> \date April 2020
-*
 *> \ingroup auxOTHERcomputational
 *
 *  =====================================================================
       SUBROUTINE SLASRTR( ID, M, N, A, LDA, IPVT, WORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.10.0) --
+*  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     February 2021
 *
 *     .. Scalar Arguments ..
       CHARACTER          ID
diff --git a/SRC/zlasrtr.f b/SRC/zlasrtr.f
index e9555d1170..22a7f3b021 100644
--- a/SRC/zlasrtr.f
+++ b/SRC/zlasrtr.f
@@ -100,17 +100,14 @@
 *
 *> \author Christoph Conrads (https://christoph-conrads.name)
 *
-*> \date April 2020
-*
 *> \ingroup auxOTHERcomputational
 *
 *  =====================================================================
       SUBROUTINE ZLASRTR( ID, M, N, A, LDA, IPVT, RWORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.10.0) --
+*  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     February 2021
 *
 *     .. Scalar Arguments ..
       CHARACTER          ID

From 2a8c4dd9f696ecba3839274e79a11d7db29cdc73 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Tue, 20 Apr 2021 16:30:42 +0000
Subject: [PATCH 097/101] xLASRTI: remove version and date information

---
 SRC/dlasrti.f | 5 +----
 SRC/slasrti.f | 5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/SRC/dlasrti.f b/SRC/dlasrti.f
index b8ea95cb0a..23e8299b8a 100644
--- a/SRC/dlasrti.f
+++ b/SRC/dlasrti.f
@@ -91,17 +91,14 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date April 2020
-*
 *> \ingroup auxOTHERcomputational
 *
 *  =====================================================================
       SUBROUTINE DLASRTI( ID, N, X, INDICES, INFO )
 *
-*  -- LAPACK computational routine (version 3.10.0) --
+*  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     February 2021
 *
 *     .. Scalar Arguments ..
       CHARACTER          ID
diff --git a/SRC/slasrti.f b/SRC/slasrti.f
index 1640befae2..36bf402680 100644
--- a/SRC/slasrti.f
+++ b/SRC/slasrti.f
@@ -91,17 +91,14 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date April 2020
-*
 *> \ingroup auxOTHERcomputational
 *
 *  =====================================================================
       SUBROUTINE SLASRTI( ID, N, X, INDICES, INFO )
 *
-*  -- LAPACK computational routine (version 3.10.0) --
+*  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     February 2021
 *
 *     .. Scalar Arguments ..
       CHARACTER          ID

From 48d8488ef5be018d763ef0ac2e69b7b6e4acbde6 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Tue, 20 Apr 2021 18:34:31 +0000
Subject: [PATCH 098/101] xERRGG: test xGGQRCS input handling

---
 TESTING/EIG/cerrgg.f | 69 ++++++++++++++++++++++++++++++++++++++++++--
 TESTING/EIG/derrgg.f | 53 ++++++++++++++++++++++++++++++++--
 TESTING/EIG/serrgg.f | 54 ++++++++++++++++++++++++++++++++--
 TESTING/EIG/zerrgg.f | 69 ++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 237 insertions(+), 8 deletions(-)

diff --git a/TESTING/EIG/cerrgg.f b/TESTING/EIG/cerrgg.f
index 3839b5f2c3..0a4ca7248a 100644
--- a/TESTING/EIG/cerrgg.f
+++ b/TESTING/EIG/cerrgg.f
@@ -22,7 +22,7 @@
 *> \verbatim
 *>
 *> CERRGG tests the error exits for CGGES, CGGESX, CGGEV, CGGEVX,
-*> CGGES3, CGGEV3, CGGGLM, CGGHRD, CGGLSE, CGGQRF, CGGRQF,
+*> CGGES3, CGGEV3, CGGGLM, CGGHRD, CGGLSE, CGGQRCS, CGGQRF, CGGRQF,
 *> CGGSVD3, CGGSVP3, CHGEQZ, CTGEVC, CTGEXC, CTGSEN, CTGSJA,
 *> CTGSNA, CTGSYL, and CUNCSD.
 *> \endverbatim
@@ -94,7 +94,7 @@ SUBROUTINE CERRGG( PATH, NUNIT )
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGGES, CGGESX, CGGEV, CGGEVX, CGGGLM, CGGHRD,
-     $                   CGGLSE, CGGQRF, CGGRQF, CHGEQZ,
+     $                   CGGLSE, CGGQRCS, CGGQRF, CGGRQF, CHGEQZ,
      $                   CHKXER, CTGEVC, CTGEXC, CTGSEN, CTGSJA, CTGSNA,
      $                   CTGSYL, CUNCSD, CGGES3, CGGEV3, CGGHD3,
      $                   CGGSVD3, CGGSVP3, XLAENV
@@ -637,6 +637,71 @@ SUBROUTINE CERRGG( PATH, NUNIT )
 *
       ELSE IF( LSAMEN( 3, PATH, 'GQR' ) ) THEN
 *
+*        CGGQRCS
+*
+         SRNAMT = 'CGGQRCS'
+         INFOT = 1
+         CALL CGGQRCS( '/', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'CGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL CGGQRCS( 'N', '/', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'CGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL CGGQRCS( 'N', 'N', '/', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'CGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL CGGQRCS( 'N', 'N', 'N', -1, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'CGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL CGGQRCS( 'N', 'N', 'N', 0, -1, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'CGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 6
+         CALL CGGQRCS( 'N', 'N', 'N', 0, 0, -1, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'CGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL CGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 0, B, 1, R1, R2, U, 1, V, 1,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'CGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 12
+         CALL CGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 0, R1, R2, U, 1, V, 1,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'CGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 16
+         CALL CGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 0, V, 1,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'CGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 18
+         CALL CGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 0,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'CGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 20
+         CALL CGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1,
+     $                W, 0, RW, LW, IW, INFO )
+         CALL CHKXER( 'CGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 22
+         CALL CGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1,
+     $                W, LW, RW, 0, IW, INFO )
+         CALL CHKXER( 'CGGQRCS', INFOT, NOUT, LERR, OK )
+         NT = NT + 12
+*
 *        CGGQRF
 *
          SRNAMT = 'CGGQRF'
diff --git a/TESTING/EIG/derrgg.f b/TESTING/EIG/derrgg.f
index 6fc61b8727..0ef495f00e 100644
--- a/TESTING/EIG/derrgg.f
+++ b/TESTING/EIG/derrgg.f
@@ -22,7 +22,7 @@
 *> \verbatim
 *>
 *> DERRGG tests the error exits for DGGES, DGGESX, DGGEV,  DGGEVX,
-*> DGGGLM, DGGHRD, DGGLSE, DGGQRF, DGGRQF, DGGSVD3,
+*> DGGGLM, DGGHRD, DGGLSE, DGGQRCS, DGGQRF, DGGRQF, DGGSVD3,
 *> DGGSVP3, DHGEQZ, DORCSD, DTGEVC, DTGEXC, DTGSEN, DTGSJA, DTGSNA,
 *> DGGES3, DGGEV3, and DTGSYL.
 *> \endverbatim
@@ -93,7 +93,7 @@ SUBROUTINE DERRGG( PATH, NUNIT )
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHKXER, DGGES, DGGESX, DGGEV, DGGEVX, DGGGLM,
-     $                   DGGHRD, DGGLSE, DGGQRF, DGGRQF,
+     $                   DGGHRD, DGGLSE, DGGQRCS, DGGQRF, DGGRQF,
      $                   DHGEQZ, DORCSD, DTGEVC, DTGEXC, DTGSEN, DTGSJA,
      $                   DTGSNA, DTGSYL, DGGHD3, DGGES3, DGGEV3,
      $                   DGGSVD3, DGGSVP3, XLAENV
@@ -609,6 +609,55 @@ SUBROUTINE DERRGG( PATH, NUNIT )
 *
       ELSE IF( LSAMEN( 3, PATH, 'GQR' ) ) THEN
 *
+*        DGGQRCS
+*
+         SRNAMT = 'DGGQRCS'
+         INFOT = 1
+         CALL DGGQRCS( '/', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1, W, LW, IW, INFO )
+         CALL CHKXER( 'DGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL DGGQRCS( 'N', '/', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1, W, LW, IW, INFO )
+         CALL CHKXER( 'DGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL DGGQRCS( 'N', 'N', '/', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1, W, LW, IW, INFO )
+         CALL CHKXER( 'DGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL DGGQRCS( 'N', 'N', 'N', -1, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1, W, LW, IW, INFO )
+         CALL CHKXER( 'DGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL DGGQRCS( 'N', 'N', 'N', 0, -1, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1, W, LW, IW, INFO )
+         CALL CHKXER( 'DGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 6
+         CALL DGGQRCS( 'N', 'N', 'N', 0, 0, -1, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1, W, LW, IW, INFO )
+         CALL CHKXER( 'DGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL DGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 0, B, 1, R1, R2, U, 1, V, 1, W, LW, IW, INFO )
+         CALL CHKXER( 'DGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 12
+         CALL DGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 0, R1, R2, U, 1, V, 1, W, LW, IW, INFO )
+         CALL CHKXER( 'DGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 16
+         CALL DGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 0, V, 1, W, LW, IW, INFO )
+         CALL CHKXER( 'DGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 18
+         CALL DGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 0, W, LW, IW, INFO )
+         CALL CHKXER( 'DGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 20
+         CALL DGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1, W, 0, IW, INFO )
+         CALL CHKXER( 'DGGQRCS', INFOT, NOUT, LERR, OK )
+         NT = NT + 11
+*
 *        DGGQRF
 *
          SRNAMT = 'DGGQRF'
diff --git a/TESTING/EIG/serrgg.f b/TESTING/EIG/serrgg.f
index 7824f8618d..9b3b2f2d42 100644
--- a/TESTING/EIG/serrgg.f
+++ b/TESTING/EIG/serrgg.f
@@ -22,7 +22,7 @@
 *> \verbatim
 *>
 *> SERRGG tests the error exits for SGGES, SGGESX, SGGEV, SGGEVX,
-*> SGGES3, SGGEV3, SGGGLM, SGGHRD, SGGLSE, SGGQRF, SGGRQF,
+*> SGGES3, SGGEV3, SGGGLM, SGGHRD, SGGLSE, SGGQRCS, SGGQRF, SGGRQF,
 *> SGGSVD3, SGGSVP3, SHGEQZ, SORCSD, STGEVC, STGEXC, STGSEN,
 *> STGSJA, STGSNA, and STGSYL.
 *> \endverbatim
@@ -73,6 +73,7 @@ SUBROUTINE SERRGG( PATH, NUNIT )
       PARAMETER          ( ONE = 1.0E+0, ZERO = 0.0E+0 )
 *     ..
 *     .. Local Scalars ..
+      LOGICAL            SWAPPED
       CHARACTER*2        C2
       INTEGER            DUMMYK, DUMMYL, I, IFST, ILO, IHI, ILST, INFO,
      $                   J, M, NCYCLE, NT, SDIM, LWORK
@@ -93,7 +94,7 @@ SUBROUTINE SERRGG( PATH, NUNIT )
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHKXER, SGGES, SGGESX, SGGEV, SGGEVX, SGGGLM,
-     $                   SGGHRD, SGGLSE, SGGQRF, SGGRQF,
+     $                   SGGHRD, SGGLSE, SGGQRCS, SGGQRF, SGGRQF,
      $                   SHGEQZ, SORCSD, STGEVC, STGEXC, STGSEN, STGSJA,
      $                   STGSNA, STGSYL, SGGES3, SGGEV3, SGGHD3,
      $                   SGGSVD3, SGGSVP3, XLAENV
@@ -655,6 +656,55 @@ SUBROUTINE SERRGG( PATH, NUNIT )
          CALL CHKXER( 'SGGRQF', INFOT, NOUT, LERR, OK )
          NT = NT + 6
 *
+*        SGGQRCS
+*
+         SRNAMT = 'SGGQRCS'
+         INFOT = 1
+         CALL SGGQRCS( '/', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1, W, LW, IW, INFO )
+         CALL CHKXER( 'SGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL SGGQRCS( 'N', '/', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1, W, LW, IW, INFO )
+         CALL CHKXER( 'SGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL SGGQRCS( 'N', 'N', '/', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1, W, LW, IW, INFO )
+         CALL CHKXER( 'SGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL SGGQRCS( 'N', 'N', 'N', -1, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1, W, LW, IW, INFO )
+         CALL CHKXER( 'SGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL SGGQRCS( 'N', 'N', 'N', 0, -1, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1, W, LW, IW, INFO )
+         CALL CHKXER( 'SGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 6
+         CALL SGGQRCS( 'N', 'N', 'N', 0, 0, -1, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1, W, LW, IW, INFO )
+         CALL CHKXER( 'SGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL SGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 0, B, 1, R1, R2, U, 1, V, 1, W, LW, IW, INFO )
+         CALL CHKXER( 'SGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 12
+         CALL SGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 0, R1, R2, U, 1, V, 1, W, LW, IW, INFO )
+         CALL CHKXER( 'SGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 16
+         CALL SGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 0, V, 1, W, LW, IW, INFO )
+         CALL CHKXER( 'SGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 18
+         CALL SGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 0, W, LW, IW, INFO )
+         CALL CHKXER( 'SGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 20
+         CALL SGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1, W, 0, IW, INFO )
+         CALL CHKXER( 'SGGQRCS', INFOT, NOUT, LERR, OK )
+         NT = NT + 11
+*
 *     Test error exits for the SGS, SGV, SGX, and SXV paths.
 *
       ELSE IF( LSAMEN( 3, PATH, 'SGS' ) .OR.
diff --git a/TESTING/EIG/zerrgg.f b/TESTING/EIG/zerrgg.f
index 26e8df9834..d8f84f64da 100644
--- a/TESTING/EIG/zerrgg.f
+++ b/TESTING/EIG/zerrgg.f
@@ -22,7 +22,7 @@
 *> \verbatim
 *>
 *> ZERRGG tests the error exits for ZGGES, ZGGESX, ZGGEV, ZGGEVX,
-*> ZGGES3, ZGGEV3, ZGGGLM, ZGGHRD, ZGGLSE, ZGGQRF, ZGGRQF,
+*> ZGGES3, ZGGEV3, ZGGGLM, ZGGHRD, ZGGLSE, ZGGQRCS, ZGGQRF, ZGGRQF,
 *> ZGGSVD3, ZGGSVP3, ZHGEQZ, ZTGEVC, ZTGEXC, ZTGSEN, ZTGSJA,
 *> ZTGSNA, ZTGSYL, and ZUNCSD.
 *> \endverbatim
@@ -94,7 +94,7 @@ SUBROUTINE ZERRGG( PATH, NUNIT )
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHKXER, ZGGES,  ZGGESX, ZGGEV,  ZGGEVX, ZGGGLM,
-     $                   ZGGHRD, ZGGLSE, ZGGQRF, ZGGRQF,
+     $                   ZGGHRD, ZGGLSE, ZGGQRCS, ZGGQRF, ZGGRQF,
      $                   ZHGEQZ, ZTGEVC, ZTGEXC, ZTGSEN, ZTGSJA, ZTGSNA,
      $                   ZTGSYL, ZUNCSD, ZGGES3, ZGGEV3, ZGGHD3,
      $                   ZGGSVD3, ZGGSVP3, XLAENV
@@ -637,6 +637,71 @@ SUBROUTINE ZERRGG( PATH, NUNIT )
 *
       ELSE IF( LSAMEN( 3, PATH, 'GQR' ) ) THEN
 *
+*        ZGGQRCS
+*
+         SRNAMT = 'ZGGQRCS'
+         INFOT = 1
+         CALL ZGGQRCS( '/', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'ZGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL ZGGQRCS( 'N', '/', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'ZGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL ZGGQRCS( 'N', 'N', '/', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'ZGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL ZGGQRCS( 'N', 'N', 'N', -1, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'ZGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL ZGGQRCS( 'N', 'N', 'N', 0, -1, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'ZGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 6
+         CALL ZGGQRCS( 'N', 'N', 'N', 0, 0, -1, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'ZGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL ZGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 0, B, 1, R1, R2, U, 1, V, 1,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'ZGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 12
+         CALL ZGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 0, R1, R2, U, 1, V, 1,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'ZGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 16
+         CALL ZGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 0, V, 1,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'ZGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 18
+         CALL ZGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 0,
+     $                W, LW, RW, LW, IW, INFO )
+         CALL CHKXER( 'ZGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 20
+         CALL ZGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1,
+     $                W, 0, RW, LW, IW, INFO )
+         CALL CHKXER( 'ZGGQRCS', INFOT, NOUT, LERR, OK )
+         INFOT = 22
+         CALL ZGGQRCS( 'N', 'N', 'N', 0, 0, 0, I, SWAPPED,
+     $                A, 1, B, 1, R1, R2, U, 1, V, 1,
+     $                W, LW, RW, 0, IW, INFO )
+         CALL CHKXER( 'ZGGQRCS', INFOT, NOUT, LERR, OK )
+         NT = NT + 12
+*
 *        ZGGQRF
 *
          SRNAMT = 'ZGGQRF'

From 6cb9a5ffa5254c6bb4c524571f50f17d8a3de6e0 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Tue, 20 Apr 2021 18:49:47 +0000
Subject: [PATCH 099/101] xGGQRCS: allow matrix input with dimension zero

---
 SRC/cggqrcs.f | 6 +++---
 SRC/dggqrcs.f | 6 +++---
 SRC/sggqrcs.f | 6 +++---
 SRC/zggqrcs.f | 6 +++---
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index 6cd851e56b..bc0708d4d1 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -402,11 +402,11 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          INFO = -2
       ELSE IF( .NOT.( WANTX .OR. LSAME( JOBX, 'N' ) ) ) THEN
          INFO = -3
-      ELSE IF( M.LT.1 ) THEN
+      ELSE IF( M.LT.0 ) THEN
          INFO = -4
-      ELSE IF( N.LT.1 ) THEN
+      ELSE IF( N.LT.0 ) THEN
          INFO = -5
-      ELSE IF( P.LT.1 ) THEN
+      ELSE IF( P.LT.0 ) THEN
          INFO = -6
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -10
diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index fb15907e91..0ee5765c49 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -379,11 +379,11 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          INFO = -2
       ELSE IF( .NOT.( WANTX .OR. LSAME( JOBX, 'N' ) ) ) THEN
          INFO = -3
-      ELSE IF( M.LT.1 ) THEN
+      ELSE IF( M.LT.0 ) THEN
          INFO = -4
-      ELSE IF( N.LT.1 ) THEN
+      ELSE IF( N.LT.0 ) THEN
          INFO = -5
-      ELSE IF( P.LT.1 ) THEN
+      ELSE IF( P.LT.0 ) THEN
          INFO = -6
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -10
diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index eff4cd089c..5d09c34765 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -379,11 +379,11 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          INFO = -2
       ELSE IF( .NOT.( WANTX .OR. LSAME( JOBX, 'N' ) ) ) THEN
          INFO = -3
-      ELSE IF( M.LT.1 ) THEN
+      ELSE IF( M.LT.0 ) THEN
          INFO = -4
-      ELSE IF( N.LT.1 ) THEN
+      ELSE IF( N.LT.0 ) THEN
          INFO = -5
-      ELSE IF( P.LT.1 ) THEN
+      ELSE IF( P.LT.0 ) THEN
          INFO = -6
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -10
diff --git a/SRC/zggqrcs.f b/SRC/zggqrcs.f
index 00f94c9040..e9862ab978 100644
--- a/SRC/zggqrcs.f
+++ b/SRC/zggqrcs.f
@@ -402,11 +402,11 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          INFO = -2
       ELSE IF( .NOT.( WANTX .OR. LSAME( JOBX, 'N' ) ) ) THEN
          INFO = -3
-      ELSE IF( M.LT.1 ) THEN
+      ELSE IF( M.LT.0 ) THEN
          INFO = -4
-      ELSE IF( N.LT.1 ) THEN
+      ELSE IF( N.LT.0 ) THEN
          INFO = -5
-      ELSE IF( P.LT.1 ) THEN
+      ELSE IF( P.LT.0 ) THEN
          INFO = -6
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -10

From e805767fa3dc77aadae615b78346c1f8d1eebcd0 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Wed, 21 Apr 2021 17:47:28 +0000
Subject: [PATCH 100/101] {c,z}GGQRCS: check if LRWORK is valid

---
 SRC/cggqrcs.f | 2 ++
 SRC/zggqrcs.f | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index bc0708d4d1..23409df1f3 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -418,6 +418,8 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          INFO = -18
       ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
          INFO = -20
+      ELSE IF( LRWORK.LT.1 .AND. .NOT.LQUERY ) THEN
+         INFO = -22
       END IF
 *
 *     Make sure A is the matrix smaller in norm
diff --git a/SRC/zggqrcs.f b/SRC/zggqrcs.f
index e9862ab978..9759ea310e 100644
--- a/SRC/zggqrcs.f
+++ b/SRC/zggqrcs.f
@@ -418,6 +418,8 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
          INFO = -18
       ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
          INFO = -20
+      ELSE IF( LRWORK.LT.1 .AND. .NOT.LQUERY ) THEN
+         INFO = -22
       END IF
 *
 *     Make sure A is the matrix smaller in norm

From 8a0bef17df9c807be7fd3cec34e8df1514372523 Mon Sep 17 00:00:00 2001
From: Christoph Conrads <christoph.conrads@inria.fr>
Date: Thu, 22 Apr 2021 19:15:39 +0000
Subject: [PATCH 101/101] xGGQRCS: ensure leading dimension is at least one

Ensure the leading dimension of the assembled matrix `G = [A; B]` is
always at least one.
---
 SRC/cggqrcs.f | 3 ++-
 SRC/dggqrcs.f | 3 ++-
 SRC/sggqrcs.f | 3 ++-
 SRC/zggqrcs.f | 3 ++-
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/SRC/cggqrcs.f b/SRC/cggqrcs.f
index 23409df1f3..f51ead2947 100644
--- a/SRC/cggqrcs.f
+++ b/SRC/cggqrcs.f
@@ -452,7 +452,8 @@ RECURSIVE SUBROUTINE CGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
       SWAPPED = .FALSE.
       L = 0
-      LDG = M + P
+*     The leading dimension must never be zero
+      LDG = MAX( M + P, 1 )
       LDVT = N
       LMAX = MIN( M + P, N )
       IG = 1
diff --git a/SRC/dggqrcs.f b/SRC/dggqrcs.f
index 0ee5765c49..fc56c2e6c1 100644
--- a/SRC/dggqrcs.f
+++ b/SRC/dggqrcs.f
@@ -426,7 +426,8 @@ RECURSIVE SUBROUTINE DGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
       SWAPPED = .FALSE.
       L = 0
-      LDG = M + P
+*     The leading dimension must never be zero
+      LDG = MAX( M + P, 1 )
       LDVT = N
       LMAX = MIN( M + P, N )
       IG = 1
diff --git a/SRC/sggqrcs.f b/SRC/sggqrcs.f
index 5d09c34765..2303447f85 100644
--- a/SRC/sggqrcs.f
+++ b/SRC/sggqrcs.f
@@ -426,7 +426,8 @@ RECURSIVE SUBROUTINE SGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
       SWAPPED = .FALSE.
       L = 0
-      LDG = M + P
+*     The leading dimension must never be zero
+      LDG = MAX( M + P, 1 )
       LDVT = N
       LMAX = MIN( M + P, N )
       IG = 1
diff --git a/SRC/zggqrcs.f b/SRC/zggqrcs.f
index 9759ea310e..de3295e54d 100644
--- a/SRC/zggqrcs.f
+++ b/SRC/zggqrcs.f
@@ -451,7 +451,8 @@ RECURSIVE SUBROUTINE ZGGQRCS( JOBU1, JOBU2, JOBX, M, N, P, L,
 *
       SWAPPED = .FALSE.
       L = 0
-      LDG = M + P
+*     The leading dimension must never be zero
+      LDG = MAX( M + P, 1 )
       LDVT = N
       LMAX = MIN( M + P, N )
       IG = 1