/* ---------------------------------------------------------------------
*
*  -- ScaLAPACK routine (version 1.0) --
*     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
*     and University of California, Berkeley.
*     November 17, 1996
*
*  ---------------------------------------------------------------------
*/
/*
*  Include files
*/
#include "pblas.h"

void pdtrsm_( side, uplo, transa, diag, m, n, alpha, A, ia, ja, desc_A,
              B, ib, jb, desc_B )
/*
*  .. Scalar Arguments ..
*/
   F_CHAR      diag, side, transa, uplo;
   int         * ia, * ib, * ja, * jb, * m, * n;
   double      * alpha;
/* ..
*  .. Array Arguments ..
*/
   int         desc_A[], desc_B[];
   double      A[], B[];
{
/*
*  Purpose
*  =======
*
*  PDTRSM  solves one of the distributed matrix equations
*
*                 op( sub( A ) )*X = alpha*sub( B ),   or
*
*                 X*op( sub( A ) ) = alpha*sub( B ),
*
*  where sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1)  if SIDE = 'L',
*        sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1)  if SIDE = 'R',
*
*        sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1),
*
*  alpha is a scalar, X and sub( B ) are an M-by-N distributed matrix,
*  sub( A ) is a unit, or non-unit, upper or lower triangular distribu-
*  ted matrix and op( A ) is one of
*
*     op( A ) = A   or   op( A ) = A'.
*
*  The distributed matrix X is overwritten on sub( B ).
*
*  Notes
*  =====
*
*  Each global data object is described by an associated description
*  vector.  This vector stores the information required to establish
*  the mapping between an object element and its corresponding process
*  and memory location.
*
*  Let A be a generic term for any 2D block cyclicly distributed array.
*  Such a global array has an associated description vector descA.
*  In the following comments, the character _ should be read as
*  "of the global array".
*
*  NOTATION        STORED IN      EXPLANATION
*  --------------- -------------- --------------------------------------
*  DT_A   (global) descA[ DT_ ]   The descriptor type.  In this case,
*                                 DT_A = 1.
*  CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating
*                                 the BLACS process grid A is distribu-
*                                 ted over. The context itself is glo-
*                                 bal, but the handle (the integer
*                                 value) may vary.
*  M_A    (global) descA[ M_ ]    The number of rows in the global
*                                 array A.
*  N_A    (global) descA[ N_ ]    The number of columns in the global
*                                 array A.
*  MB_A   (global) descA[ MB_ ]   The blocking factor used to distribu-
*                                 te the rows of the array.
*  NB_A   (global) descA[ NB_ ]   The blocking factor used to distribu-
*                                 te the columns of the array.
*  RSRC_A (global) descA[ RSRC_ ] The process row over which the first
*                                 row of the array A is distributed.
*  CSRC_A (global) descA[ CSRC_ ] The process column over which the
*                                 first column of the array A is
*                                 distributed.
*  LLD_A  (local)  descA[ LLD_ ]  The leading dimension of the local
*                                 array.  LLD_A >= MAX(1,LOCr(M_A)).
*
*  Let K be the number of rows or columns of a distributed matrix,
*  and assume that its process grid has dimension p x q.
*  LOCr( K ) denotes the number of elements of K that a process
*  would receive if K were distributed over the p processes of its
*  process column.
*  Similarly, LOCc( K ) denotes the number of elements of K that a
*  process would receive if K were distributed over the q processes of
*  its process row.
*  The values of LOCr() and LOCc() may be determined via a call to the
*  ScaLAPACK tool function, NUMROC:
*          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
*          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
*  An upper bound for these quantities may be computed by:
*          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
*          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
*
*  The triangular distributed matrix sub( A ) must be distributed
*  according to a square block cyclic decomposition, i.e MB_A = NB_A, if
*  NA+MOD(IA-1,MB_A) > MB_A or NA+MOD(JA-1,NB_A) > NB_A.
*  If SIDE = 'Left', the distributed matrix sub( A ) is of order NA = M,
*  and NA = N if SIDE = 'Right'. If NA+MOD(IA-1,MB_A) > MB_A or
*  NA+MOD(JA-1,NB_A) > NB_A, then sub( A ) is not just contained into a
*  block, in which case it is required that IA-1 (resp. JA-1) is a
*  multiple of MB_A (resp. NB_A).
*
*  If SIDE = 'Left', the row process having the first entries of
*  sub( B ) must also own the first entries of sub( A ).
*  If sub( A ) is not just contained into a block, IB-1 (resp. IA-1,
*  JA-1) must be a multiple of MB_B (resp. MB_A, NB_A = MB_A), and
*  the column block size of A should be equal to the row block size of
*  B, i.e NB_A = MB_B.
*
*  If SIDE = 'Right', the column process having the first entries of
*  sub( B ) must also own the first entries of sub( A ).
*  If sub( A ) is not just contained into a block, JB-1 (resp. IA-1,
*  JA-1) must be a multiple of NB_B (resp. MB_A, NB_A = MB_A), and
*  the row block size of A should be equal to the column block size of
*  B, i.e MB_A = NB_B.
*
*  Parameters
*  ==========
*
*  SIDE    (global input) pointer to CHARACTER
*          On entry, SIDE specifies whether op( A ) appears on the left
*          or right of X as follows:
*
*          SIDE = 'L' or 'l'   op( sub( A ) )*X = alpha*sub( B ),
*
*          SIDE = 'R' or 'r'   X*op( sub( A ) ) = alpha*sub( B ).
*
*  UPLO    (global input) pointer to CHARACTER
*          On entry, UPLO specifies whether the distributed matrix
*          sub( A ) is an upper or lower triangular distributed matrix
*          as follows:
*
*             UPLO = 'U' or 'u'   A is an upper triangular matrix,
*
*             UPLO = 'L' or 'l'   A is a lower triangular matrix.
*
*  TRANSA  (global input) pointer to CHARACTER
*          On entry, TRANSA specifies the form of op( A ) to be
*          used in the matrix multiplication as follows:
*
*          TRANSA = 'N' or 'n'   op( A ) = A,
*
*          TRANSA = 'T' or 't'   op( A ) = A',
*
*          TRANSA = 'C' or 'c'   op( A ) = A'.
*
*  DIAG    (global input) pointer to CHARACTER
*          On entry, DIAG specifies whether or not sub( A ) is unit
*          triangular as follows:
*
*          DIAG = 'U' or 'u'  sub( A ) is assumed to be unit
*                             triangular,
*
*          DIAG = 'N' or 'n'  sub( A ) is not assumed to be unit
*                             triangular.
*
*  M       (global input) pointer to INTEGER
*          The number of rows to be operated on i.e the number of rows
*          of the distributed submatrix sub( B ). M >= 0.
*
*  N       (global input) pointer to INTEGER
*          The number of columns to be operated on i.e the number of
*          columns of the distributed submatrix sub( B ). N >= 0.
*
*  ALPHA   (global input) pointer to DOUBLE PRECISION
*          On entry, ALPHA specifies the scalar alpha.
*
*  A       (local input) DOUBLE PRECISION pointer into the local memory
*          to an array of dimension (LLD_A, LOCc(JA+NA-1). Before entry
*          with  UPLO = 'U' or 'u', the  leading NA-by-NA upper trian-
*          gular part of the distributed matrix sub( A ) must contain
*          the local pieces of the upper triangular distributed matrix
*          and its strictly lower triangular part is not referenced.
*          Before entry  with  UPLO = 'L' or 'l', the leading  NA-by-NA
*          lower triangular part of the distributed matrix sub( A ) must
*          contain the lower triangular distributed matrix and its
*          strictly upper triangular part is not referenced.  Note that
*          when  DIAG = 'U' or 'u', the diagonal elements of sub( A )
*          are not referenced either, but are assumed to be  unity.
*
*  IA      (global input) pointer to INTEGER
*          The global row index of the submatrix of the distributed
*          matrix A to operate on.
*
*  JA      (global input) pointer to INTEGER
*          The global column index of the submatrix of the distributed
*          matrix A to operate on.
*
*  DESCA   (global and local input) INTEGER array of dimension 8.
*          The array descriptor of the distributed matrix A.
*
*  B       (local input/local output) DOUBLE PRECISION pointer into the
*          local memory to an array of dimension (LLD_B, LOCc(JB+N-1)).
*          Before entry, this array contains the local pieces of the
*          distributed matrix sub( B ). On exit, sub( B ) is overwritten
*          by the solution distributed matrix.
*
*  IB      (global input) pointer to INTEGER
*          The global row index of the submatrix of the distributed
*          matrix B to operate on.
*
*  JB      (global input) pointer to INTEGER
*          The global column index of the submatrix of the distributed
*          matrix B to operate on.
*
*  DESCB   (global and local input) INTEGER array of dimension 8.
*          The array descriptor of the distributed matrix B.
*
*  =====================================================================
*
*  .. Local Scalars ..
*/
   char        * ctop, DiagA, matblk, * rtop, SideA, TrA, UploA;
   int         block, i, iacol, iarow, ibcol, iblk, ibpos, ibrow,
               icoffa, icoffb, ictxt, iia, iib, in, info, iroffa,
               iroffb, j, jblk, jja, jjb, jn, lcm, lcmp, lcmq, lside,
               mone=-1, mp0, mq0, mycol, myrow, nca, ncb, np0, nprow,
               npcol, nq0, nra, nrb, tmp0, tmp1, tmp2, wksz;
/* ..
*  .. PBLAS Buffer ..
*/
  double      * buff;
/* ..
*  .. External Functions ..
*/
   void        blacs_gridinfo_();
   void        pberror_();
   void        pbchkmat();
   char        * getpbbuf();
   char        * ptop();
   F_VOID_FCT  pbdtrsm_();
   F_INTG_FCT  ilcm_();
/* ..
*  .. Executable Statements ..
*
*  Get grid parameters
*/
   ictxt = desc_A[CTXT_];
   blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol );
/*
*  Test the input parameters
*/
   info = 0;
   if( nprow == -1 )
      info = -(1100+CTXT_+1);
   else
   {
      DiagA = Mupcase( F2C_CHAR( diag )[0] );
      UploA = Mupcase( F2C_CHAR( uplo )[0] );
      SideA = Mupcase( F2C_CHAR( side )[0] );
      lside  = ( SideA == 'L' );
      TrA = Mupcase( F2C_CHAR( transa )[0] );
      iroffa = (*ia-1) % desc_A[MB_];
      icoffa = (*ja-1) % desc_A[NB_];
      iroffb = (*ib-1) % desc_B[MB_];
      icoffb = (*jb-1) % desc_B[NB_];
      if( lside )
      {
         block = ( ( ( (*m) + iroffa ) <= desc_A[MB_] ) &&
                   ( ( (*m) + icoffa ) <= desc_A[NB_] ) );
         pbchkmat( *m, 5, *m, 5, *ia, *ja, desc_A, 11, &iia, &jja,
                   &iarow, &iacol, nprow, npcol, myrow, mycol,
                   &nra, &nca, &info );
      }
      else
      {
         block = ( ( ( (*n) + iroffa ) <= desc_A[MB_] ) &&
                   ( ( (*n) + icoffa ) <= desc_A[NB_] ) );
         pbchkmat( *n, 6, *n, 6, *ia, *ja, desc_A, 11, &iia, &jja,
                   &iarow, &iacol, nprow, npcol, myrow, mycol,
                   &nra, &nca, &info );
      }
      pbchkmat( *m, 5, *n, 6, *ib, *jb, desc_B, 15, &iib, &jjb,
                &ibrow, &ibcol, nprow, npcol, myrow, mycol,
                &nrb, &ncb, &info );
      if( info == 0 )
      {
         if( ( SideA != 'R' ) && ( SideA != 'L' ) )
            info = -1;
         else if( ( UploA != 'U' ) && (UploA != 'L' ) )
            info = -2;
         else if( ( TrA != 'N' ) && ( TrA != 'T' ) && ( TrA != 'C' ) )
            info = -3;
         else if( ( DiagA != 'U' ) && ( DiagA != 'N' ) )
            info = -4;
         else if( !block && desc_A[MB_] != desc_A[NB_] )
            info = -(1100+NB_+1);
         else if( !block && ( iroffa != 0 ) )
            info = -9;
         else if( !block && ( icoffa != 0 ) )
            info = -10;
         if( lside )
         {
            if( ( !block && ( iroffb != 0 ) ) || ( ibrow != iarow ) )
               info = -13;
            else if( block && ( nprow != 1 ) &&
                     ( (*m)+iroffb > desc_B[MB_] ) )
               info = -13;
            else if( !block && ( desc_A[NB_] != desc_B[MB_] ) )
               info = -(1500+MB_+1);
         }
         else
         {
            if( ( !block && ( icoffb != 0 ) ) || ( ibcol != iacol ) )
               info = -14;
            else if( block && ( npcol != 1 ) &&
                     ( (*n)+icoffb > desc_B[NB_] ) )
               info = -14;
            else if( !block && (desc_A[MB_] != desc_B[NB_]) )
               info = -(1500+NB_+1);
         }
         if( ictxt != desc_B[CTXT_] )
            info = -(1500+CTXT_+1);
      }
   }
   if( info )
   {
      pberror_( &ictxt, "PDTRSM", &info );
      return;
   }
/*
*  Quick return if possible
*/
   if( *m == 0 || *n == 0 )
      return;
/*
*  Figure out the arguments to be passed to pbdtrsm
*/
   if( lside )
   {
      ibpos = ibcol;
      if( block )
      {
         matblk = 'B';
         wksz = (*m) * (*m);
      }
      else
      {
         matblk = 'M';
         if( TrA == 'N' )
         {
            tmp0 = npcol-1;
            tmp0 = CEIL( tmp0, nprow );
            tmp0 = MAX( 1, tmp0 );
            tmp1 = (*m) / desc_A[MB_];
            wksz = desc_B[NB_] * ( desc_A[MB_] * tmp0 +
                   MYROC0( tmp1, *m, desc_A[MB_], nprow ) );
         }
         else
         {
            tmp0 = nprow-1;
            tmp0 = CEIL( tmp0, npcol );
            tmp0 = desc_A[MB_] * MAX( 1, tmp0 );
            lcm = ilcm_( &nprow, &npcol );
            lcmq = lcm / npcol;
            tmp1 = (*m) / desc_A[NB_];
            mq0 = MYROC0( tmp1, *m, desc_A[NB_], npcol );
            tmp1 = mq0 / desc_A[NB_];
            tmp1 = MYROC0( tmp1, mq0, desc_A[NB_], lcmq );
            lcmp = lcm / nprow;
            tmp2 = (*m) / desc_A[MB_];
            mp0 = MYROC0( tmp2, *m, desc_A[MB_], nprow );
            tmp2 = mp0 / desc_A[NB_];
            tmp2 = MYROC0( tmp2, mp0, desc_A[MB_], lcmp );
            tmp2 = MAX( tmp1, tmp2 );
            wksz = desc_B[NB_] * ( mq0 + MAX( tmp0, tmp2 ) );
         }
      }
   }
   else
   {
      ibpos = ibrow;
      if( block )
      {
         matblk = 'B';
         wksz = (*n) * (*n);
      }
      else
      {
         matblk = 'M';
         if( TrA == 'N' )
         {
            tmp0 = nprow-1;
            tmp0 = CEIL( tmp0, npcol );
            tmp0 = MAX( 1, tmp0 );
            tmp1 = (*n) / desc_A[MB_];
            wksz = desc_B[MB_] * ( desc_A[MB_]*tmp0 +
                   MYROC0( tmp1, *n, desc_A[MB_], npcol ) );
         }
         else
         {
            tmp0 = npcol-1;
            tmp0 = CEIL( tmp0, nprow );
            tmp0 = desc_A[MB_] * MAX( 1, tmp0 );
            lcm = ilcm_( &nprow, &npcol );
            lcmq = lcm / npcol;
            tmp1 = (*n) / desc_A[NB_];
            nq0 = MYROC0( tmp1, *n, desc_A[NB_], npcol );
            tmp1 = nq0 / desc_A[NB_];
            tmp1 = MYROC0( tmp1, nq0, desc_A[NB_], lcmq );
            lcmp = lcm / nprow;
            tmp2 = (*n) / desc_A[MB_];
            np0 = MYROC0( tmp2, *n, desc_A[MB_], nprow );
            tmp2 = np0 / desc_A[NB_];
            tmp2 = MYROC0( tmp2, np0, desc_A[MB_], lcmp );
            tmp2 = MAX( tmp1, tmp2 );
            wksz = desc_B[MB_] * ( np0 + MAX( tmp0, tmp2 ) );
         }
      }
   }
   buff = (double *)getpbbuf( "PDTRSM", wksz*sizeof(double) );
/*
*  Call PB-BLAS routine
*/
   if( block )
   {
      if( lside )
      {
         rtop = ptop( BROADCAST, ROW, TOPGET );
         j = CEIL( (*jb), desc_B[NB_] ) * desc_B[NB_];
         jn = (*jb)+(*n)-1;
         jn = MIN( j, jn );
                                     /* Handle first block separately */
         jblk = jn-(*jb)+1;
         pbdtrsm_( &ictxt, C2F_CHAR( &matblk ), side, uplo, transa,
                   diag, m, &jblk, &desc_B[NB_], alpha,
                   &A[iia-1+(jja-1)*desc_A[LLD_]], &desc_A[LLD_],
                   &B[iib-1+(jjb-1)*desc_B[LLD_]], &desc_B[LLD_],
                   &iarow, &iacol, &ibpos, C2F_CHAR( rtop ),
                   C2F_CHAR( NO ), buff );
         if( mycol == ibpos )
         {
            jjb += jblk;
            jjb = MIN( jjb, ncb );
         }
         ibpos = (ibpos+1) % npcol;
         jblk = (*n) - jblk;
         pbdtrsm_( &ictxt, C2F_CHAR( &matblk ), side, uplo, transa,
                   diag, m, &jblk, &desc_B[NB_], alpha, buff, m,
                   &B[iib-1+(jjb-1)*desc_B[LLD_]], &desc_B[LLD_], &iarow,
                   &mone, &ibpos, C2F_CHAR( rtop ), C2F_CHAR( YES ), buff );
      }
      else
      {
         ctop = ptop( BROADCAST, COLUMN, TOPGET );
         i = CEIL( (*ib), desc_B[MB_] ) * desc_B[MB_];
         in = (*ib)+(*m)-1;
         in = MIN( i, in );
                                     /* Handle first block separately */
         iblk = in-(*ib)+1;
         pbdtrsm_( &ictxt, C2F_CHAR( &matblk ), side, uplo, transa,
                   diag, &iblk, n, &desc_B[MB_], alpha,
                   &A[iia-1+(jja-1)*desc_A[LLD_]], &desc_A[LLD_],
                   &B[iib-1+(jjb-1)*desc_B[LLD_]], &desc_B[LLD_],
                   &iarow, &iacol, &ibpos, C2F_CHAR( ctop ),
                   C2F_CHAR( NO ), buff );
         if( myrow == ibpos )
         {
            iib += iblk;
            iib = MIN( iib, nrb );
         }
         ibpos = (ibpos+1) % nprow;
         iblk = (*m) - iblk;
         pbdtrsm_( &ictxt, C2F_CHAR( &matblk ), side, uplo, transa,
                   diag, &iblk, n, &desc_B[MB_], alpha, buff, n,
                   &B[iib-1+(jjb-1)*desc_B[LLD_]], &desc_B[LLD_],
                   &mone, &iacol, &ibpos, C2F_CHAR( ctop ),
                   C2F_CHAR( YES ), buff );
      }
   }
   else
   {
      if( lside )
      {
         j = CEIL( (*jb), desc_B[NB_] ) * desc_B[NB_];
         jn = (*jb)+(*n)-1;
         jn = MIN( j, jn );
                                     /* Handle first block separately */
         jblk = jn-(*jb)+1;
         pbdtrsm_( &ictxt, C2F_CHAR( &matblk ), side, uplo, transa,
                   diag, m, &jblk, &desc_A[MB_], alpha,
                   &A[iia-1+(jja-1)*desc_A[LLD_]], &desc_A[LLD_],
                   &B[iib-1+(jjb-1)*desc_B[LLD_]], &desc_B[LLD_],
                   &iarow, &iacol, &ibpos, C2F_CHAR( TOPDEF ),
                   C2F_CHAR( NO ), buff );
         if( mycol == ibpos )
         {
            jjb += jblk;
            jjb = MIN( jjb, ncb );
         }
         ibpos = (ibpos+1) % npcol;
                              /* loop over remaining block of columns */
         tmp0 = (*jb)+(*n)-1;
         for( j=jn+1; j<=tmp0; j+=desc_B[NB_] )
         {
             jblk = (*n)-j+(*jb);
             jblk = MIN( desc_B[NB_], jblk );
             pbdtrsm_( &ictxt, C2F_CHAR( &matblk ), side, uplo, transa,
                       diag, m, &jblk, &desc_A[MB_], alpha,
                       &A[iia-1+(jja-1)*desc_A[LLD_]], &desc_A[LLD_],
                       &B[iib-1+(jjb-1)*desc_B[LLD_]], &desc_B[LLD_],
                       &iarow, &iacol, &ibpos, C2F_CHAR( TOPDEF ),
                       C2F_CHAR( NO ), buff );
             if( mycol == ibpos )
             {
                jjb += jblk;
                jjb = MIN( jjb, ncb );
             }
             ibpos = (ibpos+1) % npcol;
         }
      }
      else
      {
         i = CEIL( (*ib), desc_B[MB_] ) * desc_B[MB_];
         in = (*ib)+(*m)-1;
         in = MIN( i, in );
                                     /* Handle first block separately */
         iblk = in-(*ib)+1;
         pbdtrsm_( &ictxt, C2F_CHAR( &matblk ), side, uplo, transa,
                   diag, &iblk, n, &desc_A[MB_], alpha,
                   &A[iia-1+(jja-1)*desc_A[LLD_]], &desc_A[LLD_],
                   &B[iib-1+(jjb-1)*desc_B[LLD_]], &desc_B[LLD_],
                   &iarow, &iacol, &ibpos, C2F_CHAR( TOPDEF ),
                   C2F_CHAR( NO ), buff );
         if( myrow == ibpos )
         {
            iib += iblk;
            iib = MIN( iib, nrb );
         }
         ibpos = (ibpos+1) % nprow;
                                 /* loop over remaining block of rows */
         tmp0 =  (*ib)+(*m)-1;
         for( i=in+1; i<=tmp0; i+=desc_B[MB_] )
         {
             iblk = *m-i+(*ib);
             iblk = MIN( desc_B[MB_], iblk );
             pbdtrsm_( &ictxt, C2F_CHAR( &matblk ), side, uplo, transa,
                       diag, &iblk, n, &desc_A[MB_], alpha,
                       &A[iia-1+(jja-1)*desc_A[LLD_]], &desc_A[LLD_],
                       &B[iib-1+(jjb-1)*desc_B[LLD_]], &desc_B[LLD_],
                       &iarow, &iacol, &ibpos, C2F_CHAR( TOPDEF ),
                       C2F_CHAR( NO ), buff );
             if( myrow == ibpos )
             {
                iib += iblk;
                iib = MIN( iib, nrb );
             }
             ibpos = (ibpos+1) % nprow;
         }
      }
   }
}
