!-------------------------------------- LICENCE BEGIN ------------------------------------
!Environment Canada - Atmospheric Science and Technology License/Disclaimer, 
!                     version 3; Last Modified: May 7, 2008.
!This is free but copyrighted software; you can use/redistribute/modify it under the terms 
!of the Environment Canada - Atmospheric Science and Technology License/Disclaimer 
!version 3 or (at your option) any later version that should be found at: 
!http://collaboration.cmc.ec.gc.ca/science/rpn.comm/license.html 
!
!This software is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 
!without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
!See the above mentioned License/Disclaimer for more details.
!You should have received a copy of the License/Disclaimer along with this software; 
!if not, you can write to: EC-RPN COMM Group, 2121 TransCanada, suite 500, Dorval (Quebec), 
!CANADA, H9P 1J3; or send e-mail to service.rpn@ec.gc.ca
!-------------------------------------- LICENCE END --------------------------------------
*** fgmres - flexible GMRES routine to allow a variable preconditioner
*

      subroutine fgmres (n,im,rhs,sol,i,vv,w,wk1, wk2,eps,maxits, 1
     *                                                iout,icode) 
      implicit none 

      integer n, im, i, maxits, iout, icode 
      real*8 rhs(*), sol(*), vv(n,im+1),w(n,im), wk1(n), wk2(n), eps
*
*author Y. Saad (modified by A. Malevsky Feb1, 1995)
*
*revision
* v3_30 - Abdessamad Qaddouri ; add RPN_comm calls
c-----------------------------------------------------------------------
c flexible GMRES routine. This is a version of GMRES which allows a 
c a variable preconditioner. Implemented with a reverse communication 
c protocole for flexibility -
c DISTRIBUTED VERSION (USES DISTDOT FOR DDOT) 
c explicit (exact) residual norms for restarts  
c written by Y. Saad, modified by A. Malevsky, version February 1, 1995
c  revision :
c            Abdessamad Qaddouri ; add RPN_comm calls
c-----------------------------------------------------------------------
c This Is A Reverse Communication Implementation. 
c------------------------------------------------- 
c USAGE: (see also comments for icode below). CGMRES
c should be put in a loop and the loop should be active for as
c long as icode is not equal to 0. On return fgmres will
c    1) either be requesting the new preconditioned vector applied
c       to wk1 in case icode.eq.1 (result should be put in wk2) 
c    2) or be requesting the product of A applied to the vector wk1
c       in case icode.eq.2 (result should be put in wk2) 
c    3) or be terminated in case icode .eq. 0. 
c on entry always set icode = 0. So icode should be set back to zero
c upon convergence.
c-----------------------------------------------------------------------
c Here is a typical way of running fgmres: 
c
c      icode = 0
c 1    continue
c      call fgmres (n,im,rhs,sol,i,vv,w,wk1, wk2,eps,maxits,iout,icode)
c
c      if (icode .eq. 1) then
c         call  precon(n, wk1, wk2)    <--- user's variable preconditioning
c         goto 1
c      else if (icode .ge. 2) then
c         call  matvec (n,wk1, wk2)    <--- user's matrix vector product. 
c         goto 1
c      else 
c         ----- done ---- 
c         .........
c-----------------------------------------------------------------------
c list of parameters 
c------------------- 
c
c n     == integer. the dimension of the problem
c im    == size of Krylov subspace:  should not exceed 100 in this
c          version (can be reset in code. looking at comment below)
c rhs   == vector of length n containing the right hand side
c sol   == initial guess on input, approximate solution on output
c vv    == work space of size n x (im+1)
c w     == work space of length n x im 
c wk1,
c wk2,  == two work vectors of length n each used for the reverse
c          communication protocole. When on return (icode .ne. 1)
c          the user should call fgmres again with wk2 = precon * wk1
c          and icode untouched. When icode.eq.1 then it means that
c          convergence has taken place.
c          
c eps   == tolerance for stopping criterion. process is stopped
c          as soon as ( ||.|| is the euclidean norm):
c          || current residual||/||initial residual|| <= eps
c
c maxits== maximum number of iterations allowed
c
c 
c icode = integer. indicator for the reverse communication protocole.
c         ON ENTRY : icode should be set to icode = 0.
c         ON RETURN: 
c       * icode .eq. 1 value means that fgmres has not finished
c         and that it is requesting a preconditioned vector before
c         continuing. The user must compute M**(-1) wk1, where M is
c         the preconditioing  matrix (may vary at each call) and wk1 is
c         the vector as provided by fgmres upun return, and put the 
c         result in wk2. Then fgmres must be called again without
c         changing any other argument. 
c       * icode .eq. 2 value means that fgmres has not finished
c         and that it is requesting a matrix vector product before
c         continuing. The user must compute  A * wk1, where A is the
c         coefficient  matrix and wk1 is the vector provided by 
c         upon return. The result of the operation is to be put in
c         the vector wk2. Then fgmres must be called again without
c         changing any other argument. 
c       * icode .eq. 0 means that fgmres has finished and sol contains 
c         the approximate solution.
c         comment: typically fgmres must be implemented in a loop
c         with fgmres being called as long icode is returned with 
c         a value .ne. 0. 
c-----------------------------------------------------------------------
c     local variables --
      real*8 hh(101,100),hhloc(101,100),c(100),s(100),conv,
     *     rs(101),t,tloc,ro,MSG_ddot,dsqrt,epsmac, ddot, eps1, gam ,r0
      integer n1, its, j, i1, k, k1, ii, jj, ierr,kk 
c      integer mproc, myproc ! use when printing out ..
c-------------------------------------------------------------
c     arnoldi size should not exceed 100 in this version..
c     to reset modify sizes of hh, c, s, rs       
c-------------------------------------------------------------
c
      save
c ## 
c     used for printing out only -- ignore 
c      call MPI_Comm_rank(MPI_COMM_WORLD,mproc,ierr)
c      myproc = mproc+1
c
      data epsmac/1.d-16/
c     
c     computed goto 
c     
      goto (100,200,300,11) icode +1
 100  continue
      n1 = n + 1
      its = 0
c-------------------------------------------------------------
c     **  outer loop starts here..
c--------------compute initial residual vector --------------
 10   continue
      call dcopy (n, sol, 1, wk1, 1) 
      icode = 3
      return
 11   continue
      do 21 j=1,n
         vv(j,1) = rhs(j) - wk2(j) 
 21   continue
 20   continue
       tloc=ddot(n, vv, 1, vv,1)
c      call MPI_allreduce(tloc,ro,1,MPI_double_precision,
c     *     MPI_sum,MPI_COMM_WORLD,ierr)
       call RPN_COMM_allreduce(tloc,ro,1,"MPI_double_precision",
     *     "MPI_sum","grid",ierr)
       ro = dsqrt(ro) 
c##
c      if (mproc .eq. 0) write (19,*) ro
      if (ro .eq. 0.0d0) goto 999 
      t = 1.0d0/ ro 
      call dscal(n,t,vv(1,1),1)
      if (its .eq. 0) eps1=eps*ro
      if (its .eq. 0) r0 = ro

      conv = ro/r0
      if (iout.gt.0) write(iout, 199) its, conv
c     
c     initialize 1-st term  of rhs of hessenberg system..
c     
      rs(1) = ro
      i = 0
 4    i=i+1
      its = its + 1
      i1 = i + 1
      call dcopy(n,vv(1,i),1,wk1,1)
c     
c     return
c     
      icode = 1
      return
 200  continue
      call dcopy(n, wk2, 1, w(1,i), 1)
c     
c     call matvec operation
c     
      icode = 2
      call dcopy(n, wk2, 1, wk1, 1)
c
c     return
c     
      return
 300  continue
c     
c     first call to ope corresponds to intialization goto back to 11.
c     
c      if (icode .eq. 3) goto 11
      call  dcopy (n, wk2, 1, vv(1,i1), 1) 
c     
c     classical gram - schmidt...
c     
      do 55 j=1, i
         hhloc(j,i) = ddot(n, vv(1,j), 1, vv(1,i1), 1)
 55   continue
c      call MPI_allreduce(hhloc(1,i),hh(1,i),i,MPI_double_precision,
c    *     MPI_sum,MPI_COMM_WORLD,ierr)
       call RPN_COMM_allreduce(hhloc(1,i),hh(1,i),i,"MPI_double_precision",
     *     "MPI_sum","grid",ierr)
            
      do 56 j=1, i
         call daxpy(n, -hh(j,i), vv(1,j), 1, vv(1,i1), 1)
 56   continue
      tloc = ddot(n, vv(1,i1), 1, vv(1,i1), 1)
c      
c      call MPI_allreduce(tloc,t,1,MPI_double_precision,
c     *     MPI_sum,MPI_COMM_WORLD,ierr)
       call RPN_COMM_allreduce(tloc,t,1,"MPI_double_precision",
     *     "MPI_sum","grid",ierr)
            
      t = sqrt(t)
      hh(i1,i) = t
      if (t .eq. 0.0d0) goto 58
      t = 1.0d0 / t
      call dscal(n,t,vv(1,i1),1)
c     
c     done with classical gram schimd and arnoldi step. 
c     now  update factorization of hh
c     
 58   if (i .eq. 1) goto 121
c     
c     perfrom previous transformations  on i-th column of h
c     
      do 66 k=2,i
         k1 = k-1
         t = hh(k1,i)
         hh(k1,i) = c(k1)*t + s(k1)*hh(k,i)
         hh(k,i) = -s(k1)*t + c(k1)*hh(k,i)
 66   continue
 121  gam = sqrt(hh(i,i)**2 + hh(i1,i)**2)
      if (gam .eq. 0.0d0) gam = epsmac
c-----------#determinenextplane rotation  #-------------------
      c(i) = hh(i,i)/gam
      s(i) = hh(i1,i)/gam
      rs(i1) = -s(i)*rs(i)
      rs(i) =  c(i)*rs(i)
c     
c     determine res. norm. and test for convergence-
c     
      hh(i,i) = c(i)*hh(i,i) + s(i)*hh(i1,i)
      ro = abs(rs(i1))
c##
c      if (mproc .eq. 0) write (19,*) ro
c
      conv = ro/r0
      if (iout .gt. 0 ) write(iout, 199) its, conv

      if ((i .lt. im) .and. (ro .gt. eps1)) then 
       
      goto 4
      endif
c     
c     now compute solution. first solve upper triangular system.
c     
      rs(i) = rs(i)/hh(i,i)
      do 30 ii=2,i
         k=i-ii+1
         k1 = k+1
         t=rs(k)
         do 40 j=k1,i
            t = t-hh(k,j)*rs(j)
 40      continue
         rs(k) = t/hh(k,k)
 30   continue
c     
c     done with back substitution..
c     now form linear combination to get solution
c     
      do 16 j=1, i
         t = rs(j)
         call daxpy(n, t, w(1,j), 1, sol,1)
 16   continue
c     
c     test for return 
      if (ro .le. eps1 .or. its .ge. maxits) goto 999
c     
c     else compute residual vector and continue..
c     
c      goto 10
      do 24 j=1,i
         jj = i1-j+1
         rs(jj-1) = -s(jj-1)*rs(jj)
         rs(jj) = c(jj-1)*rs(jj)
 24   continue
      do 25  j=1,i1
         t = rs(j)
         if (j .eq. 1)  t = t-1.0d0
         call daxpy (n, t, vv(1,j), 1,  vv, 1)
 25   continue
c     
c     restart outer loop.
c     
c       print*,'stop goto 20'
       goto 20
 999  icode = 0
c
 199  format('   -- fmgres its =', i4, ' res. norm =', d20.6)
*
      return 
c-----end-of-fgmres----------------------------------------------------- 
c-----------------------------------------------------------------------
      end