!-------------------------------------- LICENCE BEGIN ------------------------------------
!Environment Canada - Atmospheric Science and Technology License/Disclaimer, 
!                     version 3; Last Modified: May 7, 2008.
!This is free but copyrighted software; you can use/redistribute/modify it under the terms 
!of the Environment Canada - Atmospheric Science and Technology License/Disclaimer 
!version 3 or (at your option) any later version that should be found at: 
!http://collaboration.cmc.ec.gc.ca/science/rpn.comm/license.html 
!
!This software is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 
!without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
!See the above mentioned License/Disclaimer for more details.
!You should have received a copy of the License/Disclaimer along with this software; 
!if not, you can write to: EC-RPN COMM Group, 2121 TransCanada, suite 500, Dorval (Quebec), 
!CANADA, H9P 1J3; or send e-mail to service.rpn@ec.gc.ca
!-------------------------------------- LICENCE END --------------------------------------
***s/r hzd_solmxma - parallel direct solution of high-order 
*                    diffusion equation with mxma8  
#include "model_macros_f.h"
*

      subroutine hzd_solmxma (F_sol, F_Rhs_8, F_Xevec_8 , 42
     %               F_a_8, F_c_8 ,F_deltai_8,
     %               minx1, maxx1, minx2, maxx2, nx1, nx2, nx3, F_pwr,
     %               Minx,Maxx,Miny,Maxy,Gnk,Gni,nil,njl,nkl,
     %               F_opsxp0_8, F_opsyp0_8,F_cdiff,F_npex,F_npey)
*
      implicit none
*
      integer minx1, maxx1, minx2, maxx2, nx1, nx2, nx3  ,
     $        Minx , Maxx , Miny , Maxy , Gnk, Gni, F_pwr,
     $        njl  , nkl  , nil  , F_npex, F_npey
      real*8 F_opsxp0_8(*), F_opsyp0_8(*),
     $            F_a_8(1:F_pwr,1:F_pwr,minx2:maxx2,nx3),
     $            F_c_8(1:F_pwr,1:F_pwr,minx2:maxx2,nx3),
     $       F_deltai_8(1:F_pwr,1:F_pwr,minx2:maxx2,nx3),
     $       F_Rhs_8(Minx:Maxx,Miny:Maxy,Gnk),F_Xevec_8(*)
      real F_cdiff, F_sol(Minx:Maxx,Miny:Maxy,Gnk)
*
*author
*     Abdessamad Qaddouri
*
*revision
* v2_10 - Qaddouri A.        - initial version
* v3_00 - Desgagne & Lee    - Lam configuration
* v3_10 - Corbeil & Desgagne & Lee - AIXport+Opti+OpenMP
* v3_11 - Corbeil L.        - new RPNCOMM transpose
*
*object
*
*arguments
*  Name        I/O                 Description
*----------------------------------------------------------------
* F_sol_8     I/O   - pre-r.h.s and result of horizontal diffusion
* F_rhs_8      I    - r.h.s. of elliptic equation and work vector
* F_Xevec_8    I    - eigenvectors
* F_a_8        I    - sub   diagonal of LU factorization
* F_b_8        I    -       diagonal of LU factorization
* F_c_8        I    - super diagonal of LU factorization
* F_deltai_8   I    - 
* minx1        I    - minimum index on local PEx for K (trp_12dmin)
* maxx1        I    - maximum index on local PEx for K (trp_12dmax)
* minx2        I    - minimum index on local PEy for I (trp_22min)
* maxx2        I    - maximum index on local PEy for I (trp_22max)
* Nx1          I    - number of points on local PEx for K (trp_12dn)
* Nx2          I    - number of points on local PEy for I (trp_22n)
* Nx3          I    - number of points along Y globally (G_nj)
* F_pwr        I    - power (high order) of factorization
* Minx         I    - minimum index on X for Rhs,Sol
* Maxx         I    - maximum index on X for Rhs,Sol
* Miny         I    - minimum index on Y for Rhs,Sol
* Maxy         I    - maximum index on Y for Rhs,Sol
* Gnk          I    - number of points along K globally (G_nk)
* Gni          I    - number of points along I globally (G_ni)
* Nil          I    - number of points on local PEx for I (l_ni)
* Njl          I    - number of points on local PEy for J (l_nj)
* Nkl          I    - number of points on local PEx for K (trp_12dn)
* F_opsxp0_8   I    - east-west operator for HO diffusion solver:Hzd_xp0_8
* F_opsyp0_8   I    - north-south operator for HO diffusion solver:Hzd_yp0_8
* F_cdiff      I    - Hzd_cdiff
* F_npex1      I    - number of processors on X
* F_npey1      I    - number of processors on Y
*
#include "ptopo.cdk"
#include "glb_ld.cdk"
#include "glb_pil.cdk"
*
      real*8, dimension(Miny:Maxy ,minx1:maxx1,Gni+F_npex) :: fdg1_8
      real*8, dimension(Miny:Maxy ,minx1:maxx1,Gni) :: fwft_8
      real*8, dimension(minx1:maxx1,minx2:maxx2,nx3+F_npey) :: fdg2_8
      real*8, dimension(minx1:maxx1,minx2:maxx2,F_pwr,nx3) ::dn3_8,sol_8
      real*8   ZERO_8, fact_8
      parameter( ZERO_8 = 0.0 )
      integer dim, o1,o2,i,j,k,l_pil_w,l_pil_e
      integer kkii,  kilon, ki0, kin, pin
**
*     __________________________________________________________________
*
*
c     call tmg_start(99,'hzd_solmxma')
      l_pil_w=0
      l_pil_e=0
      if (l_south) l_pil_w= Lam_pil_w
      if (l_north) l_pil_e= Lam_pil_e

*
*
      kilon = ( maxx2-l_pil_e-minx2+l_pil_w+1 + Ptopo_npeOpenMP)/Ptopo_npeOpenMP
      if (G_lam) then
          do k = 1, Gnk
          do j = 1+pil_s, njl-pil_n
          do i = 1+pil_w,nil-pil_e
             F_Rhs_8(i,j,k)=  ((-1)**F_pwr)*dble(F_cdiff)*dble(F_sol(i,j,k))
          enddo
          enddo
          enddo
          call rpn_comm_transpose ( F_Rhs_8, Minx, Maxx, Gni, (Maxy-Miny+1),
     %                          minx1, maxx1, Gnk, fdg1_8, 1,2 )
      else
          fact_8=((-1)**F_pwr)*dble(F_cdiff)
          call rpn_comm_transpose48 ( F_sol, Minx, Maxx, Gni,1, (Maxy-Miny+1),
     %                          (Maxy-Miny+1),minx1, maxx1, Gnk, fdg1_8,1,
     %                             fact_8,0.d0)
      endif

!$omp parallel private(kkii,ki0,kin,pin) shared(kilon)
!$omp do
      do i = 1+Lam_pil_w, Gni-Lam_pil_e
      do k = 1, nkl
      do j = 1+pil_s, njl-pil_n
         fdg1_8(j,k,i) = F_opsxp0_8(Gni+i)*fdg1_8(j,k,i)
      enddo
      enddo
      enddo
!$omp enddo
*
!$omp do
      do i = 1, Gni
         do k = minx1, maxx1 
         do j = Miny, 0+pil_s
            fdg1_8(j,k,i)= ZERO_8
         enddo
         do j = njl+1-pil_n, Maxy
            fdg1_8(j,k,i)= ZERO_8
         enddo
         do j = Miny , Maxy
            fwft_8(j,k,i)=ZERO_8
         enddo
         enddo
      enddo
!$omp enddo
*
*     projection ( wfft = x transposed * g )
*
c      do k=1,Nkl
c      call mxma8( F_Xevec_8,Gni-Lam_pil_w-Lam_pil_e, 1,
c     %      fdg1_8(Miny+pil_s,k,1+Lam_pil_w),(Maxy-Miny+1) * (maxx1-minx1+1), 1,
c     %      fwft_8(Miny+pil_s,k,1+Lam_pil_w),(Maxy-Miny+1) * (maxx1-minx1+1), 1,
c     %      Gni-Lam_pil_w-Lam_pil_e, Gni-Lam_pil_w-Lam_pil_e,
c     %      (Maxy-Miny+1-pil_s-pil_n))
c      enddo
*
!$omp do
      do k=1,Nkl
         call dgemm('N','N',(Maxy-Miny+1-pil_s-pil_n), 
     .            Gni-Lam_pil_w-Lam_pil_e,
     .            Gni-Lam_pil_w-Lam_pil_e,
     .            1._8, fdg1_8(Miny+pil_s,k,1+Lam_pil_w), 
     .            (Maxy-Miny+1) * (maxx1-minx1+1),F_Xevec_8, 
     .            Gni-Lam_pil_w-Lam_pil_e,
     .            0._8, fwft_8(Miny+pil_s,k,1+Lam_pil_w), 
     .            (maxy-miny+1) * (maxx1-minx1+1))
      enddo
!$omp enddo
!$omp single
      call rpn_comm_transpose(fwft_8,Miny,Maxy,nx3,(maxx1-minx1+1),
     %                                minx2, maxx2,Gni,fdg2_8,2,2)
!$omp end single
*
* cote droit
*
!$omp do
      do j  = 1, nx3
         do o1 = 1, F_pwr
         do i = minx2, maxx2
         do k = minx1, maxx1
            sol_8(k,i,o1,j)= ZERO_8
            dn3_8(k,i,o1,j)= ZERO_8
         enddo
         enddo
         enddo
      enddo
!$omp enddo
*
!$omp do
      do j = 1+Lam_pil_s, nx3-Lam_pil_n
      do i = 1+l_pil_w, nx2-l_pil_e
      do k = 1, nx1
         dn3_8(k,i,1,j)= F_opsyp0_8(nx3+j)*fdg2_8(k,i,j)
      enddo
      enddo
      enddo
!$omp enddo
*
* resolution du systeme blok-tridiagonal
*
* aller
!$omp do
      do o1= 1, F_pwr
      do i = 1+l_pil_w, nx2-l_pil_e
      do k = 1, nx1
         sol_8(k,i,o1,1+Lam_pil_s)=dn3_8(k,i,o1,1+Lam_pil_s)
      enddo
      enddo
      enddo
!$omp enddo
*
!$omp do
      do kkii = 1, Ptopo_npeOpenMP
         ki0 = 1 +l_pil_w + kilon*(kkii-1)
         kin = min(ki0+kilon-1, maxx2-l_pil_e)
         pin = min(ki0+kilon-1,   nx2-l_pil_e)

         do j = 2+Lam_pil_s, nx3-Lam_pil_n
            do o1= 1, F_pwr
            do o2= 1, F_pwr
            do i = ki0,kin
            do k = minx1, maxx1
               sol_8(k,i,o1,j)= sol_8(k,i,o1,j)
     $                          + F_a_8(o1,o2,i,j)*sol_8(k,i,o2,j-1)
            enddo
            enddo
            enddo
            enddo
         
            do o1= 1, F_pwr
            do i = ki0,pin
            do k = 1, nx1
               sol_8(k,i,o1,j)=dn3_8(k,i,o1,j)-sol_8(k,i,o1,j)
            enddo
            enddo
            enddo
         enddo
      enddo
!$omp enddo
*
* scale le cote droit pour retour
*
!$omp do
      do j= 1, nx3
         do o1= 1,F_pwr
         do i= minx2,maxx2
         do k= minx1,maxx1
            dn3_8(k,i,o1,j)= ZERO_8
         enddo
         enddo
         enddo
      enddo
!$omp enddo
!$omp do
      do j= 1+Lam_pil_s, nx3-Lam_pil_n
         do o2= 1, F_pwr
         do o1= 1, F_pwr
         do i = minx2+l_pil_w, maxx2-l_pil_e
         do k = minx1, maxx1
            dn3_8(k,i,o1,j)= dn3_8(k,i,o1,j)
     $                       + F_deltai_8(o1,o2,i,j)*sol_8(k,i,o2,j)
         enddo
         enddo
         enddo
         enddo
      enddo
!$omp enddo
*
* retour
*
!$omp do
      do j = 1, nx3
         if(j.eq.nx3-Lam_pil_n) then
            do o1= 1, F_pwr
            do i = 1+l_pil_w, nx2-l_pil_e
            do k = 1, nx1
               sol_8(k,i,o1,j)= dn3_8(k,i,o1,j)
            enddo
            enddo
            enddo
         else
            do o1= 1, F_pwr
            do i = 1+l_pil_w, nx2-l_pil_e
            do k = 1, nx1
               sol_8(k,i,o1,j)= 0.0
            enddo
            enddo
            enddo
         endif
      enddo
!$omp enddo 
*
!$omp do
      do kkii = 1, Ptopo_npeOpenMP
         ki0 = 1 +l_pil_w + kilon*(kkii-1)
         kin = min(ki0+kilon-1, maxx2-l_pil_e)
         pin = min(ki0+kilon-1,   nx2-l_pil_e)

         do j = nx3-1-Lam_pil_n, 1+Lam_pil_s, -1
            do o1= 1, F_pwr
            do o2= 1, F_pwr
            do i = ki0,kin
            do k = minx1, maxx1
               sol_8(k,i,o1,j)= sol_8(k,i,o1,j)
     $                       + F_c_8(o1,o2,i,j)*sol_8(k,i,o2,j+1)
            enddo
            enddo
            enddo
            enddo
            do o1= 1, F_pwr
            do i = ki0, pin
            do k = 1, nx1
               sol_8(k,i,o1,j)= dn3_8(k,i,o1,j) - sol_8(k,i,o1,j)
            enddo
            enddo
            enddo
         enddo
      enddo
!$omp enddo
*
!$omp do
      do j = 1+Lam_pil_s, nx3-Lam_pil_n
      do i = 1+l_pil_w, nx2-l_pil_e
      do k = 1, nx1
         fdg2_8(k,i,j)=sol_8(k,i,F_pwr,j)
      enddo
      enddo
      enddo
!$omp enddo
*
!$omp single
!       call tmg_stop(11)
       call rpn_comm_transpose(fwft_8,Miny,Maxy,nx3,(maxx1-minx1+1),
     %                                  minx2, maxx2,Gni,fdg2_8,-2,2)
!$omp end single
*
*     inverse projection ( r = x * w )
*
c      do k=1,Nkl
c        call mxma8( F_Xevec_8,1,Gni-Lam_pil_w-Lam_pil_e,
c     %       fwft_8(Miny+pil_s,k,1+Lam_pil_w),(Maxy-Miny+1)*(maxx1-minx1+1), 1,
c     %       fdg1_8(Miny+pil_s,k,1+Lam_pil_w),(Maxy-Miny+1)*(maxx1-minx1+1), 1,
c     %       Gni-Lam_pil_w-Lam_pil_e, Gni-Lam_pil_w-Lam_pil_e,
c     %       (Maxy-Miny+1-pil_s-pil_n) )
c      enddo
!$omp do
       do k= 1,Nkl
          call dgemm('N','T', (maxy-miny+1-pil_s-pil_n) ,
     .             gni-Lam_pil_w-Lam_pil_e,
     .             gni-Lam_pil_w-Lam_pil_e,
     .             1._8, fwft_8(Miny+pil_s,k,1+Lam_pil_w), 
     .             (maxy-miny+1) * (maxx1-minx1+1),F_Xevec_8,
     .             gni-Lam_pil_w-Lam_pil_e,
     .             0._8,fdg1_8(Miny+pil_s,k,1+Lam_pil_w), 
     .             (maxy-miny+1) * (maxx1-minx1+1))
       enddo
!$omp enddo
!$omp end parallel
      if (G_lam) then
      call rpn_comm_transpose(F_Rhs_8, Minx, Maxx, Gni, (Maxy-Miny+1),
     %                               minx1, maxx1, Gnk, fdg1_8, -1,2 )
      do k= 1, Gnk
      do j= 1+pil_s, njl-pil_n
      do i= 1+pil_w, nil-pil_e
         F_sol(i,j,k)= sngl(F_Rhs_8(i,j,k))
      enddo
      enddo
      enddo
      else
      call rpn_comm_transpose48 ( F_sol, Minx, Maxx, Gni,1, (Maxy-Miny+1),
     %                          (Maxy-Miny+1),minx1, maxx1, Gnk, fdg1_8, -1,
     %                             1.0d0,0.d0)
      endif

c     call tmg_stop(99)
*
*
*     __________________________________________________________________
*
      return
      end