!-------------------------------------- LICENCE BEGIN ------------------------------------
!Environment Canada - Atmospheric Science and Technology License/Disclaimer, 
!                     version 3; Last Modified: May 7, 2008.
!This is free but copyrighted software; you can use/redistribute/modify it under the terms 
!of the Environment Canada - Atmospheric Science and Technology License/Disclaimer 
!version 3 or (at your option) any later version that should be found at: 
!http://collaboration.cmc.ec.gc.ca/science/rpn.comm/license.html 
!
!This software is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 
!without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
!See the above mentioned License/Disclaimer for more details.
!You should have received a copy of the License/Disclaimer along with this software; 
!if not, you can write to: EC-RPN COMM Group, 2121 TransCanada, suite 500, Dorval (Quebec), 
!CANADA, H9P 1J3; or send e-mail to service.rpn@ec.gc.ca
!-------------------------------------- LICENCE END --------------------------------------
***s/r adw_tritrunc_lag3d_vec - Tri-Lagrangian (truncated) interpolation.
*
#include "model_macros_f.h"
*

      subroutine adw_tritrunc_lag3d_vec ( F_out, F_in, F_x, F_y, F_z,
     %                              F_num, F_mono_L, i0, in, j0, jn, kn )
*
      implicit none
*
      logical F_mono_L
*
      integer F_num, i0, in, j0, jn, kn
*
      real F_in(*)
*
      real F_out (F_num), F_x(F_num), F_y(F_num), F_z(F_num)
*
*authors
*     McTaggart-Cowan
*
*     (Based on adw_tricub_lag3d and adw_trilin v_3.2.1 with modifications
*     as per ECMWF http://www.ecmwf.int/research/ifsdocs/CY28r1/Dynamics/Dynamics-4-02.html)
*      
*revision
* v3_30 - McTaggart-Cowan - initial version 
*
*object
*     see id section
*
*arguments
*______________________________________________________________________
*              |                                                 |     |
* NAME         | DESCRIPTION                                     | I/O |
*--------------|-------------------------------------------------|-----|
* F_out        | result of interpolation                         |  o  |
* F_in         | field to interpolate                            |  i  |
*              |                                                 |     |
* F_x          | interpolation target X coordinate               |  i  |
* F_y          | interpolation target Y coordinate               |  i  |
* F_z          | interpolation target Z coordinate               |  i  |
*              |                                                 |     |
* F_num        | number of points to interpolate                 |  i  |
*              |                                                 |     |
* F_mono_L     | switch: .true. : monotonic interpolation        |  i  |
*______________|_________________________________________________|_____|
*
*implicits
#include "glb_ld.cdk"
#include "adw.cdk"
#include "adw_comp.cdk"
*
*notes
*  This algorithm is a truncated version of the full 3D Lagrangian
*  interpolation procedure (adw_trilag_3d).  Full Lagrangian interpolation
*  requires that the local calculation be done in a 4-point 3D cube,
*  thereby needing 64 values for each point.  This imposes an enormous
*  load on memory access during the gather/scatter operation.  In this
*  truncation, a 3D diamond rather than a cube is required for the
*  interpolation operation (32 values).  Interpolation to the points closest to the
*  back-trajectory origin is done using cubic function; however, interpolation
*  to points further from the origin are done linearly.  As a result,
*  each 3D truncated interpolation uses only 7 Lagragian interpolations
*  and 10 linear interpolations (compared to 21 Lagrangian interpolations
*  in the full 3D Lagrangian algorithm).
*
*  The organization of the diamond is shown in plan form here, along with
*  the order of each interpolation for the 'inner' layers (immediately above
*  and below the point of interest, indeces k and k+1), and 'outer' layers
*  (indeces k-1 and k+2).  The origin of the back-trajectory is denoted with
*  {} braces around the interpolation method used in the y-direction to
*  obtain it.  Both layers are plotted in the horizontal plane.  Cube points
*  not accessed are denoted with a '0', and those addressed and used by the
*  truncated algorithm are denoted with an 'X'.
*
*  Inner layers 2 x (2 linear; 3 cubic):
*	0		X -- linear  -- X		0
*				|
*				|
*				|
*	X ------------- X --- cubic --- X ------------- X
*				|
*			     {cubic}
*				|
*       X ------------- X --- cubic --- X ------------- X
*				|
*				|
*				|
*	0		X -- linear  -- X		0
*
*  Outer layers 2 x (3 linear):
*	0		0		0		0
*
*
*
*	0		X -- linear  -- X		0
*			 	|
*			    {linear}
*				|
*	0		X -- linear  -- X		0
*
*
*
*	0		0		0		0
*
*  The vertical interpolation is cubic (Lagrangian) through the
*  four points obtained using the layers shown above.  This interpolation
*  constitues the 7th (and final) higher order interpolation performed
*  by the truncated algorithm.
*
* **********************************************************************
*     Internal declarations
* **********************************************************************
* Statement functions
      real za
      real(kind=8) :: triprd,zb,zc,zd
      triprd(za,zb,zc,zd)=(za-zb)*(za-zc)*(za-zd)
* Standard declarations
      integer :: nijk,nijag,i,j,k,nij,iimax,jjmax,kkmax,cnt,err,m
      integer, dimension(:), allocatable, save :: ii,jj,kk,n
      integer, dimension(:,:), allocatable, save :: o1,o2,o3,o4
      real, dimension(:), allocatable :: prmax,prmin
      real(kind=8),dimension(:,:), allocatable, save :: cap
      real(kind=8), dimension(:,:), allocatable :: a,b,c,d,p
      real(kind=8), dimension(:,:), allocatable, save :: ra,rb,rc,rd
      logical :: init=.true.
      logical, dimension(:), allocatable, save :: zcubic_L
*
*     ----------------------------------------------------
*
* **********************************************************************
*     Local grid sizing
* **********************************************************************
      nij   = l_ni*l_nj
      nijag = Adw_nit * Adw_njt
      nijk  = (kn-1)*nij + ((jn-1)*l_ni) + in
      cnt   = kn * (jn-j0+1) * (in-i0+1)
*
      iimax = G_ni+2*Adw_halox-2
      jjmax = G_nj+Adw_haloy
      kkmax = l_nk-1
*
* **********************************************************************
*     Initialize local positional arrays on startup
* **********************************************************************
      if (init) then
*
	allocate(ra(cnt,3),rb(cnt,3),rc(cnt,3),rd(cnt,3),cap(cnt,3),stat=err)
	if (err /= 0) write(6,*) "ADW_TRICUB_LAG3D_VEC: Allocation error (D,S)"
	allocate(ii(cnt),jj(cnt),kk(cnt),o1(cnt,4),o2(cnt,4),o3(cnt,4),o4(cnt,4),stat=err)
	if (err /= 0) write(6,*) "ADW_TRICUB_LAG3D_VEC: Allocation error (I,S)"
	allocate(zcubic_L(cnt),stat=err)
	if (err /= 0) write(6,*) "ADW_TRICUB_LAG3D_VEC: Allocation error (L,S)"
	allocate(n(cnt),stat=err)
	if (err /= 0) write(6,*) "ADW_TRICUB_LAG3D_VEC: Allocation error (I,S)"
	m = 1
	do k=1,kn
	  do j=j0,jn
	    do i=i0,in
	      n(m) = (k-1)*nij + ((j-1)*l_ni) + i
	      m = m+1
	    enddo
	  enddo
	enddo
*
	init = .false.			!execute on startup only
	adw_comp_cub_L = .true.		!need to initialize values (below)
*
      endif
*
* **********************************************************************
*	Set local array sizes and allocate
* *******************************************************************
      allocate(prmax(cnt),prmin(cnt),stat=err)
      if (err /= 0) write(6,*) "ADW_TRICUB_LAG3D_VEC: Allocation error (R,L)"
      allocate(a(cnt,2:3),b(cnt,4),c(cnt,4),d(cnt,2:3),p(cnt,4),stat=err)
      if (err /= 0) write(6,*) "ADW_TRICUB_LAG3D_VEC: Allocation error (D,L)"
*
* *******************************************************************
* 	Set positional parameters if required
* *******************************************************************
      if (adw_comp_cub_L) then
	do m=1,cnt
* Prepare x points
          ii(m) = ( F_x(n(m)) - Adw_x00_8 ) * Adw_ovdx_8
          ii(m) = Adw_lcx( ii(m)+1 ) + 1
	  if (F_x(n(m)) < Adw_bsx_8(ii(m))) ii(m) = ii(m) - 1
	  ii(m) = max(2,min(ii(m),iimax))
	  cap(m,1) = (F_x(n(m)) - Adw_bsx_8(ii(m))) / (Adw_bsx_8(ii(m)+1) - Adw_bsx_8(ii(m)))
* Prepare y points
	  jj(m) = ( F_y(n(m)) - Adw_y00_8 ) * Adw_ovdy_8
	  jj(m) = Adw_lcy( jj(m)+1 ) + 1
	  if (F_y(n(m)) < Adw_bsy_8(jj(m))) jj(m) = jj(m) - 1
	  jj(m) = max(Adw_haloy,min(jj(m),jjmax))
	  cap(m,2) = (F_y(n(m)) - Adw_bsy_8(jj(m))) / (Adw_bsy_8(jj(m)+1) - Adw_bsy_8(jj(m)))
* Prepare z points
	  kk(m) = ( F_z(n(m)) - Adw_z00_8 ) * Adw_ovdz_8
	  kk(m) = Adw_lcz( kk(m)+1 )
	  if (F_z(n(m)) < Adw_bsz_8(kk(m))) kk(m) = kk(m) - 1
	  kk(m) = min(kkmax-1,max(0,kk(m)))
	  cap(m,3) = (F_z(n(m)) - Adw_bsz_8(kk(m))) / (Adw_bsz_8(kk(m)+1) - Adw_bsz_8(kk(m)))
*
	  zcubic_L(m) = (kk(m) > 0) .and. (kk(m) < kkmax-1)
*
      	  o2(m,1) = (kk(m)-1)*nijag + (jj(m)-Adw_int_j_off-1)*Adw_nit + (ii(m)-Adw_int_i_off)
      	  o1(m,1) = o2(m,1)-Adw_nit	
      	  o3(m,1) = o2(m,1)+Adw_nit	
      	  o4(m,1) = o3(m,1)+Adw_nit
	  o1(m,2) = o1(m,1)+nijag  !unroll loop for vectorization
	  o2(m,2) = o2(m,1)+nijag
	  o3(m,2) = o3(m,1)+nijag
	  o4(m,2) = o4(m,1)+nijag
	  o1(m,3) = o1(m,2)+nijag
	  o2(m,3) = o2(m,2)+nijag
	  o3(m,3) = o3(m,2)+nijag
	  o4(m,3) = o4(m,2)+nijag
	  o1(m,4) = o1(m,3)+nijag
	  o2(m,4) = o2(m,3)+nijag
	  o3(m,4) = o3(m,3)+nijag
	  o4(m,4) = o4(m,3)+nijag
*
      	  ra(m,1) = Adw_bsx_8(ii(m)-1)
      	  rb(m,1) = Adw_bsx_8(ii(m)  )
      	  rc(m,1) = Adw_bsx_8(ii(m)+1)
      	  rd(m,1) = Adw_bsx_8(ii(m)+2)
      	  ra(m,2) = Adw_bsy_8(jj(m)-1)
	  rb(m,2) = Adw_bsy_8(jj(m)  )
      	  rc(m,2) = Adw_bsy_8(jj(m)+1)
      	  rd(m,2) = Adw_bsy_8(jj(m)+2)
	  if (zcubic_L(m)) then
	    ra(m,3) = Adw_bsz_8(kk(m)-1)
      	    rb(m,3) = Adw_bsz_8(kk(m)  )
      	    rc(m,3) = Adw_bsz_8(kk(m)+1)
      	    rd(m,3) = Adw_bsz_8(kk(m)+2)
	  endif
	enddo
*
        adw_comp_cub_L = .false.	!save position until next request
*
      endif
*
* *********************************************************************
*	Begin main interpolation loops
* *********************************************************************
      do m=1,cnt
*
* *********************************************************************
*     x interpolation
* *********************************************************************
      	p(m,1) = triprd(F_x(n(m)),rb(m,1),rc(m,1),rd(m,1))*Adw_xabcd_8(ii(m))
      	p(m,2) = triprd(F_x(n(m)),ra(m,1),rc(m,1),rd(m,1))*Adw_xbacd_8(ii(m))
      	p(m,3) = triprd(F_x(n(m)),ra(m,1),rb(m,1),rd(m,1))*Adw_xcabd_8(ii(m))
      	p(m,4) = triprd(F_x(n(m)),ra(m,1),rb(m,1),rc(m,1))*Adw_xdabc_8(ii(m))
*
        if (zcubic_L(m)) then
	  a(m,2) = (1.d0 - cap(m,1)) * F_in(o2(m,1)) + cap(m,1) * F_in(o2(m,1)+1)
	  a(m,3) = (1.d0 - cap(m,1)) * F_in(o3(m,1)) + cap(m,1) * F_in(o3(m,1)+1)
*
	  d(m,2) = (1.d0 - cap(m,1)) * F_in(o2(m,4)) + cap(m,1) * F_in(o2(m,4)+1)
	  d(m,3) = (1.d0 - cap(m,1)) * F_in(o3(m,4)) + cap(m,1) * F_in(o3(m,4)+1)
        endif
*
	b(m,1) = (1.d0 - cap(m,1)) * F_in(o1(m,2)) + cap(m,1) * F_in(o1(m,2)+1)
      	b(m,2) = p(m,1) * F_in  (o2(m,2)-1) + p(m,2) * F_in  (o2(m,2)) + p(m,3) * F_in  (o2(m,2)+1) + p(m,4) * F_in  (o2(m,2)+2)
      	b(m,3) = p(m,1) * F_in  (o3(m,2)-1) + p(m,2) * F_in  (o3(m,2)) + p(m,3) * F_in  (o3(m,2)+1) + p(m,4) * F_in  (o3(m,2)+2)
	b(m,4) = (1.d0 - cap(m,1)) * F_in(o4(m,2)) + cap(m,1) * F_in(o4(m,2)+1)
*
	c(m,1) = (1.d0 - cap(m,1)) * F_in(o1(m,3)) + cap(m,1) * F_in(o1(m,3)+1)
      	c(m,2) = p(m,1) * F_in  (o2(m,3)-1) + p(m,2) * F_in  (o2(m,3)) + p(m,3) * F_in  (o2(m,3)+1) + p(m,4) * F_in  (o2(m,3)+2)
     	c(m,3) = p(m,1) * F_in  (o3(m,3)-1) + p(m,2) * F_in  (o3(m,3)) + p(m,3) * F_in  (o3(m,3)+1) + p(m,4) * F_in  (o3(m,3)+2)
	c(m,4) = (1.d0 - cap(m,1)) * F_in(o4(m,3)) + cap(m,1) * F_in(o4(m,3)+1)
*
      enddo
*
* *********************************************************************
*     y interpolation
* *********************************************************************
      do m=1,cnt
      	p(m,1) = triprd(F_y(n(m)),rb(m,2),rc(m,2),rd(m,2))*Adw_yabcd_8(jj(m))
      	p(m,2) = triprd(F_y(n(m)),ra(m,2),rc(m,2),rd(m,2))*Adw_ybacd_8(jj(m))
      	p(m,3) = triprd(F_y(n(m)),ra(m,2),rb(m,2),rd(m,2))*Adw_ycabd_8(jj(m))
      	p(m,4) = triprd(F_y(n(m)),ra(m,2),rb(m,2),rc(m,2))*Adw_ydabc_8(jj(m))
*
    	if (zcubic_L(m)) then
	  a(m,2) = (1.d0 - cap(m,2)) * a(m,2) + cap(m,2) * a(m,3)
	  d(m,2) = (1.d0 - cap(m,2)) * d(m,2) + cap(m,2) * d(m,3)
	endif
*
      	b(m,1) = p(m,1) * b(m,1) + p(m,2) * b(m,2) + p(m,3) * b(m,3) + p(m,4) * b(m,4)
      	c(m,1) = p(m,1) * c(m,1) + p(m,2) * c(m,2) + p(m,3) * c(m,3) + p(m,4) * c(m,4)
      enddo
*
* *********************************************************************
*     z interpolation
* *********************************************************************
      do m=1,cnt
	if (zcubic_L(m)) then
	  p(m,1) = triprd(F_z(n(m)),rb(m,3),rc(m,3),rd(m,3))*Adw_zabcd_8(kk(m)+1)
	  p(m,2) = triprd(F_z(n(m)),ra(m,3),rc(m,3),rd(m,3))*Adw_zbacd_8(kk(m)+1)
          p(m,3) = triprd(F_z(n(m)),ra(m,3),rb(m,3),rd(m,3))*Adw_zcabd_8(kk(m)+1)
	  p(m,4) = triprd(F_z(n(m)),ra(m,3),rb(m,3),rc(m,3))*Adw_zdabc_8(kk(m)+1)
*
	  p(m,1) = p(m,1) * a(m,2) + p(m,2) * b(m,1) + p(m,3) * c(m,1) + p(m,4) * d(m,2) 	!recycle p1
*
	else	
*	
	  p(m,1) = (1.d0 - cap(m,3)) * b(m,1) + cap(m,3) * c(m,1)  !recycle p1
*
	endif
*
* *********************************************************************
*	End of main loops
* *********************************************************************
      enddo
*
* *********************************************************************
* 	Final data assignment
* *********************************************************************
      if (.not.F_mono_L) then
	do m=1,cnt
	  F_out(n(m)) = p(m,1)
	enddo
      else
	do m=1,cnt
      	  prmax(m) = max(F_in(o2(m,2)),F_in(o2(m,2)+1),F_in(o3(m,2)),F_in(o3(m,2)+1))
      	  prmin(m) = min(F_in(o2(m,2)),F_in(o2(m,2)+1),F_in(o3(m,2)),F_in(o3(m,2)+1))
	  prmax(m) = max(prmax(m),F_in(o2(m,3)),F_in(o2(m,3)+1),F_in(o3(m,3)),F_in(o3(m,3)+1))
      	  prmin(m) = min(prmin(m),F_in(o2(m,3)),F_in(o2(m,3)+1),F_in(o3(m,3)),F_in(o3(m,3)+1))
	  F_out(n(m)) = max (dble(prmin(m)),min(dble(prmax(m)),p(m,1)))
	enddo
      endif
*
* *********************************************************************
* 	Clear stack
* *********************************************************************
      deallocate(prmin,prmax)
      if (err /= 0) write(6,*) "ADW_TRICUB_LAG3D_VEC: Deallocation error (R,L)"
      deallocate(a,b,c,d,p,stat=err)
      if (err /= 0) write(6,*) "ADW_TRICUB_LAG3D_VEC: Deallocation error (D,L)"
*
      return
      end