!--------------------------------------- LICENCE BEGIN -----------------------------------
!Environment Canada - Atmospheric Science and Technology License/Disclaimer,
!                     version 3; Last Modified: May 7, 2008.
!This is free but copyrighted software; you can use/redistribute/modify it under the terms
!of the Environment Canada - Atmospheric Science and Technology License/Disclaimer
!version 3 or (at your option) any later version that should be found at:
!http://collaboration.cmc.ec.gc.ca/science/rpn.comm/license.html
!
!This software is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
!without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
!See the above mentioned License/Disclaimer for more details.
!You should have received a copy of the License/Disclaimer along with this software;
!if not, you can write to: EC-RPN COMM Group, 2121 TransCanada, suite 500, Dorval (Quebec),
!CANADA, H9P 1J3; or send e-mail to service.rpn@ec.gc.ca
!-------------------------------------- LICENCE END --------------------------------------

!--------------------------------------------------------------------------
! MODULE minimization (Minimization for variational assimilation.  prefix="min")
!
! Purpose: 
!
! Subroutines (public):
!    min_setup
!    min_minvar
!
! Dependencies:
!
!--------------------------------------------------------------------------

MODULE minimization_mod 5,10
  use MathPhysConstants_mod
  use timeCoord_mod
  use columnData_mod
  use obsSpaceData_mod
  use controlVector_mod
  use mpivar_mod
  use HorizontalCoord_mod
  use gridStateVector_mod
  use bmatrix_mod
  use tovs_nl_mod
  implicit none
  save
  private

  ! public variables
  public              :: min_lvarqc,min_niter,min_nsim
  ! public procedures
  public              :: min_Setup, min_minvar

  type struct_dataptr
    type(struct_obs),pointer        :: lobsSpaceData
    type(struct_columnData),pointer :: lcolumn
    type(struct_columnData),pointer :: lcolumng
  end type struct_dataptr

  logical             :: initialized = .false.
  real*8, pointer     :: dg_vbar(:)

  integer             :: nmtra,nwork,min_nsim
  integer             :: nvadim_mpilocal ! for mpi
  integer             :: min_niter
  integer             :: dataptr_int_size=0
  integer,external    :: get_max_rss
  logical             :: preconFileExists
  character(len=20)   :: preconFileName    = './preconin'  
  character(len=20)   :: preconFileNameOut = './pm1q'  
  integer             :: n1gc = 3

  ! namelist variables
  INTEGER NVAMAJ,NITERMAX,NSIMMAX,NCYCLEHESSMODE,NITERMAX_PERT
  LOGICAL lxbar,lwrthess,lgrtest,lvazx
  REAL*8 REPSG,rdf1fac
  LOGICAL min_lvarqc,lvarqc
  integer :: nwoqcv

  NAMELIST /NAMMIN/NVAMAJ,NITERMAX,NSIMMAX,NCYCLEHESSMODE,NITERMAX_PERT
  NAMELIST /NAMMIN/LGRTEST
  NAMELIST /NAMMIN/lxbar,lwrthess,lvazx
  NAMELIST /NAMMIN/REPSG,rdf1fac
  NAMELIST /NAMMIN/LVARQC,NWOQCV

CONTAINS


  SUBROUTINE min_setup(nvadim_mpilocal_in) 1,3
    implicit none
    integer :: nvadim_mpilocal_in

    integer :: ierr,nulnam
    integer :: fnom,fclos

    if(nvadim_mpilocal_in.ne.cvm_nvadim) then
      write(*,*) 'nvadim_mpilocal,cvm_nvadim=',nvadim_mpilocal,cvm_nvadim
      call abort3d('aborting in min_setup: control vector dimension not consistent')
    endif

    nvadim_mpilocal=nvadim_mpilocal_in

    ! set default values for namelist variables
    nvamaj = 6
    nitermax = 400
    nitermax_pert = -1
    rdf1fac  = 0.25d0
    nsimmax  = 500
    lgrtest  = .false.
    lwrthess = .true.
    lxbar    = .false.
    lvazx    = .false.
    repsg    = 1d-5
    lvarqc   = .false.
    nwoqcv   = 5
    ncyclehessmode = 0

    ! read in the namelist NAMMIN
    nulnam=0
    ierr=fnom(nulnam,'./flnml','FTN+SEQ+R/O',0)
    read(nulnam,nml=nammin,iostat=ierr)
    if(ierr.ne.0) call abort3d('min_setup: Error reading namelist')
    write(*,nml=nammin)
    ierr=fclos(nulnam)

    ! if nitermax_pert not specified, make it equal to value for control analysis
    if(nitermax_pert.le.0) nitermax_pert = nitermax

    min_lvarqc=lvarqc

    IF(N1GC.EQ.3)THEN
      NMTRA = (4 + 2*NVAMAJ)*nvadim_mpilocal
    ELSE
      call abort3d('min_setup: only N1GC=3 currently supported!') 
    END IF
    WRITE(*,9401)N1GC,NVAMAJ,NMTRA
 9401 FORMAT(4X,'N1GC = ',I2,4X,'NVAMAJ = ',I3,/5X,"NMTRA =",1X,I14)

    if(LVARQC .and. mpi_myid.eq.0) write(*,*) 'VARIATIONAL QUALITY CONTROL ACTIVATED.'

    initialized=.true.

  END SUBROUTINE min_setup



  SUBROUTINE min_minvar(lcolumn,lcolumng,lobsSpaceData,isControlAnalysis) 1,19
    !
    ! Purpose:
    ! 3D/En VAR minimization
    !
      IMPLICIT NONE

      type(struct_columnData),target :: lcolumn,lcolumng
      type(struct_obs),target :: lobsSpaceData
      logical :: isControlAnalysis
      type(struct_dataptr) :: dataptr 
      integer,allocatable  :: dataptr_int(:) ! obs array used to transmit pointer
      integer              :: nulout = 6
      integer              :: impres
      INTEGER              :: NGRANGE = 10 ! range of powers of 10 used for gradient test

      INTEGER IZS(1),IZTRL(10)
      REAL ZZSUNUSED(1)

      real*8,allocatable :: vazg(:)
      real*8,allocatable :: vazx(:)
      real*8,allocatable :: vatra(:)

      real*8 :: dlds(1)
      logical :: lltest, llvarqc, lldf1, lrdvatra, llvazx, llxbar

      integer :: imode, itermax, iterdone, itermaxtodo, isimmax, indic, iitnovqc, jj
      integer :: ierr, itertot, isimdone, isimtot, jdata, isimnovqc
      integer :: ibrpstamp, isim3d, ilen
      real*8 :: zjsp, zxmin, zdf1, zeps0, zeps1
      real*8 :: dlgnorm, dlxnorm, zjotov

      integer fnom,fclos, remove_c
      external fnom,fclos
      external  n1qn3
      real*8 :: zeps0_000,zdf1_000
      integer :: iterdone_000,isimdone_000
      character(len=128) :: clfname
 
      min_nsim=0 

      if(mpi_myid.eq.0) then
        impres=5
      else 
        impres=0
      endif

      ! change name of precon file to read if not control analysis and ncyclehessmode > 0 and hessian was written
      if(.not.isControlAnalysis .and. nCycleHessMode.gt.0 .and. lwrthess) then
        preconFileName = preconFileNameOut
        if(mpi_myid.eq.0) write(*,*) 'min_minvar: setting the precon file name to be read to: ',preconFileName
      endif

      ! Check for preconditioning file
      inquire(file=preconFileName,exist=preconFileExists)
      if(preconFileExists) then
        if(mpi_myid.eq.0) write(*,*) 'PRECONDITIONING FILE FOUND:',preconFileName
      else
        if(mpi_myid.eq.0) write(*,*) 'NO PRECONDITIONING FILE FOUND:',preconFileName
      endif

      ! deactivate VARQC if this is not a control analysis
      if(.not.isControlAnalysis) then
        if(mpi_myid.eq.0) write(*,*) 'min_minvar: Variational QC de-activated for ensemble analyses!'
        if(mpi_myid.eq.0) write(*,*) ' '
        lvarqc = .false.
        min_lvarqc=lvarqc
      endif

      ! allocate control vector related arrays (these are all mpilocal)

      allocate(vazx(nvadim_mpilocal),stat=ierr)
      if(ierr.ne.0) then
        write(*,*) 'minimization: Problem allocating memory! id=3',ierr
        call abort3d('aborting in min_minvar')
      endif

      allocate(dg_vbar(nvadim_mpilocal),stat=ierr)
      if(ierr.ne.0) then
        write(*,*) 'minimization: Problem allocating memory! id=1',ierr
        call abort3d('aborting in min_minvar')
      endif

      allocate(vazg(nvadim_mpilocal),stat=ierr)
      if(ierr.ne.0) then
        write(*,*) 'minimization: Problem allocating memory! id=2',ierr
        call abort3d('aborting in min_minvar')
      endif

      allocate(vatra(nmtra),stat=ierr)
      if(ierr.ne.0.or.nmtra.le.0) then
        write(*,*) 'minimization: Problem allocating memory! id=4',ierr
        write(*,*) 'minimization: nmtra=',nmtra
        call abort3d('aborting in min_minvar')
      endif

      ! recast pointer to obsSpaceData as an integer array, so it can be passed through n1qn3 to simvar
      dataptr%lobsSpaceData => lobsSpaceData
      dataptr%lcolumn       => lcolumn
      dataptr%lcolumng      => lcolumng
      dataptr_int_size = size(transfer(dataptr,dataptr_int))
      allocate(dataptr_int(dataptr_int_size),stat=ierr)
      if(ierr.ne.0) then
        write(*,*) 'minimization: Problem allocating memory! id=2',ierr
        call abort3d('aborting in min_minvar')
      endif
      dataptr_int(1:dataptr_int_size)=transfer(dataptr,dataptr_int)

      ! Set-up the minimization

      ! initialize iteration/simulation counters to zero
      ITERTOT  = 0
      isimtot = 0

      ! initialize control vector related arrays to zero
      vazx(:)=0.0d0
      ! extra mpilocal portion of control vector: NOT NEEDED?
      !vazx(1:nvadim_mpilocal)=cvm_vazx(1:nvadim_mpilocal)
      dg_vbar(:)=0.0d0
      vazg(:)=0.0d0
      vatra(:)=0.0d0

      ! save user-requested varqc switch
      llvarqc = lvarqc

      ! If minimization start without qcvar : turn off varqc to compute
      ! innovations and test the gradients

      lldf1 = .true.
      if (preconFileExists) then
        if(mpi_myid.eq.0) write(*,*) 'min_minvar : Preconditioning mode'
        lrdvatra = .true.
        imode = 2
        llvazx = lvazx ! from namelist (default is .false.)
        llxbar = lxbar ! from namelist (default is .false.)
      else
        lrdvatra = .false.
        imode = 0
        zeps0 = repsg
      endif

      ! read the hessian from preconin file
      if (lrdvatra) then
        ibrpstamp = tim_getDatestamp() ! ibrpstamp is a I/O argument of rw_vatra

        call rw_vatra (preconFileName,0,                     &
          isim3d,ibrpstamp,zeps0_000,zdf1_000,iterdone_000,  &
          isimdone_000,iztrl,vatra,dg_vbar,                  &
          vazx,llxbar,llvazx,n1gc,imode)

        if (ibrpstamp == tim_getDatestamp() .and. lxbar) then
          ! use vbar for true outer loop
          zeps0  = zeps0_000
          zdf1   = zdf1_000
          lldf1 = .false.     ! don't re-compute df1 base on Cost function
        else
          zeps0 = repsg
          lldf1 = .true.      ! Compute df1 base on Cost function
        endif
      endif

      iterdone = 0
      isimdone = 0
      if(isControlAnalysis) then
        itermax = nitermax
      else
        itermax = nitermax_pert
      endif
      itermaxtodo = itermax
      isimmax = nsimmax

      if (nwoqcv > 0) lvarqc = .false.

      ! do the gradient test for the starting point of minimization
      if(lgrtest .and. isControlAnalysis) then
        call grtest2(simvar,nvadim_mpilocal,vazx,ngrange,dataptr_int)
      endif

      zeps1 = zeps0

      itertot = iterdone
      isimtot = isimdone

      INDIC =2
      call simvar(indic,nvadim_mpilocal,vazx,zjsp,vazg,dataptr_int(1))

      if (lldf1) ZDF1     =  rdf1fac * ABS(ZJSP)

!     Put QCVAR logical to its original values
      lvarqc=llvarqc

      CALL PRSCAL(nvadim_mpilocal,VAZG,VAZG,DLGNORM)
      DLGNORM = DSQRT(DLGNORM)
      CALL PRSCAL(nvadim_mpilocal,VAZX,VAZX,DLXNORM)
      DLXNORM = DSQRT(DLXNORM)
      WRITE(*,*)' |X| = ', DLXNORM
      WRITE(*,FMT=9220) ZJSP, DLGNORM
 9220 FORMAT(/4X,'J(X) = ',G23.16,4X,'|Grad J(X)| = ',G23.16)

      ! Iterations of the minimization algorithm

      ZXMIN = epsilon(ZXMIN)
      WRITE(*,FMT=9320)ZXMIN,ZDF1,ZEPS0,IMPRES,ITERMAX,NSIMMAX

 9320 FORMAT(//,10X,' Minimization N1QN3 starts ...',/  &
             10x,'DXMIN =',G23.16,2X,'DF1 =',G23.16,2X,'EPSG =',G23.16  &
             /,10X,'IMPRES =',I3,2X,'NITER = ',I3,2X,'NSIM = ',I3)

      ! Begin the minimization

      ! First do iterations without var-QC
      if (lvarqc .and. nwoqcv > 0 .and. iterdone < nwoqcv) then
        iitnovqc = min(nwoqcv - iterdone,itermax)
        isimnovqc = isimmax
        lvarqc = .false.
        call tmg_start(70,'QN')
        call n1qn3(simvar, dscalqn, dcanonb, dcanab, nvadim_mpilocal, vazx,  &
            zjsp,vazg, zxmin, zdf1, zeps1, impres, nulout, imode,       &
            iitnovqc, isimnovqc ,iztrl, vatra, nmtra, dataptr_int(1),   &
            zzsunused, dlds)
        call tmg_stop(70)
        call fool_optimizer(lobsSpaceData)

        isimnovqc = isimnovqc - 1
        itermaxtodo = itermaxtodo - iitnovqc + 1
        isimmax = isimmax - isimnovqc + 1

        itertot = itertot + iitnovqc
        isimtot = isimtot + isimnovqc

        zeps1 = zeps0/zeps1
        zeps0 = zeps1
        lvarqc = .true.

        if (imode == 4 .and. itertot < itermax) then
          imode = 2
          INDIC = 2
          call simvar(indic,nvadim_mpilocal,vazx,zjsp,vazg,dataptr_int(1))
        else
          call abort3d(" MINIMIZATION_MOD: n1qn3 mode ne 4")
        endif
      endif

      ! Now do main minimization with var-QC
      call tmg_start(70,'QN')
      call n1qn3(simvar, dscalqn, dcanonb, dcanab, nvadim_mpilocal, vazx,  &
          zjsp,vazg, zxmin, zdf1, zeps1, impres, nulout, imode,   &
          itermaxtodo,isimmax, iztrl, vatra, nmtra, dataptr_int(1), zzsunused,   &
          dlds)
      call tmg_stop(70)
      call fool_optimizer(lobsSpaceData)

      itertot = itertot + itermaxtodo
      isimtot = isimtot + isimmax

      zeps1 = zeps0/zeps1

      WRITE(*,FMT=9500) imode,iterdone,itertot-iterdone,itertot,isimdone,isimtot-isimdone,isimtot
 9500 FORMAT(//,20X,20('*'),2X    &
        ,/,20X,'              Minimization ended with MODE:',I4  &
        ,/,20X,' Number of iterations done in previous job:',I4  &
        ,/,20X,'          Number of iterations in this job:',I4  &
        ,/,20X,'                Total number of iterations:',I4  &
        ,/,20X,'Number of simulations done in previous job:',I4  &
        ,/,20X,'         Number of simulations in this job:',I4  &
        ,/,20X,'               Total number of simulations:',I4)

      min_niter = itertot

      ! Test the gradient at the final point
      if ((lgrtest .and. isControlAnalysis)) then
        WRITE(*,FMT=9400)
 9400   FORMAT(//,12X,40('**'),/,12X,'TESTING THE GRADIENT AT THE FINAL POINT',/,40('**'))
        call grtest2(simvar,nvadim_mpilocal,vazx,ngrange,dataptr_int)
      end if

      do jdata = 1, nvadim_mpilocal
        dg_vbar(jdata) = vazx(jdata) + dg_vbar(jdata)
      enddo

      ! Write out the Hessian to file (only control member writes when nCycleHessMode is not 1)
      if(lwrthess .and. (isControlAnalysis .or. nCycleHessMode.eq.1) ) then
        if(mpi_myid.eq.0) write(*,*) 'Memory Used: ',get_max_rss()/1024,'Mb'
        call rw_vatra (preconFileNameOut,1,  &
          min_nsim,tim_getDatestamp(),zeps1,zdf1,itertot,isimtot,  &
          iztrl,vatra,dg_vbar,vazx,.true.,llvazx,n1gc,imode)
        if(mpi_myid.eq.0) write(*,*) 'Memory Used: ',get_max_rss()/1024,'Mb'
      endif

      ! Put final control vector into control vector module for use elsewhere
      call tmg_start(89,'MIN_COMM')
      cvm_vazx(:)=vazx(1:nvadim_mpilocal)      
      call tmg_stop(89)

      ! deallocate the control vector related arrays
      deallocate(vazx)
      deallocate(vazg)
      deallocate(vatra)
      deallocate(dg_vbar)

      deallocate(dataptr_int)

  END SUBROUTINE min_minvar



  SUBROUTINE simvar(na_indic,na_dim,da_v,da_J,da_gradJ,dataptr_int) 2,21
    implicit none
    ! Argument declarations
    integer :: na_dim ! Dimension of the control vector in forecast error coraviances space
    ! Value of na_indic
    ! Note: 1 and 4 are reserved values for call back from m1qn3.
    !       For direct calls use other value than 1 and 4.
    ! =1 No action taken; =4 Both J(u) and its gradient are computed.
    ! =2 Same as 4 (compute J and gradJ) but do not interrupt timer of the
    !    minimizer.
    ! =3 Compute Jo and gradJo only.
    integer :: na_indic 
    real*8  :: da_J ! Cost function of the Variational algorithm
    real*8, dimension(na_dim) :: da_gradJ ! Gradient of the Variational Cost funtion
    real*8, dimension(na_dim) :: da_v ! Control variable in forecast error covariances space
    integer :: dataptr_int(dataptr_int_size)  ! integer work area used to transmit a pointer to the obsSpaceData
    !
    ! Purpose: Implement the Variational solver as described in
    ! Courtier, 1997, Dual formulation of four-dimentional variational assimilation,
    ! Q.J.R., pp2449-2461.
    !
    ! Author : Simon Pellerin *ARMA/MSC October 2005
    !          (Based on previous versions of evaljo.ftn, evaljg.ftn and evaljgns.ftn).
    !
    ! Local declaration
    integer :: ierr,jj
    real*8, dimension(na_dim) :: dl_v
    real*8 :: dl_Jb, dl_Jo
    type(struct_gsv) :: statevector
    type(struct_dataptr) :: dataptr
    type(struct_obs),pointer :: lobsSpaceData
    type(struct_columnData),pointer :: lcolumn,lcolumng
    type(struct_hco), pointer :: hco_anl

    ! Convert the integer array dataptr_int back into a pointer to the obsSpaceData
    dataptr=transfer(dataptr_int(1:dataptr_int_size),dataptr)
    lobsSpaceData => dataptr%lobsSpaceData
    lcolumn       => dataptr%lcolumn
    lcolumng      => dataptr%lcolumng

    if (na_indic .eq. 1 .or. na_indic .eq. 4) call tmg_stop(70)

    call tmg_start(80,'MIN_SIMVAR')
    if (na_indic .ne. 1) then ! No action taken if na_indic == 1
       min_nsim = min_nsim + 1

       if(mpi_myid == 0) then
         write(*,*) 'Entering simvar for simulation ',min_nsim
         write(*,*) 'Memory Used: ',get_max_rss()/1024,'Mb'
         call flush(6)
       endif

       ! note: dg_vbar = sum(v) of previous outer-loops
       dl_v(1:nvadim_mpilocal) = da_v(1:nvadim_mpilocal) + dg_vbar(1:nvadim_mpilocal)     

       ! Computation of background term of cost function:
       dl_Jb = dot_product(dl_v(1:nvadim_mpilocal),dl_v(1:nvadim_mpilocal))/2.d0  
       call tmg_start(89,'MIN_COMM')       
       call mpi_allreduce_sumreal8scalar(dl_Jb,"GRID")
       call tmg_stop(89)

       call gsv_setVco(statevector,col_getVco(lcolumng))
       hco_anl => hco_get('Analysis')
       call gsv_setHco(statevector,hco_anl)
       call gsv_allocate(statevector,tim_nstepobsinc,mpi_local=.true.)

       call bmat_sqrtB(da_v,nvadim_mpilocal,statevector)

       call tmg_start(30,'OBS_INTERP')
       call oda_L(statevector,lcolumn,lcolumng,lobsSpaceData)  ! put in column H_horiz dx
       call tmg_stop(30)

       call tmg_start(40,'OBS_TL')
       call oda_H(lcolumn,lcolumng,lobsSpaceData)  ! Save as OBS_WORK: H_vert H_horiz dx = Hdx
       call tmg_stop(40)

       call oda_res(lobsSpaceData)  ! Calculate OBS_OMA from OBS_WORK : Hdx-d
     
       call oda_sqrtRm1(lobsSpaceData,OBS_WORK,OBS_OMA)  ! Save as OBS_WORK : R**-1/2 (Hdx-d)
     
       call oda_Jo(lobsSpaceData)  ! Store J-obs in OBS_JOBS : 1/2 * R**-1 (Hdx-d)**2
     
       IF (LVARQC) THEN          
          call oda_qcv(lobsSpaceData)  ! Store modified J-obs in OBS_JOBS : -ln((gamma-exp(J))/(gamma+1)) 
       endif

       dl_Jo = 0.d0
       call oda_sumJo(lobsSpaceData,dl_Jo)
       da_J = dl_Jb + dl_Jo
       if (na_indic .eq. 3) then
          da_J = dl_Jo
          IF(mpi_myid == 0) write(*,FMT='(6X,"SIMVAR:  JO = ",G23.16,6X)') dl_Jo
       else
          da_J = dl_Jb + dl_Jo
          IF(mpi_myid == 0) write(*,FMT='(6X,"SIMVAR:  Jb = ",G23.16,6X,"JO = ",G23.16,6X,"Jt = ",G23.16)') dl_Jb,dl_Jo,da_J
       endif

       call oda_sqrtRm1(lobsSpaceData,OBS_WORK,OBS_WORK)  ! Modify OBS_WORK : R**-1 (Hdx-d)

       IF (LVARQC) THEN
          call oda_qcvad(lobsSpaceData)
       endif

       call col_zero(lcolumn)

       call tmg_start(41,'OBS_AD')
       call oda_HT(lcolumn,lcolumng,lobsSpaceData)   ! Put in column : H_vert**T R**-1 (Hdx-d)
       call tmg_stop(41)

       call tmg_start(31,'OBS_INTERPAD')
       call oda_LT(statevector,lcolumn,lcolumng,lobsSpaceData)  ! Put in statevector H_horiz**T H_vert**T R**-1 (Hdx-d)
       call tmg_stop(31)

       da_gradJ(:) = 0.d0
       call bmat_sqrtBT(da_gradJ,nvadim_mpilocal,statevector)
       call gsv_deallocate(statevector)

       if (na_indic .ne. 3) then
          da_gradJ(1:nvadim_mpilocal) = dl_v(1:nvadim_mpilocal) + da_gradJ(1:nvadim_mpilocal)
       endif
    endif
    call tmg_stop(80)
    if (na_indic .eq. 1 .or. na_indic .eq. 4) call tmg_start(70,'QN')

    if(mpi_myid.eq.0) write(*,*) 'end of simvar'

  END SUBROUTINE simvar



  SUBROUTINE DSCALQN(KDIM,PX,PY,DDSC,KZS, PZS, DDZS),1
    !***s/r DSCALQN: inner product in canonical space
    !*    ------------------- 
    !**    Purpose: interface for the inner product to be used
    !*     .        by the minimization subroutines N1QN3.
    !*
    !*Arguments
    !*     i : KDIM      : dimension of the vectors
    !*     i : PX, PY    : vector for which <PX,PY> is being calculated
    !*     o : DDSC      : result of the inner product
    !*     --------------
    !*     i :  KZS(1)   : unused working space for INTEGER  (not used)
    !*     i :  PZS(1)   : unused working space for REAL     (not used)
    !*     i : PDZS(1)   : unused working space for REAL*8   (not used)
    IMPLICIT NONE

    REAL PZS(1)
    INTEGER KZS(1)
    REAL*8  DDZS(1)

    INTEGER KDIM
    REAL*8 PX(KDIM), PY(KDIM)
    REAL*8 DDSC

    CALL PRSCAL(KDIM,PX,PY,DDSC)
    RETURN
  END SUBROUTINE DSCALQN



  SUBROUTINE PRSCAL(KDIM,PX,PY,DDSC) 3,1
    !***s/r PRSCAL: inner product in canonical space
    !*
    !*Author  : P. Gauthier *ARMA/AES  January 27, 1993
    !**    Purpose: evaluation of the inner product used in the
    !*     .        minimization
    !*
    !*Arguments
    !*     i : KDIM     : dimension of the vectors
    !*     i : PX, PY   : vector for which <PX,PY> is being calculated
    !*     o : DDSC     : result of the inner product
    !*
    !* Implicit argument: SCALP(KDIM) assumed to be unity

    IMPLICIT NONE

    INTEGER KDIM, J, RR, IERR
    REAL*8 PX(KDIM), PY(KDIM)
    REAL*8 DDSC
    REAL*8 partialsum(128)
    INTEGER mythread,numthreads,jstart,jend
    INTEGER omp_get_thread_num,omp_get_num_threads

    call tmg_start(71,'QN_PRSCAL')
    DDSC = 0.D0

    do j=1,nvadim_mpilocal
      DDSC = DDSC + PX(J)*PY(J)
    ENDDO

    call tmg_start(79,'QN_COMM')
    call mpi_allreduce_sumreal8scalar(ddsc,"GRID")
    call tmg_stop(79)

    call tmg_stop(71)

    RETURN
  END SUBROUTINE PRSCAL



  SUBROUTINE DCANAB(KDIM,PY,PX,KZS,PZS,PDZS)
    !***s/r DCANAB  - Change of variable associated with the canonical
    !*     .         inner product
    !*
    !*Author    JM Belanger CMDA/SMC   May 2001
    !*     .    Double precision version based on single precision CTCAB.
    !*          Refered to  as dummy argument DTCAB by N1QN3 minimization
    !*          package.
    !*    -------------------
    !**    Purpose: to compute PX = L^-1 * Py with L related to the inner product
    !*     .        <PX,PY> = PX^t  L^t  L PY
    !*     .        (see the modulopt documentation aboutn DTCAB)
    !*     NOTE: L is assumed to be the identity!
    IMPLICIT NONE

    INTEGER KDIM, KZS(1)
    REAL PZS(1)
    REAL*8 PX(KDIM), PY(KDIM)
    REAL*8 PDZS(1)

    INTEGER JDIM

    DO JDIM = 1, KDIM
      PX(JDIM) = PY(JDIM)
    ENDDO

    RETURN
  END SUBROUTINE DCANAB



  SUBROUTINE DCANONB(KDIM,PX,PY,KZS,PZS,PDZS)
    !***s/r DCANONB  - Change of variable associated with the canonical
    !*     .          inner product
    !*
    !*Author    JM Belanger CMDA/SMC  May 2001
    !*     .    Double precision version based on single precision CANONB.
    !*          Refered to as dummy argument DTONB by N1QN3 minimization
    !*          package.
    !*    -------------------
    !**    Purpose: to compute PY = L * PX with L related to the inner product
    !*     .        <PX,PY> = PX^t  L^t  L PY
    !*     .        (see the modulopt documentation about DTONB)
    !*     .

    IMPLICIT NONE
    INTEGER KDIM, KZS(1)
    REAL PZS(1)
    REAL*8 PX(KDIM), PY(KDIM)
    REAL*8 PDZS(1)

    INTEGER JDIM

    DO JDIM = 1, KDIM
      PY(JDIM) = PX(JDIM)
    ENDDO

    RETURN
  END SUBROUTINE DCANONB



  SUBROUTINE rw_vatra (cfname,status,                       & 2,12
          nsim,kbrpstamp,zeps1,zdf1,itertot,isimtot,      &
          nztrl,vatra,vazxbar,vazx,llxbar,llvazx,k1gc,imode)

    !***s/r RW_VATRA  - Read-Write VAZXBAR and VATRA on file
    !*
    !*
    !*Author  : M. Tanguay RPN January 2005
    !*
    !*Arguments
    !*     i   cfname    : precon file
    !*     i   status    : = 0 if READ, = 1 if WRITE
    !*     i   nsim      : Number of simulations in N1QN3
    !*     io  kbrpstamp : Date
    !*     i   zeps1     : Parameter in N1QN3
    !*     i   zdf1      : Parameter in N1QN3
    !*     i   itertot   : Parameter in N1QN3
    !*     i   isimtot   : Parameter in N1QN3
    !*     i   nztrl     : Localisation parameters for Hessian
    !*     i   vatra     : Hessian
    !*     i   vazxbar   : Vazx of previous loop
    !*     i   vazx      : Current state of the minimization
    !*     i   llxbar    : read in vaxzbar if dates are compatible
    !*     i   llvazx    : Logical to read vazx
    !*     i   k1gc      : Minimizer ID (2: m1qn2, 3: m1qn3)
    !*     o   imode     : If status=0, set imode=0 (no prec) or 2 (prec)
    IMPLICIT NONE

    logical llxbar,llvazx

    integer status,kbrpstamp,nsim,itertot,isimtot
    integer, dimension(10), target :: nztrl
    integer k1gc, imode
    real*8 zeps1,zdf1
    real*8, dimension(nvadim_mpilocal), target :: vazxbar, vazx
    real*8, dimension(nmtra), target :: vatra
    real*4, allocatable :: vatravec_r4_mpiglobal(:)
    real*8, allocatable :: vatravec_r8_mpiglobal(:)
    real*8, allocatable :: vazxbar_mpiglobal(:),vazx_mpiglobal(:)

    integer ibrpstamp,ireslun, ierr, fnom, fclos
    integer :: nvadim_mpiglobal,nmtra_mpiglobal
    integer :: cvDim_return
    integer :: ivadim, itrunc
    integer :: imtra,ivamaj
    integer :: jvec, i1gc,ictrlvec,ii,jproc
    integer, dimension(10), target, save :: iztrl

    character(len=*) :: cfname
    character(len=3) :: cl_version

    call tmg_start(88,'MIN_RWHESS')
    if(status.eq.0) then
      if(mpi_myid.eq.0) write(*,*) 'Read  VATRA in RW_VATRA'
    elseif(status.eq.1) then
      if(mpi_myid.eq.0) write(*,*) 'Write VATRA in RW_VATRA'
    else
      call abort3d(" RW_VATRA: status not valid ")
    endif

    call rpn_comm_allreduce(nvadim_mpilocal,nvadim_mpiglobal,1,"mpi_integer","mpi_sum","GRID",ierr)
    call rpn_comm_allreduce(nmtra,          nmtra_mpiglobal, 1,"mpi_integer","mpi_sum","GRID",ierr)

    ireslun=0

    !*     Read Hessian
    !*     ------------

    if(status.eq.0) then

      ierr = fnom(ireslun,cfname,'FTN+SEQ+UNF+OLD+R/O',0)

      ! Checking version number
      read (ireslun) cl_version,i1gc
      if(trim(cl_version) /= 'V2') then
        if(trim(cl_version) == 'V3') then
          if(mpi_myid.eq.0) write(*,*) 'RW_VATRA: using single precision V3 Hessian'
        elseif(trim(cl_version) == 'V4') then
          if(mpi_myid.eq.0) write(*,*) 'RW_VATRA: using single precision V4 Hessian'
        else
          call abort3d(" RW_VATRA: invalid Hessian version")
        endif
      endif
      if (i1gc == 3 .and. i1gc == k1gc) then
        if(mpi_myid.eq.0) write(*,*) trim(cl_version),' M1QN3'
      else
        write(*,*) 'Version, n1gc =',trim(cl_version),i1gc
        call abort3d(" RW_VATRA: Inconsistant input hessian")
      endif

      rewind (ireslun)

      read(ireslun) cl_version,i1gc,nsim,ibrpstamp,zeps1,zdf1,itertot,isimtot,ivadim,itrunc
      if(trim(cl_version) == 'V4') then
        read(ireslun) ivamaj,iztrl
        if((ivamaj.ne.nvamaj).or.(nvadim_mpiglobal.ne.ivadim)) then
          write(*,*) nvamaj,ivamaj,nvadim_mpiglobal,ivadim
          call abort3d(" RW_VATRA : ERROR, size of V4 Hessian not consistent")
        endif
      else
        read(ireslun) imtra,iztrl
        if(.not.(nmtra_mpiglobal.eq.imtra.or.(nmtra_mpiglobal+nvamaj).eq.imtra).or.nvadim_mpiglobal.ne.ivadim) then
          write(*,*) nmtra_mpiglobal,imtra,nvadim_mpiglobal,ivadim
          write(*,*) ' RW_VATRA : ERROR, size of Hessian not consistent, but not stopping!'
!          call abort3d(" RW_VATRA : ERROR, size of Hessian not consistent")
        endif
      endif

      if(k1gc.eq.3) ictrlvec = 2*nvamaj+1

      ! here the mpiglobal vectors are read in and only the mpilocal 
      ! portion is copied from vatravec into vatra
      if(mpi_myid.eq.0) write(*,*) 'RW_VATRA : reading Hessian'
      if(trim(cl_version) == 'V2') then
        allocate(vatravec_r8_mpiglobal(nvadim_mpiglobal))
        do jvec = 1, ictrlvec
          read(ireslun) vatravec_r8_mpiglobal
          call bmat_reduceToMPILocal(  &
            vatra(((jvec-1)*nvadim_mpilocal+1):jvec*nvadim_mpilocal),  &
            vatravec_r8_mpiglobal,cvDim_return)
        enddo
        deallocate(vatravec_r8_mpiglobal)       
      elseif(trim(cl_version) == 'V3' .or. trim(cl_version) == 'V4') then
        allocate(vatravec_r4_mpiglobal(nvadim_mpiglobal))
        allocate(vatravec_r8_mpiglobal(nvadim_mpiglobal))
        ! set to zero, except 1 for "diag" (first vector) for overdimensioned elements
        do jvec = 1, ictrlvec
          read(ireslun) vatravec_r4_mpiglobal
          vatravec_r8_mpiglobal(:)=real(vatravec_r4_mpiglobal(:),8)
          call bmat_reduceToMPILocal(  &
            vatra(((jvec-1)*nvadim_mpilocal+1):jvec*nvadim_mpilocal),  &
            vatravec_r8_mpiglobal,cvDim_return)
        enddo
        deallocate(vatravec_r4_mpiglobal)       
        deallocate(vatravec_r8_mpiglobal)       
      endif

      imode = 2

      if(k1gc.eq.3) then
        nztrl(1) = nvadim_mpilocal
        nztrl(2) = 0
        nztrl(3) = nvamaj
        nztrl(4) = iztrl(4)
        nztrl(5) = iztrl(5)
      endif

      if(ibrpstamp == kbrpstamp .and. llxbar) then
        if(mpi_myid.eq.0) write(*,*) 'RW_VATRA : reading vazxbar'
        allocate(vazxbar_mpiglobal(nvadim_mpiglobal))
        read(ireslun) vazxbar_mpiglobal
        call bmat_reduceToMPILocal(  &
          vazxbar,  &
          vazxbar_mpiglobal,cvDim_return)
        deallocate(vazxbar_mpiglobal)
      endif

      if(ibrpstamp == kbrpstamp .and. llvazx) then
        if(mpi_myid.eq.0) write(*,*) 'RW_VATRA : reading vazx'
        allocate(vazx_mpiglobal(nvadim_mpiglobal))
        read(ireslun) vazx_mpiglobal
        call bmat_reduceToMPILocal(  &
          vazx,  &
          vazx_mpiglobal,cvDim_return)
        deallocate(vazx_mpiglobal)
      endif

      if(ibrpstamp.ne.kbrpstamp) then
        kbrpstamp = ibrpstamp
      endif

      ierr = fclos(ireslun)

      !*     Write Hessian
      !*     -------------

    elseif(status.eq.1) then
      if(mpi_myid.eq.0) ierr = fnom(ireslun,cfname, 'FTN+SEQ+UNF' , 0)
      cl_version = 'V4'
      itrunc=0
      if(mpi_myid.eq.0) write(ireslun) cl_version,k1gc,nsim,kbrpstamp,zeps1,zdf1,itertot,isimtot,nvadim_mpiglobal,itrunc
      if(mpi_myid.eq.0) write(ireslun) nvamaj,nztrl
      if(k1gc.eq.3) ictrlvec = 2*nvamaj+1

      if(mpi_myid.eq.0) allocate(vatravec_r4_mpiglobal(nvadim_mpiglobal))
      allocate(vatravec_r8_mpiglobal(nvadim_mpiglobal))
      do jvec = 1, ictrlvec
        call bmat_expandToMPIGlobal(  &
          vatra(((jvec-1)*nvadim_mpilocal+1):jvec*nvadim_mpilocal),  &
          vatravec_r8_mpiglobal,cvDim_return)
        if(mpi_myid.eq.0) vatravec_r4_mpiglobal(:)=real(vatravec_r8_mpiglobal(:),4)
        if(mpi_myid.eq.0) write(ireslun) vatravec_r4_mpiglobal
      enddo
      if(mpi_myid.eq.0) deallocate(vatravec_r4_mpiglobal)
      deallocate(vatravec_r8_mpiglobal)

      allocate(vazxbar_mpiglobal(nvadim_mpiglobal))
      call bmat_expandToMPIGlobal(  &
        vazxbar,  &
        vazxbar_mpiglobal,cvDim_return)
      if(mpi_myid.eq.0) write(ireslun) vazxbar_mpiglobal(1:nvadim_mpiglobal)
      deallocate(vazxbar_mpiglobal)

      allocate(vazx_mpiglobal(nvadim_mpiglobal))
      call bmat_expandToMPIGlobal(  &
        vazx,  &
        vazx_mpiglobal,cvDim_return)
      if(mpi_myid.eq.0) write(ireslun) vazx_mpiglobal(1:nvadim_mpiglobal)
      deallocate(vazx_mpiglobal)

      if(mpi_myid.eq.0) ierr = fclos(ireslun)
    else
      call abort3d(" RW_VATRA: status not valid ")
    endif

    call tmg_stop(88)

    return
  END SUBROUTINE RW_VATRA


  subroutine grtest2(simul,na_dim,da_x0,na_range,dataptr) 2,1
  implicit none
  ! Dummies
  integer, intent(in) :: na_dim ! Size of the control vector
  integer, intent(in) :: na_range ! the test will be carried over values of
                                  ! ALPHA ranging between
                                  ! 10**(-NA_RANGE) < ALPHA < 0.1
  integer, intent(inout) :: dataptr(:)
  real*8,  intent(in), dimension(na_dim) :: da_x0 ! Control vector
  external simul ! simulator: return cost function estimate and its gradient
  !
  !Purpose:
  !to compare the variation of the functional against what the gradient
  !gives for small changes in the control variable. This test should be
  !accurate for values as small as DLALPHA =  SQRT(machine precision).
  !(see Courtier, 1987)
  !
  !Author  : P. Gauthier *ARMA/AES  June 9, 1992
  !
  !Revision:
  !
  !     JM Belanger CMDA/SMC  Oct 2000
  !                   . 32 bits conversion
  !     P. Gauthier ARMA/MSC July 2003
  !                   . Set the output unit through an argument
  !     S. Pellerin ARMA/MSC Oct. 2005
  !                   . Introduction of call back simulator
  !                   . Automatic array (argument cleanup)
  !                   . F90 free style and ODA Norm coding
  !
  ! Local delcarations
  integer :: nl_indic, nl_j,ierr
  real*8  :: dl_wrk(na_dim),dl_gradj0(na_dim), dl_x(na_dim)
  real*8  :: dl_J0, dl_J, dl_test, dl_start,dl_end
  real*8  :: dl_alpha, dl_gnorm0


  ! 1. Initialize dl_gradj0 at da_x0
  !    ------------------------------------

  nl_indic = 2
  call simul(nl_indic,na_dim,da_x0,dl_j0,dl_gradj0,dataptr(1))
  dl_gnorm0 = dot_product(dl_gradj0,dl_gradj0)
  call mpi_allreduce_sumreal8scalar(dl_gnorm0,"GRID")
  dl_start = 1.d0
  dl_end   = 10.0d0**(-na_range)
  write(*,FMT=9100) dl_start,dl_end, dl_j0, dl_gnorm0
  ! 2. Perform the test
  !    ----------------

  if(dl_gnorm0.eq.0.d0)then
     write(*,FMT=9101)
     return
  end if
  write(*,FMT=9200)
  do  nl_j = 1, na_range
     dl_alpha = 10.0d0**(- nl_j)
     dl_x(:) = da_x0(:) - dl_alpha*dl_gradJ0(:)
     call simul(nl_indic,na_dim,dl_x,dl_j,dl_wrk,dataptr(1))
     dl_test = (dl_j-dl_j0)/(-dl_alpha * dl_gnorm0)
     write(*,FMT=9201)nl_j, dl_alpha, dl_j, dl_test
  end do

9100 format(//,4X,&
          'GRTEST- The gradient is being tested for',&
          G23.16,' <= ALPHA <= ',G23.16,/,12X,&
          'Initial value of J(X):',1x,G23.16,4x,&
          'Norm of GRAD J(X)**2: ',G23.16)
9101 format(/,4X,'-In GRTEST: gradient vanishes exactly',&
          '. Gradient test cannot be performed at this point')
9200 format(/,4X,'J',8X,'ALPHA',11X,'J(X)',12X,'TEST')

9201 format(2X,'GRTEST: step',2X,I3,4X,G23.16,4X,G23.16,4X,&
          G23.16)
  return
end subroutine grtest2



END MODULE minimization_mod