C $Header: /u/gcmpack/MITgcm/pkg/exch2/w2_cumulsum_z_tile.F,v 1.4 2012/09/03 19:40:09 jmc Exp $
C $Name:  $

#include "CPP_EEOPTIONS.h"
#include "W2_OPTIONS.h"

C--   File w2_cumulsum_z_tile.F: Routines that perform a cumulated sum
C                                on a tiled array at the corner grid-cell location
C      Contents
C      o W2_CUMULSUM_Z_TILE_RL
C      o W2_CUMULSUM_Z_TILE_RS <- not yet coded

C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|
CBOP
C     !ROUTINE: W2_CUMULSUM_Z_TILE_RL

C     !INTERFACE:
      SUBROUTINE W2_CUMULSUM_Z_TILE_RL(
     O                       psiZ, psiLoc,
     I                       dPsiX, dPsiY, myThid )

C     !DESCRIPTION:
C     *==========================================================*
C     | SUBROUTINE W2\_CUMULSUM\_Z\_TILE\_RL
C     | o Handle cumulated sum for _RL tile data.
C     *==========================================================*
C     | Cumulated sum on a tiled array, corner grid-cell location:
C     |  starts from the first tile and, going through all tiles
C     |  and all processes, adds the increments in both directions
C     *==========================================================*
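C     For illustration (single-facet case, tiles numbered row-wise):
C     the corner value of the first tile is zero; stepping one tile
C     in X adds that tile's dPsiX, stepping one tile in Y adds its
C     dPsiY (cf. the in-facet loops below).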

C     !USES:
      IMPLICIT NONE

C     == Global data ==
#include "SIZE.h"
#include "EEPARAMS.h"
#include "EESUPPORT.h"
#include "W2_EXCH2_SIZE.h"
#include "W2_EXCH2_PARAMS.h"
#include "W2_EXCH2_TOPOLOGY.h"
#include "CUMULSUM.h"

C     !INPUT/OUTPUT PARAMETERS:
C     == Routine arguments ==
C     psiZ    :: result of the cumulated sum, at the tile South-East corner
C     psiLoc  :: cumulated sum at special locations
C     dPsiX   :: tile increment in X direction
C     dPsiY   :: tile increment in Y direction
C     myThid  :: my Thread Id. number
      _RL     psiZ  (nSx,nSy)
      _RL     psiLoc(2)
      _RL     dPsiX (nSx,nSy)
      _RL     dPsiY (nSx,nSy)
      INTEGER myThid

C     !LOCAL VARIABLES:
C     == Local variables ==
C     bi,bj   :: tile indices
C- type declaration of loc[1,2]Buf and shareBufCS[1,2]_R8:
C         all four need to have the same length as MPI_DOUBLE_PRECISION
      INTEGER bi,bj
      INTEGER tN, tS
      Real*8  globalBuf(3,W2_maxNbTiles)
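C-    globalBuf usage (cf. below): globalBuf(1,tN) & (2,tN) hold tile
C     tN's X & Y increments; globalBuf(3,tN) the resulting cumulated
C     sum at that tile's corner (tN = global tile number)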
#ifndef W2_CUMSUM_USE_MATRIX
      Real*8 facetXYSum(2,W2_maxNbFacets)
      Real*8 facet_CSum(  W2_maxNbFacets)
      INTEGER fNx, fNy, nbTx, nbTy
      INTEGER i, j
#endif
#ifdef ALLOW_USE_MPI
      INTEGER np, pId
      INTEGER lbuf1, lbuf2, idest, itag, ready_to_receive
      INTEGER istatus(MPI_STATUS_SIZE), ierr
      Real*8  loc1Buf  (nSx,nSy)
      Real*8  loc2Buf(2,nSx,nSy)
#endif /* ALLOW_USE_MPI */
CEOP

C--   Initialise to zero:
      psiLoc(1) = 0.
      psiLoc(2) = 0.
      DO tN = 1,exch2_nTiles
        globalBuf(1,tN) = 0.
        globalBuf(2,tN) = 0.
        globalBuf(3,tN) = 0.
      ENDDO

C--   write input into shared-buffer array
      DO bj = myByLo(myThid), myByHi(myThid)
       DO bi = myBxLo(myThid), myBxHi(myThid)
         shareBufCS2_R8(1,bi,bj) = dPsiX(bi,bj)
         shareBufCS2_R8(2,bi,bj) = dPsiY(bi,bj)
       ENDDO
      ENDDO
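C     (shareBufCS[1,2]_R8 are thread-shared buffers, presumably
C      declared in CUMULSUM.h, so that the master thread can access
C      the tile increments written by every thread)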

C--   Master thread cannot start until everyone is ready:
      CALL BAR2( myThid )
      _BEGIN_MASTER( myThid )

#ifdef ALLOW_USE_MPI
      IF ( usingMPI ) THEN

       lbuf1 = nSx*nSy
       lbuf2 = 2*lbuf1
       idest = 0
       itag  = 0
       ready_to_receive = 0
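C-    note: lbuf1 & lbuf2 are message lengths counted in
C     MPI_DOUBLE_PRECISION elements: one value per local tile for
C     loc1Buf/shareBufCS1_R8, two per tile for loc2Buf/shareBufCS2_R8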

       IF ( mpiMyId.NE.0 ) THEN

C--   All processes except 0 wait to be polled, then send their local array
#ifndef DISABLE_MPI_READY_TO_RECEIVE
           CALL MPI_RECV (ready_to_receive, 1, MPI_INTEGER,
     &                    idest, itag, MPI_COMM_MODEL, istatus, ierr)
#endif
           CALL MPI_SEND (shareBufCS2_R8, lbuf2, MPI_DOUBLE_PRECISION,
     &                    idest, itag, MPI_COMM_MODEL, ierr)

C--   All processes except 0 receive the result from process 0
           CALL MPI_RECV (shareBufCS1_R8, lbuf1, MPI_DOUBLE_PRECISION,
     &                    idest, itag, MPI_COMM_MODEL, istatus, ierr)

       ELSE

C--   Process 0 polls and receives data from each process in turn
         DO np = 2, nPx*nPy
           pId = np - 1
#ifndef DISABLE_MPI_READY_TO_RECEIVE
           CALL MPI_SEND (ready_to_receive, 1, MPI_INTEGER,
     &                    pId, itag, MPI_COMM_MODEL, ierr)
#endif
           CALL MPI_RECV (loc2Buf, lbuf2, MPI_DOUBLE_PRECISION,
     &                    pId, itag, MPI_COMM_MODEL, istatus, ierr)

C--   Process 0 gathers the local arrays into a global array.
           DO bj=1,nSy
            DO bi=1,nSx
              tN = W2_procTileList(bi,bj,np)
              globalBuf(1,tN) = loc2Buf(1,bi,bj)
              globalBuf(2,tN) = loc2Buf(2,bi,bj)
            ENDDO
           ENDDO
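C-       (W2_procTileList(bi,bj,np) is used here as the global tile
C         number of local tile bi,bj on process np)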
C-       end loop on np
         ENDDO

C--   end if process not 0 / else process 0
       ENDIF

      ENDIF
#endif /* ALLOW_USE_MPI */

      IF ( myProcId.EQ.0 ) THEN

C--   Process 0 fills in its own local data
         DO bj=1,nSy
          DO bi=1,nSx
            tN = W2_myTileList(bi,bj)
            globalBuf(1,tN) = shareBufCS2_R8(1,bi,bj)
            globalBuf(2,tN) = shareBufCS2_R8(2,bi,bj)
          ENDDO
         ENDDO

C--   Cumulate Sum over all tiles:
#ifdef W2_CUMSUM_USE_MATRIX
C-    Using tile x tile matrix:
         DO tN = 1,exch2_nTiles
           globalBuf(3,tN) = 0.
           DO tS = 1,exch2_nTiles
             globalBuf(3,tN) = globalBuf(3,tN)
     &                       + W2_cumSum_tiles(1,tS,tN)*globalBuf(1,tS)
     &                       + W2_cumSum_tiles(2,tS,tN)*globalBuf(2,tS)
           ENDDO
         ENDDO
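C-    i.e., psiZ(tN) = sum over tS of
C        W2_cumSum_tiles(1,tS,tN)*dPsiX(tS)
C      + W2_cumSum_tiles(2,tS,tN)*dPsiY(tS)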
#else /* W2_CUMSUM_USE_MATRIX */
C-    Cumulate per facet and then use facet x facet matrix:

         DO j=1,W2_maxNbFacets
           facetXYSum(1,j) = 0
           facetXYSum(2,j) = 0
           facet_CSum(j) = 0
         ENDDO

C-    First, within each facet:
         DO j=1,nFacets
          fNx = facet_dims(2*j-1)
          fNy = facet_dims( 2*j )
          IF ( fNx*fNy .GE. 1 ) THEN
           nbTx = fNx/sNx
           nbTy = fNy/sNy

           DO bi=1,nbTx-1
             tN = facet_owns(1,j) + bi-1
             globalBuf(3,tN+1) = globalBuf(3,tN) + globalBuf(1,tN)
           ENDDO
           DO bj=1,nbTy-1
            tS = facet_owns(1,j) - 1 + (bj-1)*nbTx
            DO bi=1,nbTx
             tN = tS + bi
             globalBuf(3,tN+nbTx) = globalBuf(3,tN) + globalBuf(2,tN)
            ENDDO
           ENDDO
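C-    e.g., for a 2x2-tile facet with tiles t0..t0+3 numbered row-wise
C     from t0 = facet_owns(1,j) (illustration only, before the
C     facet-origin offset is added below): corner(t0) = 0,
C     corner(t0+1) = dPsiX(t0), corner(t0+2) = dPsiY(t0),
C     corner(t0+3) = dPsiX(t0) + dPsiY(t0+1)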

C-    facet increment in X & Y dir
           DO bi=1,nbTx
            tN = facet_owns(1,j) + bi-1
            facetXYSum(1,j) = facetXYSum(1,j) + globalBuf(1,tN)
           ENDDO
           DO bj=1,nbTy
            tN = facet_owns(1,j) + (bj-1)*nbTx
            facetXYSum(2,j) = facetXYSum(2,j) + globalBuf(2,tN)
           ENDDO

          ENDIF
         ENDDO

C-    Calculate cumulated sum at facet origin using facet matrix:
         DO j=1,nFacets
          DO i=1,nFacets
           facet_CSum(j) = facet_CSum(j)
     &                   + W2_cumSum_facet(1,i,j)*facetXYSum(1,i)
     &                   + W2_cumSum_facet(2,i,j)*facetXYSum(2,i)
          ENDDO
         ENDDO

C-    Finally, add cumulated sum at facet origin:
         DO tN = 1,exch2_nTiles
          j = exch2_myFace(tN)
          IF ( j.NE.0 ) THEN
            globalBuf(3,tN) = globalBuf(3,tN) + facet_CSum(j)
          ENDIF
         ENDDO
#endif /* W2_CUMSUM_USE_MATRIX */

C-    Values at special locations (e.g., missing-corner values)
         IF ( W2_tMC1.GE.1 )
     &     psiLoc(1) = globalBuf(3,W2_tMC1) + globalBuf(2,W2_tMC1)
         IF ( W2_tMC2.GE.1 )
     &     psiLoc(2) = globalBuf(3,W2_tMC2) + globalBuf(1,W2_tMC2)
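C-    (i.e., the tile-corner value of W2_tMC1 (resp. W2_tMC2) plus one
C      more tile increment in Y (resp. X); presumably the value at the
C      missing corner itself)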

C--   Process 0 copies its own part of the result into the shared buffer
         DO bj=1,nSy
          DO bi=1,nSx
            tN = W2_myTileList(bi,bj)
            shareBufCS1_R8(bi,bj) = globalBuf(3,tN)
          ENDDO
         ENDDO

#ifdef ALLOW_USE_MPI
        IF ( usingMPI ) THEN
C--   Process 0 sends result to all other processes
         DO np = 2, nPx*nPy
           pId = np - 1
C-    fill local array with relevant portion of global array
           DO bj=1,nSy
            DO bi=1,nSx
              tN = W2_procTileList(bi,bj,np)
              loc1Buf(bi,bj) = globalBuf(3,tN)
            ENDDO
           ENDDO
           CALL MPI_SEND (loc1Buf, lbuf1, MPI_DOUBLE_PRECISION,
     &                    pId, itag, MPI_COMM_MODEL, ierr)
         ENDDO

        ENDIF
#endif /* ALLOW_USE_MPI */

C--   end if process 0
      ENDIF

      _END_MASTER( myThid )
C--   Everyone waits for the master thread to be done
      CALL BAR2( myThid )

C--   set result for every thread
      DO bj = myByLo(myThid), myByHi(myThid)
       DO bi = myBxLo(myThid), myBxHi(myThid)
         psiZ(bi,bj) = shareBufCS1_R8(bi,bj)
       ENDDO
      ENDDO

      RETURN
      END