C $Header: /u/gcmpack/MITgcm/pkg/exch2/w2_cumulsum_z_tile.F,v 1.4 2012/09/03 19:40:09 jmc Exp $
C $Name: $
#include "CPP_EEOPTIONS.h"
#include "W2_OPTIONS.h"
C-- File w2_cumulsum_z_tile.F: Routines that perform a cumulated sum
C    on a tiled array, at the corner grid-cell location
C Contents
C o W2_CUMULSUM_Z_TILE_RL
C o W2_CUMULSUM_Z_TILE_RS <- not yet coded
C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|
CBOP
C !ROUTINE: W2_CUMULSUM_Z_TILE_RL
C !INTERFACE:
SUBROUTINE W2_CUMULSUM_Z_TILE_RL(
O psiZ, psiLoc,
I dPsiX, dPsiY, myThid )
C !DESCRIPTION:
C *==========================================================*
C | SUBROUTINE W2\_CUMULSUM\_Z\_TILE\_RL
C | o Handle cumulated sum for _RL tile data.
C *==========================================================*
C | Cumulated sum on a tiled array, corner grid-cell location:
C | starts from the 1st tile and, going through all tiles &
C | all the processes, adds the increments in both directions
C *==========================================================*
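C | Two implementations are provided, selected at compile time
C | via W2_CUMSUM_USE_MATRIX: a direct tile x tile matrix
C | product, or (default) a cumulated sum within each facet
C | combined through a facet x facet matrix. The global sum is
C | done by the master thread of process 0; the result is then
C | sent back to the other processes and copied to all threads.
C *==========================================================*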
C !USES:
IMPLICIT NONE
C == Global data ==
#include "SIZE.h"
#include "EEPARAMS.h"
#include "EESUPPORT.h"
#include "W2_EXCH2_SIZE.h"
#include "W2_EXCH2_PARAMS.h"
#include "W2_EXCH2_TOPOLOGY.h"
#include "CUMULSUM.h"
C !INPUT/OUTPUT PARAMETERS:
C == Routine arguments ==
C psiZ :: result of cumulated sum, corresponds to tile South-East corner
C psiLoc :: cumulated sum at special locations
C dPsiX :: tile increment in X direction
C dPsiY :: tile increment in Y direction
C myThid :: my Thread Id. number
_RL psiZ (nSx,nSy)
_RL psiLoc(2)
_RL dPsiX (nSx,nSy)
_RL dPsiY (nSx,nSy)
INTEGER myThid
C !LOCAL VARIABLES:
C == Local variables ==
C bi,bj :: tile indices
C- type declaration of: loc[1,2]Buf and shareBufCS[1,2]_R8 :
C  all 4 need to have the same length as MPI_DOUBLE_PRECISION (Real*8),
C  the datatype used in the MPI calls below
INTEGER bi,bj
INTEGER tN, tS
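C     tN, tS    :: (global) tile numbers
C     globalBuf :: X & Y increments (1st index 1,2) and resulting
C                  cumulated sum (1st index 3) for each global tile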
Real*8 globalBuf(3,W2_maxNbTiles)
#ifndef W2_CUMSUM_USE_MATRIX
Real*8 facetXYSum(2,W2_maxNbFacets)
Real*8 facet_CSum( W2_maxNbFacets)
INTEGER fNx, fNy, nbTx, nbTy
INTEGER i, j
#endif
#ifdef ALLOW_USE_MPI
INTEGER np, pId
INTEGER lbuf1, lbuf2, idest, itag, ready_to_receive
INTEGER istatus(MPI_STATUS_SIZE), ierr
Real*8 loc1Buf (nSx,nSy)
Real*8 loc2Buf(2,nSx,nSy)
#endif /* ALLOW_USE_MPI */
CEOP
C-- Initialise to zero:
psiLoc(1) = 0.
psiLoc(2) = 0.
DO tN = 1,exch2_nTiles
globalBuf(1,tN) = 0.
globalBuf(2,tN) = 0.
globalBuf(3,tN) = 0.
ENDDO
C-- write input into shared-buffer array
DO bj = myByLo(myThid), myByHi(myThid)
DO bi = myBxLo(myThid), myBxHi(myThid)
shareBufCS2_R8(1,bi,bj) = dPsiX(bi,bj)
shareBufCS2_R8(2,bi,bj) = dPsiY(bi,bj)
ENDDO
ENDDO
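C     note: increments go into the shared buffer (from CUMULSUM.h)
C     so that, once past the barrier below, the master thread can
C     access the contribution of every thread.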
C-- Master thread cannot start until everyone is ready:
CALL BAR2( myThid )
_BEGIN_MASTER( myThid )
#ifdef ALLOW_USE_MPI
IF ( usingMPI ) THEN
lbuf1 = nSx*nSy
lbuf2 = 2*lbuf1
idest = 0
itag = 0
ready_to_receive = 0
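C-    Unless DISABLE_MPI_READY_TO_RECEIVE is defined, process 0 first
C     sends a "ready_to_receive" token to poll each process, which
C     waits for it before sending its local increments (hand-shake).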
IF ( mpiMyId.NE.0 ) THEN
C-- All processes except 0 wait to be polled then send local array
#ifndef DISABLE_MPI_READY_TO_RECEIVE
CALL MPI_RECV (ready_to_receive, 1, MPI_INTEGER,
& idest, itag, MPI_COMM_MODEL, istatus, ierr)
#endif
CALL MPI_SEND (shareBufCS2_R8, lbuf2, MPI_DOUBLE_PRECISION,
& idest, itag, MPI_COMM_MODEL, ierr)
C-- All processes except 0 receive result from process 0
CALL MPI_RECV (shareBufCS1_R8, lbuf1, MPI_DOUBLE_PRECISION,
& idest, itag, MPI_COMM_MODEL, istatus, ierr)
ELSE
C-- Process 0 polls and receives data from each process in turn
DO np = 2, nPx*nPy
pId = np - 1
#ifndef DISABLE_MPI_READY_TO_RECEIVE
CALL MPI_SEND (ready_to_receive, 1, MPI_INTEGER,
& pId, itag, MPI_COMM_MODEL, ierr)
#endif
CALL MPI_RECV (loc2Buf, lbuf2, MPI_DOUBLE_PRECISION,
& pId, itag, MPI_COMM_MODEL, istatus, ierr)
C-- Process 0 gathers the local arrays into a global array.
DO bj=1,nSy
DO bi=1,nSx
tN = W2_procTileList(bi,bj,np)
globalBuf(1,tN) = loc2Buf(1,bi,bj)
globalBuf(2,tN) = loc2Buf(2,bi,bj)
ENDDO
ENDDO
C- end loop on np
ENDDO
C-- end if process not 0 / else = 0
ENDIF
ENDIF
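C     at this point, globalBuf on process 0 also holds the X & Y
C     increments of all tiles owned by the other processes.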
#endif /* ALLOW_USE_MPI */
IF ( myProcId.EQ.0 ) THEN
C-- Process 0 fills-in its local data
DO bj=1,nSy
DO bi=1,nSx
tN = W2_myTileList(bi,bj)
globalBuf(1,tN) = shareBufCS2_R8(1,bi,bj)
globalBuf(2,tN) = shareBufCS2_R8(2,bi,bj)
ENDDO
ENDDO
C-- Cumulated sum over all tiles:
#ifdef W2_CUMSUM_USE_MATRIX
C- Using tile x tile matrix:
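C     (cumulated sum at tile tN corner = weighted sum over all tiles
C      tS of W2_cumSum_tiles(1,tS,tN)*dPsiX_tS
C         + W2_cumSum_tiles(2,tS,tN)*dPsiY_tS ;
C      with weights presumably pre-computed at EXCH2 set-up time)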
DO tN = 1,exch2_nTiles
globalBuf(3,tN) = 0.
DO tS = 1,exch2_nTiles
globalBuf(3,tN) = globalBuf(3,tN)
& + W2_cumSum_tiles(1,tS,tN)*globalBuf(1,tS)
& + W2_cumSum_tiles(2,tS,tN)*globalBuf(2,tS)
ENDDO
ENDDO
#else /* W2_CUMSUM_USE_MATRIX */
C- Cumulated sum per facet, then use facet x facet matrix:
DO j=1,W2_maxNbFacets
facetXYSum(1,j) = 0
facetXYSum(2,j) = 0
facet_CSum(j) = 0
ENDDO
C- First within each facet:
DO j=1,nFacets
fNx = facet_dims(2*j-1)
fNy = facet_dims( 2*j )
IF ( fNx*fNy .GE. 1 ) THEN
nbTx = fNx/sNx
nbTy = fNy/sNy
DO bi=1,nbTx-1
tN = facet_owns(1,j) + bi-1
globalBuf(3,tN+1) = globalBuf(3,tN) + globalBuf(1,tN)
ENDDO
DO bj=1,nbTy-1
tS = facet_owns(1,j) - 1 + (bj-1)*nbTx
DO bi=1,nbTx
tN = tS + bi
globalBuf(3,tN+nbTx) = globalBuf(3,tN) + globalBuf(2,tN)
ENDDO
ENDDO
C- facet increment in X & Y dir
DO bi=1,nbTx
tN = facet_owns(1,j) + bi-1
facetXYSum(1,j) = facetXYSum(1,j) + globalBuf(1,tN)
ENDDO
DO bj=1,nbTy
tN = facet_owns(1,j) + (bj-1)*nbTx
facetXYSum(2,j) = facetXYSum(2,j) + globalBuf(2,tN)
ENDDO
ENDIF
ENDDO
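C     facetXYSum(1,j) now holds the sum of X increments along the
C     1st tile row of facet j, and facetXYSum(2,j) the sum of Y
C     increments along its 1st tile column; these facet-wide
C     increments feed the facet x facet matrix below.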
C- Calculate cumulated sum at facet origin using facet matrix:
DO j=1,nFacets
DO i=1,nFacets
facet_CSum(j) = facet_CSum(j)
& + W2_cumSum_facet(1,i,j)*facetXYSum(1,i)
& + W2_cumSum_facet(2,i,j)*facetXYSum(2,i)
ENDDO
ENDDO
C- Finally, add cumulated sum at facet origin:
DO tN = 1,exch2_nTiles
j = exch2_myFace(tN)
IF ( j.NE.0 ) THEN
globalBuf(3,tN) = globalBuf(3,tN) + facet_CSum(j)
ENDIF
ENDDO
#endif /* W2_CUMSUM_USE_MATRIX */
C- Value at Special location (e.g., Missing-Corner values)
IF ( W2_tMC1.GE.1 )
& psiLoc(1) = globalBuf(3,W2_tMC1) + globalBuf(2,W2_tMC1)
IF ( W2_tMC2.GE.1 )
& psiLoc(2) = globalBuf(3,W2_tMC2) + globalBuf(1,W2_tMC2)
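C     (W2_tMC1 & W2_tMC2 presumably give the tile number of the two
C      missing-corner locations; the value there adds one extra Y
C      (resp. X) tile increment to the cumulated sum at that corner)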
C-- Process 0 copies its own part of the result to the shared buffer
DO bj=1,nSy
DO bi=1,nSx
tN = W2_myTileList(bi,bj)
shareBufCS1_R8(bi,bj) = globalBuf(3,tN)
ENDDO
ENDDO
#ifdef ALLOW_USE_MPI
IF ( usingMPI ) THEN
C-- Process 0 sends result to all other processes
DO np = 2, nPx*nPy
pId = np - 1
C- fill local array with relevant portion of global array
DO bj=1,nSy
DO bi=1,nSx
tN = W2_procTileList(bi,bj,np)
loc1Buf(bi,bj) = globalBuf(3,tN)
ENDDO
ENDDO
CALL MPI_SEND (loc1Buf, lbuf1, MPI_DOUBLE_PRECISION,
& pId, itag, MPI_COMM_MODEL, ierr)
ENDDO
ENDIF
#endif /* ALLOW_USE_MPI */
C-- end if process 0
ENDIF
_END_MASTER( myThid )
C-- Everyone waits for the master thread to be ready
CALL BAR2( myThid )
C-- set result for every thread
DO bj = myByLo(myThid), myByHi(myThid)
DO bi = myBxLo(myThid), myBxHi(myThid)
psiZ(bi,bj) = shareBufCS1_R8(bi,bj)
ENDDO
ENDDO
RETURN
END