C $Header: /u/gcmpack/MITgcm/eesupp/src/cumulsum_z_tile.F,v 1.3 2012/09/06 15:25:01 jmc Exp $
C $Name:  $

#include "PACKAGES_CONFIG.h"
#include "CPP_EEOPTIONS.h"

C--   File cumulsum_z_tile.F: Routines that perform a cumulated sum
C                             on a tiled array, corner grid-cell location
C      Contents
C      o CUMULSUM_Z_TILE_RL
C      o CUMULSUM_Z_TILE_RS <- not yet coded

C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|
CBOP
C     !ROUTINE: CUMULSUM_Z_TILE_RL

C     !INTERFACE:
      SUBROUTINE CUMULSUM_Z_TILE_RL(
     O                       psiZ, psiLoc,
     I                       dPsiX, dPsiY, myThid )

C     !DESCRIPTION:
C     *==========================================================*
C     | SUBROUTINE CUMULSUM\_Z\_TILE\_RL
C     | o Handle cumulated sum for _RL tile data.
C     *==========================================================*
C     | Cumulated sum on a tiled array, corner grid-cell location:
C     | starts from the first tile and, going through all tiles
C     | and all processes, adds the increments in both directions.
C     *==========================================================*

C     !USES:
      IMPLICIT NONE

C     == Global data ==
#include "SIZE.h"
#include "EEPARAMS.h"
#include "EESUPPORT.h"
#include "CUMULSUM.h"

C     !INPUT/OUTPUT PARAMETERS:
C     == Routine arguments ==
C     psiZ   :: results of cumulated sum, corresponds to tile South-East corner
C     psiLoc :: cumulated sum at special locations
C     dPsiX  :: tile increment in X direction
C     dPsiY  :: tile increment in Y direction
C     myThid :: my Thread Id. number
      _RL     psiZ  (nSx,nSy)
      _RL     psiLoc(2)
      _RL     dPsiX (nSx,nSy)
      _RL     dPsiY (nSx,nSy)
      INTEGER myThid

C     !LOCAL VARIABLES:
#ifndef ALLOW_EXCH2
C     == Local variables ==
C     bi,bj :: tile indices
C-    type declaration of: loc[1,2]Buf and shareBufCS[1,2]_R8 :
C        all 4 need to have the same length as MPI_DOUBLE_PRECISION
      INTEGER bi,bj
      INTEGER nf
#ifdef ALLOW_USE_MPI
      INTEGER biG, bjG, npe, np1
      INTEGER lbuf1, lbuf2, idest, itag, ready_to_receive
      INTEGER istatus(MPI_STATUS_SIZE), ierr
      Real*8  loc1Buf (nSx,nSy)
      Real*8  loc2Buf(2,nSx,nSy)
      Real*8  globalBuf(3,nSx*nPx,nSy*nPy)
#endif /* ALLOW_USE_MPI */
#endif /* ALLOW_EXCH2 */
CEOP

#ifdef ALLOW_EXCH2
      CALL W2_CUMULSUM_Z_TILE_RL(
     O                       psiZ, psiLoc,
     I                       dPsiX, dPsiY, myThid )
#else /* ALLOW_EXCH2 */

C--   write input into shared-buffer array
      DO bj = myByLo(myThid), myByHi(myThid)
       DO bi = myBxLo(myThid), myBxHi(myThid)
         shareBufCS2_R8(1,bi,bj) = dPsiX(bi,bj)
         shareBufCS2_R8(2,bi,bj) = dPsiY(bi,bj)
       ENDDO
      ENDDO
      psiLoc(1) = 0.
      psiLoc(2) = 0.
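C--   Added note: the MPI and single-process branches below apply the
C     same cumulated-sum recurrence, starting from psi(1,1) = 0 :
C       psi(bi+1,bj) = psi(bi,bj) + dPsiX(bi,bj)   (first row, bj=1)
C       psi(bi,bj+1) = psi(bi,bj) + dPsiY(bi,bj)   (up each column)
C     The cubed-sphere branch differs: with one tile per face it
C     alternates between the X and Y increments from face to face
C     (nf = 1 + MOD(1+bi,2)) and stores the two missing corner values
C     in psiLoc(1:2).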
C--   Master thread cannot start until everyone is ready:
      CALL BAR2( myThid )
      _BEGIN_MASTER( myThid )

#ifdef ALLOW_USE_MPI
      IF ( usingMPI ) THEN

        lbuf1 = nSx*nSy
        lbuf2 = 2*lbuf1
        idest = 0
        itag  = 0
        ready_to_receive = 0
        IF ( mpiMyId.NE.0 ) THEN
C--   All processes except 0 wait to be polled then send local array
#ifndef DISABLE_MPI_READY_TO_RECEIVE
          CALL MPI_RECV (ready_to_receive, 1, MPI_INTEGER,
     &         idest, itag, MPI_COMM_MODEL, istatus, ierr)
#endif
          CALL MPI_SEND (shareBufCS2_R8, lbuf2, MPI_DOUBLE_PRECISION,
     &         idest, itag, MPI_COMM_MODEL, ierr)
C--   All processes except 0 receive result from process 0
          CALL MPI_RECV (shareBufCS1_R8, lbuf1, MPI_DOUBLE_PRECISION,
     &         idest, itag, MPI_COMM_MODEL, istatus, ierr)
        ELSE
C--   Process 0 fills in its local data
          np1 = 1
          DO bj=1,nSy
           DO bi=1,nSx
             biG = (mpi_myXGlobalLo(np1)-1)/sNx+bi
             bjG = (mpi_myYGlobalLo(np1)-1)/sNy+bj
             globalBuf(1,biG,bjG) = shareBufCS2_R8(1,bi,bj)
             globalBuf(2,biG,bjG) = shareBufCS2_R8(2,bi,bj)
           ENDDO
          ENDDO
C--   Process 0 polls and receives data from each process in turn
          DO npe = 1, numberOfProcs-1
#ifndef DISABLE_MPI_READY_TO_RECEIVE
            CALL MPI_SEND (ready_to_receive, 1, MPI_INTEGER,
     &           npe, itag, MPI_COMM_MODEL, ierr)
#endif
            CALL MPI_RECV (loc2Buf, lbuf2, MPI_DOUBLE_PRECISION,
     &           npe, itag, MPI_COMM_MODEL, istatus, ierr)
C--   Process 0 gathers the local arrays into a global array.
            np1 = npe + 1
            DO bj=1,nSy
             DO bi=1,nSx
               biG = (mpi_myXGlobalLo(np1)-1)/sNx+bi
               bjG = (mpi_myYGlobalLo(np1)-1)/sNy+bj
               globalBuf(1,biG,bjG) = loc2Buf(1,bi,bj)
               globalBuf(2,biG,bjG) = loc2Buf(2,bi,bj)
             ENDDO
            ENDDO
          ENDDO

C--   Cumulate Sum over all tiles:
          globalBuf(3,1,1) = 0.
          bj = 1
          DO bi = 1,nSx*nPx-1
            globalBuf(3,1+bi,bj) = globalBuf(3,bi,bj)
     &                           + globalBuf(1,bi,bj)
          ENDDO
          DO bj = 1,nSy*nPy-1
           DO bi = 1,nSx*nPx
            globalBuf(3,bi,1+bj) = globalBuf(3,bi,bj)
     &                           + globalBuf(2,bi,bj)
           ENDDO
          ENDDO

C--   Process 0 fills in its local data
          np1 = 1
          DO bj=1,nSy
           DO bi=1,nSx
             biG = (mpi_myXGlobalLo(np1)-1)/sNx+bi
             bjG = (mpi_myYGlobalLo(np1)-1)/sNy+bj
             shareBufCS1_R8(bi,bj) = globalBuf(3,biG,bjG)
           ENDDO
          ENDDO

C--   Process 0 sends result to all other processes
          DO npe = 1, numberOfProcs-1
C-    fill local array with relevant portion of global array
            np1 = npe + 1
            DO bj=1,nSy
             DO bi=1,nSx
               biG = (mpi_myXGlobalLo(np1)-1)/sNx+bi
               bjG = (mpi_myYGlobalLo(np1)-1)/sNy+bj
               loc1Buf(bi,bj) = globalBuf(3,biG,bjG)
             ENDDO
            ENDDO
            CALL MPI_SEND (loc1Buf, lbuf1, MPI_DOUBLE_PRECISION,
     &           npe, itag, MPI_COMM_MODEL, ierr)
          ENDDO

        ENDIF

      ELSEIF (useCubedSphereExchange) THEN
#else /* not USE_MPI */
      IF   (useCubedSphereExchange) THEN
#endif /* ALLOW_USE_MPI */

C--   assume 1 tile / face, from bi=1 to 6, no MPI
        shareBufCS1_R8(1,1) = 0.
        bj = 1
        DO bi = 1,nSx-1
          nf = 1 + MOD(1+bi,2)
          shareBufCS1_R8(1+bi,bj) = shareBufCS1_R8(bi,bj)
     &                            + shareBufCS2_R8(nf,bi,bj)
        ENDDO
C-    fill in missing corner: 1 = North-West corner of face 1
C-                            2 = South-East corner of face 2
        bi = 1
        psiLoc(1) = shareBufCS1_R8(bi,bj) + shareBufCS2_R8(2,bi,bj)
        bi = MIN(2,nSx)
        psiLoc(2) = shareBufCS1_R8(bi,bj) + shareBufCS2_R8(1,bi,bj)

      ELSE

C--   Cumulate Sum over all tiles:
        shareBufCS1_R8(1,1) = 0.
        bj = 1
        DO bi = 1,nSx-1
          shareBufCS1_R8(1+bi,bj) = shareBufCS1_R8(bi,bj)
     &                            + shareBufCS2_R8(1,bi,bj)
        ENDDO
        DO bj = 1,nSy-1
         DO bi = 1,nSx
          shareBufCS1_R8(bi,1+bj) = shareBufCS1_R8(bi,bj)
     &                            + shareBufCS2_R8(2,bi,bj)
         ENDDO
        ENDDO

      ENDIF
      _END_MASTER( myThid )
C--   Everyone waits for Master thread to be ready
      CALL BAR2( myThid )

C--   set result for every thread
      DO bj = myByLo(myThid), myByHi(myThid)
       DO bi = myBxLo(myThid), myBxHi(myThid)
         psiZ(bi,bj) = shareBufCS1_R8(bi,bj)
       ENDDO
      ENDDO

#endif /* ALLOW_EXCH2 */

      RETURN
      END
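
C--   Usage sketch (illustrative only; the calling context and variable
C     names are hypothetical, only the interface is taken from above):
C     with per-tile increments dPsiX,dPsiY already computed by the caller,
C       CALL CUMULSUM_Z_TILE_RL( psiZ, psiLoc,
C    &                           dPsiX, dPsiY, myThid )
C     returns in psiZ(bi,bj) the cumulated sum at the corner location of
C     each tile and, in the cubed-sphere case only, the two missing-corner
C     values in psiLoc(1:2).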