C $Header: /u/gcmpack/MITgcm/pkg/exch2/exch2_rx2_cube.template,v 1.17 2012/09/03 19:39:25 jmc Exp $
C $Name:  $

#include "CPP_EEOPTIONS.h"
#undef LOCAL_DBUG

CBOP
C     !ROUTINE: EXCH_RX2_CUBE

C     !INTERFACE:
      SUBROUTINE EXCH2_RX2_CUBE(
     U            array1, array2,
     I            signOption, fieldCode,
     I            myOLw, myOLe, myOLs, myOLn, myNz,
     I            exchWidthX, exchWidthY,
     I            cornerMode, myThid )

C     !DESCRIPTION:
C     Two components vector field Exchange:
C     Fill-in tile-edge overlap-region of a 2 component vector field
C     with corresponding near-edge interior data point

C     !USES:
      IMPLICIT NONE

C     == Global data ==
#include "SIZE.h"
#include "EEPARAMS.h"
#include "EESUPPORT.h"
#include "W2_EXCH2_SIZE.h"
#include "W2_EXCH2_TOPOLOGY.h"
#include "W2_EXCH2_BUFFER.h"

C     !INPUT/OUTPUT PARAMETERS:
C     array1      :: 1rst component array with edges to exchange.
C     array2      :: 2nd  component array with edges to exchange.
C     signOption  :: Flag controlling whether vector is signed.
C     fieldCode   :: field code (position on staggered grid)
C     myOLw,myOLe :: West and East overlap region sizes.
C     myOLs,myOLn :: South and North overlap region sizes.
C     exchWidthX  :: Width of data region exchanged in X.
C     exchWidthY  :: Width of data region exchanged in Y.
C     cornerMode  :: halo-corner-region treatment: update/ignore corner region
C     myThid      :: Thread number of this instance of S/R EXCH...

      INTEGER myOLw, myOLe, myOLs, myOLn, myNz
      _RX array1(1-myOLw:sNx+myOLe,
     &           1-myOLs:sNy+myOLn,
     &           myNz, nSx, nSy)
      _RX array2(1-myOLw:sNx+myOLe,
     &           1-myOLs:sNy+myOLn,
     &           myNz, nSx, nSy)
      LOGICAL signOption
      CHARACTER*2 fieldCode
      INTEGER exchWidthX
      INTEGER exchWidthY
      INTEGER cornerMode
      INTEGER myThid

C     !LOCAL VARIABLES:
C     e2_msgHandles :: Synchronization and coordination data structure used to
C                   :: coordinate access to e2Bufr1_RX or to regulate message
C                   :: buffering. In PUT communication sender will increment
C                   :: handle entry once data is ready in buffer. Receiver will
C                   :: decrement handle once data is consumed from buffer.
C                   :: For MPI MSG communication MPI_Wait uses handle to check
C                   :: Isend has cleared. This is done in routine after receives.
C     note: a) current implementation does not use e2_msgHandles for communication
C              between threads: all-threads barriers are used (see CNH note below).
C              For a 2-threads synchro communication (future version),
C              e2_msgHandles should be shared (in common block, moved to BUFFER.h)
      INTEGER bi, bj
C     Variables for working through W2 topology
      INTEGER e2_msgHandles( 2, W2_maxNeighbours, nSx, nSy )
      INTEGER thisTile, farTile, N, nN, oN
      INTEGER tIlo1, tIhi1, tJlo1, tJhi1, oIs1, oJs1
      INTEGER tIlo2, tIhi2, tJlo2, tJhi2, oIs2, oJs2
      INTEGER tIStride, tJStride
      INTEGER tKlo, tKhi, tKStride
      INTEGER i1Lo, i1Hi, j1Lo, j1Hi, k1Lo, k1Hi
      INTEGER i2Lo, i2Hi, j2Lo, j2Hi, k2Lo, k2Hi
      LOGICAL updateCorners

#ifdef ALLOW_USE_MPI
      INTEGER iBufr1, iBufr2, nri, nrj
C     MPI stuff (should be in a routine call)
      INTEGER mpiStatus(MPI_STATUS_SIZE)
      INTEGER mpiRc
      INTEGER wHandle
#endif
CEOP

      updateCorners = cornerMode .EQ. EXCH_UPDATE_CORNERS
C-    Tile size of arrays to exchange:
      i1Lo  = 1-myOLw
      i1Hi  = sNx+myOLe
      j1Lo  = 1-myOLs
      j1Hi  = sNy+myOLn
      k1Lo  = 1
      k1Hi  = myNz
      i2Lo  = 1-myOLw
      i2Hi  = sNx+myOLe
      j2Lo  = 1-myOLs
      j2Hi  = sNy+myOLn
      k2Lo  = 1
      k2Hi  = myNz

C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|

C     Prevent anyone to access shared buffer while an other thread modifies it
      CALL BAR2( myThid )

C--   Post sends into buffer (buffer level 1):
      DO bj=myByLo(myThid), myByHi(myThid)
       DO bi=myBxLo(myThid), myBxHi(myThid)
        thisTile=W2_myTileList(bi,bj)
        nN=exch2_nNeighbours(thisTile)
        DO N=1,nN
          farTile=exch2_neighbourId(N,thisTile)
          oN = exch2_opposingSend(N,thisTile)
#ifdef LOCAL_DBUG
          WRITE(errorMessageUnit,'(A,3I3,A,4I4,A,2I5)')
     &      'send_0 bi,N=', bi,bj, N, ' , tI,J_lo,hi=',
     &                  exch2_iLo(oN,farTile), exch2_iHi(oN,farTile),
     &                  exch2_jLo(oN,farTile), exch2_jHi(oN,farTile),
     &      ' , oIs,oJs=', exch2_oi(N,thisTile), exch2_oj(N,thisTile)
#endif
          CALL EXCH2_GET_UV_BOUNDS(
     I               fieldCode, exchWidthX, updateCorners,
     I               farTile, oN,
     O               tIlo1, tIhi1, tJlo1, tJhi1,
     O               tIlo2, tIhi2, tJlo2, tJhi2,
     O               tiStride, tjStride,
     O               oIs1, oJs1, oIs2, oJs2,
     I               myThid )
#ifdef LOCAL_DBUG
          WRITE(errorMessageUnit,'(A,3I3,A,4I4,A,2I5)')
     &      'send_1 bi,N=', bi,bj, N, ' , tI,J_lo,hi=',
     &      tIlo1, tIhi1, tJlo1, tJhi1, ' , oIs,oJs=', oIs1, oJs1
          WRITE(errorMessageUnit,'(A,3I3,A,4I4,A,2I5)')
     &      'send_2 bi,N=', bi,bj, N, ' , tI,J_lo,hi=',
     &      tIlo2, tIhi2, tJlo2, tJhi2, ' , oIs,oJs=', oIs2, oJs2
#endif
          tKLo=1
          tKHi=myNz
          tKStride=1
C-    Put my points in buffer for neighbour N to fill points
C     (tIlo:tIhi:tiStride,tJlo:tJhi,tJStride,tKlo:tKhi,tKStride)
C     in its copy of "array1" & "array2".
          CALL EXCH2_PUT_RX2(
     I               tIlo1, tIhi1, tIlo2, tIhi2, tiStride,
     I               tJlo1, tJhi1, tJlo2, tJhi2, tjStride,
     I               tKlo, tKhi, tkStride,
     I               oIs1, oJs1, oIs2, oJs2,
     I               thisTile, N,
     I               e2BufrRecSize,
     O               iBuf1Filled(N,bi,bj),    iBuf2Filled(N,bi,bj),
     O               e2Bufr1_RX(1,N,bi,bj,1), e2Bufr2_RX(1,N,bi,bj,1),
     I               array1(1-myOLw,1-myOLs,1,bi,bj),
     I               array2(1-myOLw,1-myOLs,1,bi,bj),
     I               i1Lo, i1Hi, j1Lo, j1Hi, k1Lo, k1Hi,
     I               i2Lo, i2Hi, j2Lo, j2Hi, k2Lo, k2Hi,
     O               e2_msgHandles(1,N,bi,bj),
     I               W2_myCommFlag(N,bi,bj), signOption, myThid )
        ENDDO
       ENDDO
      ENDDO

C     Wait until all threads finish filling buffer
      CALL BAR2( myThid )

C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|

#ifdef ALLOW_USE_MPI
      IF ( usingMPI ) THEN

      _BEGIN_MASTER( myThid )

C--   Send my data (in buffer, level 1) to target Process
      DO bj=1,nSy
       DO bi=1,nSx
        thisTile=W2_myTileList(bi,bj)
        nN=exch2_nNeighbours(thisTile)
        DO N=1,nN
C-    Skip the call if this is an internal exchange
         IF ( W2_myCommFlag(N,bi,bj) .EQ. 'M' ) THEN
          CALL EXCH2_SEND_RX2(
     I               thisTile, N,
     I               e2BufrRecSize,
     I               iBuf1Filled(N,bi,bj),    iBuf2Filled(N,bi,bj),
     I               e2Bufr1_RX(1,N,bi,bj,1), e2Bufr2_RX(1,N,bi,bj,1),
     O               e2_msgHandles(1,N,bi,bj),
     I               W2_myCommFlag(N,bi,bj), myThid )
         ENDIF
        ENDDO
       ENDDO
      ENDDO

C--   Receive data (in buffer, level 2) from source Process
      DO bj=1,nSy
       DO bi=1,nSx
        thisTile=W2_myTileList(bi,bj)
        nN=exch2_nNeighbours(thisTile)
        DO N=1,nN
C-    Skip the call if this is an internal exchange
         IF ( W2_myCommFlag(N,bi,bj) .EQ. 'M' ) THEN
          CALL EXCH2_GET_UV_BOUNDS(
     I               fieldCode, exchWidthX, updateCorners,
     I               thisTile, N,
     O               tIlo1, tIhi1, tJlo1, tJhi1,
     O               tIlo2, tIhi2, tJlo2, tJhi2,
     O               tiStride, tjStride,
     O               oIs1, oJs1, oIs2, oJs2,
     I               myThid )
          nri = 1 + (tIhi1-tIlo1)/tiStride
          nrj = 1 + (tJhi1-tJlo1)/tjStride
          iBufr1 = nri*nrj*myNz
          nri = 1 + (tIhi2-tIlo2)/tiStride
          nrj = 1 + (tJhi2-tJlo2)/tjStride
          iBufr2 = nri*nrj*myNz
C       Receive from neighbour N to fill buffer and later on the array
          CALL EXCH2_RECV_RX2(
     I               thisTile, N,
     I               e2BufrRecSize,
     I               iBufr1, iBufr2,
     I               e2Bufr1_RX(1,N,bi,bj,2), e2Bufr2_RX(1,N,bi,bj,2),
     I               W2_myCommFlag(N,bi,bj), myThid )
         ENDIF
        ENDDO
       ENDDO
      ENDDO

C--   Clear message handles/locks
      DO bj=1,nSy
       DO bi=1,nSx
        thisTile=W2_myTileList(bi,bj)
        nN=exch2_nNeighbours(thisTile)
        DO N=1,nN
C     Note: In a between process tile-tile data transport using
C           MPI the sender needs to clear an Isend wait handle here.
C           In a within process tile-tile data transport using true
C           shared address space/or direct transfer through commonly
C           addressable memory blocks the receiver needs to assert
C           that he has consumed the buffer the sender filled here.
         farTile=exch2_neighbourId(N,thisTile)
         IF     ( W2_myCommFlag(N,bi,bj) .EQ. 'M' ) THEN
          wHandle = e2_msgHandles(1,N,bi,bj)
          CALL MPI_Wait( wHandle, mpiStatus, mpiRc )
          wHandle = e2_msgHandles(2,N,bi,bj)
          CALL MPI_Wait( wHandle, mpiStatus, mpiRc )
         ELSEIF ( W2_myCommFlag(N,bi,bj) .EQ. 'P' ) THEN
         ELSE
         ENDIF
        ENDDO
       ENDDO
      ENDDO

      _END_MASTER( myThid )
C     Everyone waits until master-thread finishes receiving
      CALL BAR2( myThid )

      ENDIF
#endif /* ALLOW_USE_MPI */

C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|

C--   Extract from buffer (either from level 1 if local exch,
C                     or level 2 if coming from an other Proc)
      DO bj=myByLo(myThid), myByHi(myThid)
       DO bi=myBxLo(myThid), myBxHi(myThid)
        thisTile=W2_myTileList(bi,bj)
        nN=exch2_nNeighbours(thisTile)
        DO N=1,nN
#ifdef LOCAL_DBUG
          WRITE(errorMessageUnit,'(A,3I3,A,4I4,A,2I5)')
     &      'recv_0 bi,N=', bi,bj, N, ' , tI,J_lo,hi=',
     &          exch2_iLo(N,thisTile), exch2_iHi(N,thisTile),
     &          exch2_jLo(N,thisTile), exch2_jHi(N,thisTile)
#endif
          CALL EXCH2_GET_UV_BOUNDS(
     I               fieldCode, exchWidthX, updateCorners,
     I               thisTile, N,
     O               tIlo1, tIhi1, tJlo1, tJhi1,
     O               tIlo2, tIhi2, tJlo2, tJhi2,
     O               tiStride, tjStride,
     O               oIs1, oJs1, oIs2, oJs2,
     I               myThid )
#ifdef LOCAL_DBUG
          WRITE(errorMessageUnit,'(A,3I3,A,4I4,A,2I5)')
     &      'recv_1 bi,N=', bi,bj, N, ' , tI,J_lo,hi=',
     &      tIlo1, tIhi1, tJlo1, tJhi1
          WRITE(errorMessageUnit,'(A,3I3,A,4I4,A,2I5)')
     &      'recv_2 bi,N=', bi,bj, N, ' , tI,J_lo,hi=',
     &      tIlo2, tIhi2, tJlo2, tJhi2
#endif
          tKLo=1
          tKHi=myNz
          tKStride=1

C     From buffer, get my points
C     (tIlo:tIhi:tiStride,tJlo:tJhi,tJStride,tKlo:tKhi,tKStride) in "array1,2":
C     Note: when transferring data within a process:
C      o e2Bufr entry to read is entry associated with opposing send record
C      o e2_msgHandle entry to read is entry associated with opposing send record.
          CALL EXCH2_GET_RX2(
     I               tIlo1, tIhi1, tIlo2, tIhi2, tiStride,
     I               tJlo1, tJhi1, tJlo2, tJhi2, tjStride,
     I               tKlo, tKhi, tkStride,
     I               thisTile, N, bi, bj,
     I               e2BufrRecSize, W2_maxNeighbours, nSx, nSy,
     I               e2Bufr1_RX, e2Bufr2_RX,
     U               array1(1-myOLw,1-myOLs,1,bi,bj),
     U               array2(1-myOLw,1-myOLs,1,bi,bj),
     I               i1Lo, i1Hi, j1Lo, j1Hi, k1Lo, k1Hi,
     I               i2Lo, i2Hi, j2Lo, j2Hi, k2Lo, k2Hi,
     U               e2_msgHandles,
     I               W2_myCommFlag(N,bi,bj), myThid )
        ENDDO
       ENDDO
      ENDDO

      RETURN
      END

C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|

CEH3 ;;; Local Variables: ***
CEH3 ;;; mode:fortran ***
CEH3 ;;; End: ***
