c -*- mode: fortran -*- onestep.m4 c Perform one timesteps of the 3d update (3 1-d sweeps) c c m 4 command: m 4 -B99999 onestep.m4 >onestep.f c use real*4 precision define(ifsngl,1) c ----------------------------- choose wisely: c use sppm kernel define(ifsppm,0) c use Paul's new kernel define(ifppmvec,0) c use ppmlib define(ifppmlib,1) c ----------------------------- c use f90 allocate define(ifmema,1) c ---------------------------------------------------------------- changequote(<<,>>) define(SWEEP,<< subroutine $1sweep( $1l, dddold, dddnew, gamma, dtime, & ix_first, ix_last, iy_first, iy_last, iz_first, iz_last, & nofx, nofy, nofz, ipen, nthreads, nstep, mythread,idosum ) implicit none INCLUDE 'iq.h' c Thread loop counters kept here. Eventually we will use this information c to remove some of the barriers - for now, there is an explicit barrier c between sweeps. integer nthreads, nstep, idosum c ipen will be pointing at one of the counters in /sweepc/ integer ipen, mypen integer npens, mythread INCLUDE 'myomp.h' integer IFETCHADD external IFETCHADD c -- integer i$1_first, i$2_first, i$3_first integer i$1_last, i$2_last, i$3_last integer nof$1, nof$2, nof$3 integer n$1todo, n$2todo, n$3todo integer i$2pen, i$3pen, n$2pens, n$3pens integer ix, iy, iz, kx, ky, kz, k, i integer jx,jy,jz integer i$1start, i$1stop, noff$1 integer i$2start, i$2stop, noff$2 integer i$3start, i$3stop, noff$3 integer j$1start, j$1stop integer j$2start, j$2stop, j$3start, j$3stop real gamma, dtime, cournt integer NROWS parameter( NROWS=8 ) ifelse(ifppmvec,1,<< real*8 SMLRHO,SMALLP,SMALLU,SMALLE real*8 dtime8,cournt8,gamma8,difcon8,shkjmp8,wavhaf8 parameter (difcon8=0.3) parameter (shkjmp8=0.25) parameter (wavhaf8=1.0) integer nflopcounts(8) c nflopcounts(1) = nflops c nflopcounts(2) = nadds c nflopcounts(3) = nmults c nflopcounts(4) = nrecips c nflopcounts(5) = ncvmgms c nflopcounts(6) = nsqrts c nflopcounts(7) = nrsqrts c nflopcounts(8) = nexps c c>>,<< real SMLRHO,SMALLP,SMALLU,SMALLE c>>) parameter( SMLRHO=1.0e-06) parameter( SMALLP=1.0e-06) parameter( SMALLU=1.0e-06) parameter( SMALLE=1.0e-06) c The following param. serves solely to shorten the 'call ppm' line. integer o parameter (o=1-nbdy) c c These are the source and destination global storage: real dddold(nvar, & 1-nbdyf:npx+nbdyf, 1-nbdyf:npy+nbdyf, 1-nbdyf:npz+nbdyf) real dddnew(nvar, & 1-nbdyf:npx+nbdyf, 1-nbdyf:npy+nbdyf, 1-nbdyf:npz+nbdyf) real*8 $1l(1-nbdy:nof$1+nbdy+1 ) c Poor old ppm is real*4, so must have this copied real*4 $1l4(1-nbdy:nof$1+nbdy+1) c For the OPENmp implementation, we need to 'share' these variables which c are computed by the thread which enters the sweep routine first. c The parallel thread(s) then reference these in the parallel do. c save n$1todo,n$2todo,n$3todo, n$2pens,n$3pens c save $1l4, npens c This is one new strip of zones. c Only i$1start:i$1stop is actually retu rned. ifelse(ifppmvec,1,<< c real*8 rhonu,pnu,uxnu,uynu,uznu,diagnu c real*8 rrho,pp,uux,uuy,uuz,diag c>>,<< real rhonu,pnu,uxnu,uynu,uznu,diagnu real rrho,pp,uux,uuy,uuz,diag c>>) ifelse(ifmema,1,<< real ifelse(ifppmvec,1,*8), allocatable, dimension(:):: rhonu,pnu,uxnu,uynu,uznu,diagnu c >>,<< c instead of nof$1, use nnmm real ifelse(ifppmvec,1,*8) rhonu,pnu,uxnu,uynu,uznu,diagnu dimension rhonu(1-nbdy:nof$1+nbdy) dimension pnu(1-nbdy:nof$1+nbdy) dimension uxnu(1-nbdy:nof$1+nbdy) dimension uynu(1-nbdy:nof$1+nbdy) dimension uznu(1-nbdy:nof$1+nbdy) dimension diagnu(1-nbdy:nof$1+nbdy) c>>) c This holds one new, updated pencil of zones. c The 'o' is for convenience in loop addressing. c ifelse(ifmema,1,<< real, allocatable, dimension(:,:,:,:):: dddnupen c >>,<< c nof$1 real dddnupen( nvar, 1-nbdy:nof$1+nbdy, NROWS, NROWS ) c>>) c This holds one pencil, to be used in ppm calls - c updates to dddnupen, above. The 2nd two dimensions could really c use nbdy2, but this matches the convention in David'd ppm ifelse(ifmema,1,<< real ifelse(ifppmvec,1,*8), allocatable, dimension(:,:,:):: rrho,pp,uux,uuy,uuz,diag c>>,<< real ifelse(ifppmvec,1,*8) rrho,pp,uux,uuy,uuz,diag dimension rrho( 1-nbdy:nof$1+nbdy, & 1-nbdy:NROWS+nbdy, 1-nbdy:NROWS+nbdy) dimension pp( 1-nbdy:nof$1+nbdy, & 1-nbdy:NROWS+nbdy, 1-nbdy:NROWS+nbdy) dimension uux( 1-nbdy:nof$1+nbdy, & 1-nbdy:NROWS+nbdy, 1-nbdy:NROWS+nbdy) dimension uuy( 1-nbdy:nof$1+nbdy, & 1-nbdy:NROWS+nbdy, 1-nbdy:NROWS+nbdy) dimension uuz( 1-nbdy:nof$1+nbdy, & 1-nbdy:NROWS+nbdy, 1-nbdy:NROWS+nbdy) dimension diag( 1-nbdy:nof$1+nbdy, & 1-nbdy:NROWS+nbdy, 1-nbdy:NROWS+nbdy) c>>) external getMillisec integer*8 nflops(32) integer startppm, stopppm, timeppm(32), timesweep(32), getMillisec common /ppmtimer/ timeppm, nflops, timesweep real*8 psumkntc, psumprs, dens,velx,vely,velz real*8 sumkntc(32), sumprs(32) common /stats/ sumkntc, sumprs c The tile xy position in the 2-D decomposition integer itx, ity, goaway c The re turned max courant number globally and per thread real courmxs(MAXTHREADS) common /conestep/ goaway, itx, ity, courmxs c c================================================================ c write(0,*) "*******!!!!!!****** started $1 sweep" c write(0,*) "nstep=", nstep c Update strategy c =============== c c The update proceeds in a sweep of 'pencils', each nrows x nrows c in the axes $2 and $3. Each pencil is n$1 real zones long. c c c ................................ c c Copy pencil to un-interleaved (terleaved?) pencil buffers c c Number of zones to compute in c the three directions are c n$1todo = i$1_last - i$1_first + 1 n$2todo = i$2_last - i$2_first + 1 n$3todo = i$3_last - i$3_first + 1 c Number of pencils (including c partials) in the $2 direction is c n$2pens = ( n$2todo + NROWS -1 ) / NROWS n$3pens = ( n$3todo + NROWS -1 ) / NROWS c ifelse(ifmema,1,<< call t_lock(7) c$omp critical allocate( rhonu(1-nbdy:nof$1+nbdy) ) allocate( pnu(1-nbdy:nof$1+nbdy) ) allocate( uxnu(1-nbdy:nof$1+nbdy) ) allocate( uynu(1-nbdy:nof$1+nbdy) ) allocate( uznu(1-nbdy:nof$1+nbdy) ) allocate(diagnu(1-nbdy:nof$1+nbdy) ) allocate( dddnupen(nvar,1-nbdy:nof$1+nbdy,NROWS,NROWS) ) allocate( rrho(1-nbdy:nof$1+nbdy, 1-nbdy:NROWS+nbdy, 1-nbdy:NROWS+nbdy) ) allocate( pp(1-nbdy:nof$1+nbdy, 1-nbdy:NROWS+nbdy, 1-nbdy:NROWS+nbdy) ) allocate( uux(1-nbdy:nof$1+nbdy, 1-nbdy:NROWS+nbdy, 1-nbdy:NROWS+nbdy) ) allocate( uuy(1-nbdy:nof$1+nbdy, 1-nbdy:NROWS+nbdy, 1-nbdy:NROWS+nbdy) ) allocate( uuz(1-nbdy:nof$1+nbdy, 1-nbdy:NROWS+nbdy, 1-nbdy:NROWS+nbdy) ) allocate( diag(1-nbdy:nof$1+nbdy, 1-nbdy:NROWS+nbdy, 1-nbdy:NROWS+nbdy) ) call t_unlock(7) c$omp end critical c>>) c write(0,*) ' thru allocs ', mythread c******* c if ( mythread.ne.2 ) goto 9999 c Setup invariant stuff for ppm call. ppmvec uses $1l directly. do i= 1-nbdy, nof$1+nbdy $1l4(i)= $1l(i) enddo $1l4(nof$1+nbdy+1)= $1l(nof$1+nbdy+1) npens= n$2pens * n$3pens c ---------------------------------------------------------------- 1000 continue mypen = IFETCHADD( ipen, 1 ) + 1 if ( mypen .gt. npens ) goto 2000 c mypen= ipen+1 c ipen= mypen c c$omp do schedule(static,1) c do 2000 mypen= 1, npens c mythread= 1 c!$ mythread= omp_get_thread_num() + 1 c write(0,*) mythread, " omp_get_num_threads()=", omp_get_num_threads() c c (start loop over n$3pens cccp do 2000 i$3pen = 1, n$3pens c (start loop over n$2pens cccp do 2100 i$2pen = 1, n$2pens cccp do 2000 i$3$2pen = 1, n$3pens * n$2pens i$3pen = ( mypen + n$2pens - 1) / n$2pens i$2pen = mypen - (i$3pen-1)*n$2pens c c Compute pencil bounds to update (istart:istop) and the additive offset from c global bounds to pencil bounds (noff) c c The size of each pencil is normally NROWSxNROWS, but it won't come out c evenly necessarily. c c Also, compute the global bounds to fetch (jstart:jstop) from dddold i$3start = i$3_first + (i$3pen-1) * NROWS i$3stop = i$3start + NROWS - 1 i$3stop = min (i$3stop,i$3_last) j$3start = i$3start - nbdy2 j$3stop = i$3stop + nbdy2 noff$3 = 1 - i$3start i$2start = i$2_first + (i$2pen-1) * NROWS i$2stop = i$2start + NROWS - 1 i$2stop = min (i$2stop,i$2_last) j$2start = i$2start - nbdy2 j$2stop = i$2stop + nbdy2 noff$2 = 1 - i$2start i$1start = i$1_first i$1stop = i$1_last j$1start = i$1start - nbdy j$1stop = i$1stop + nbdy noff$1 = 1 - i$1start c o = (1 - nbdy) c write(0,*) mythread,"************pencil ",i$2pen, i$3pen, mypen c write(0,*) "===================calchyd$1========================" c write(0,*) "i$3start, i$3stop, noff$3=", i$3start, i$3stop, noff$3 c write(0,*) "i$2start, i$2stop, noff$2=", i$2start, i$2stop, noff$2 c write(0,*) "i$1start, i$1stop, noff$1=", i$1start, i$1stop, noff$1 c write(0,*) "&" c write(0,*) "noff$3+j$3start, j$3stop=", noff$3+j$3start, noff$3+j$3stop c write(0,*) "noff$2+j$2start, j$2stop=", noff$2+j$2start, noff$2+j$2stop c write(0,*) "noff$1+j$1start, j$1stop=", noff$1+j$1start, noff$1+j$1stop c write(0,*) "nvar, nbdy, nbdy2, nbdyf=", nvar,nbdy,nbdy2,nbdyf c write(0,*) "===================calchyd$1========================" c ---------------------------------------------------------------- c Extract the pencil including surrounds c write(0,*) mythread,"***",jzstart,jzstop, jystart,jystop, jxstart,jxstop do iz = jzstart, jzstop jz= iz+noffz do iy = jystart, jystop jy= iy+noffy do ix = jxstart, jxstop jx= ix+noffx c write(0,*) jx,jy,jz, ix,iy,iz, " rrrrrrrrhhhhh o " rrho(j$1,j$2,j$3) = dddold(1,ix,iy,iz) pp(j$1,j$2,j$3) = dddold(2,ix,iy,iz) uux(j$1,j$2,j$3) = dddold(3,ix,iy,iz) uuy(j$1,j$2,j$3) = dddold(4,ix,iy,iz) uuz(j$1,j$2,j$3) = dddold(5,ix,iy,iz) diag(j$1,j$2,j$3) = dddold(6,ix,iy,iz) enddo enddo enddo c write(0,*) mythread, "extracted" c ---------------------------------------------------------------- c Apply hydro to the extracted pencil ifelse(ifppmvec,1,<< do i= 1, 8 nflopcounts(i)= 0 enddo c>>) startppm= getMillisec() cournt= 0 c Use private sums psumkntc= 0 psumprs= 0 do 1200 k$3 = 1, i$3stop-i$3start+1 do 1100 k$2 = 1, i$2stop-i$2start+1 c dhp bke c (+u2/-u2) <--> (o/d) <--> (t/b) c (+u3/-u3) <--> (t/b) <--> (p/m) c write(0,*) "calling ppm" ifelse(ifsppm,1,<< call sppm( $1l4(o), rrho(o,k$2,k$3), rrho(o,k$2,k$3-2), rrho(o,k$2,k$3+2), & rrho(o,k$2-2,k$3),rrho(o,k$2+2,k$3), & pp(o,k$2,k$3),pp(o,k$2,k$3-2),pp(o,k$2,k$3+2),pp(o,k$2-2,k$3),pp(o,k$2+2,k$3), & uu$1(o,k$2,k$3),uu$1(o,k$2,k$3-2),uu$1(o,k$2,k$3+2),uu$1(o,k$2-2,k$3),uu$1(o,k$2+2,k$3), & uu$2(o,k$2,k$3),uu$2(o,k$2,k$3-1),uu$2(o,k$2,k$3+1),uu$2(o,k$2,k$3-2),uu$2(o,k$2,k$3+2), & uu$2(o,k$2-2,k$3),uu$2(o,k$2+2,k$3), & uu$3(o,k$2,k$3),uu$3(o,k$2,k$3-2),uu$3(o,k$2,k$3+2),uu$3(o,k$2-1,k$3),uu$3(o,k$2+1,k$3), & uu$3(o,k$2-2,k$3),uu$3(o,k$2+2,k$3), & rhonu(o),pnu(o),u$1nu(o), u$2nu(o), u$3nu(o), dtime,cournt,gamma, & smlrho, smalle, smallp, smallu, n$1todo, nbdy ) do i= 1, n$1todo diagnu(i)= diag(i,k$2,k$3) enddo c>>) ifelse(ifppmvec,1,<< dtime8= dtime gamma8= gamma cournt8= cournt call ppmvec( $1l,rrho,pp,uu$1,uu$2,uu$3,diag, & rhonu(o),pnu(o),u$1nu(o),u$2nu(o),u$3nu(o),diagnu(o), & dtime8,cournt8,gamma8, SMLRHO,SMALLE,SMALLP,SMALLU, & difcon8, shkjmp8, wavhaf8, n$1todo, nbdy, nflopcounts, k$2,k$3 ) c -- or dummyup: c cournt8= 0.8 c do i= 1, n$1todo c rhonu(i)= rrho(i,k$2,k$3) c pnu(i)= pp(i,k$2,k$3) c uxnu(i)= uux(i,k$2,k$3) c uynu(i)= uuy(i,k$2,k$3) c uznu(i)= uuz(i,k$2,k$3) c diagnu(i)= diagnu(i) c enddo c call ppmvec ( $1l,rrho(o,k$2,k$3-2),rrho(o,k$2,k$3),rrho(o,k$2,k$3+2), c & rrho(o,k$2+2,k$3),rrho(o,k$2-2,k$3), c & pp(o,k$2,k$3-2),pp(o,k$2,k$3),pp(o,k$2,k$3+2),pp(o,k$2+2,k$3),pp(o,k$2-2,k$3), c & uu$1(o,k$2,k$3-2),uu$1(o,k$2,k$3),uu$1(o,k$2,k$3+2),uu$1(o,k$2+2,k$3),uu$1(o,k$2-2,k$3), c & uu$2(o,k$2,k$3-2),uu$2(o,k$2,k$3-1),uu$2(o,k$2,k$3),uu$2(o,k$2,k$3+1),uu$2(o,k$2,k$3+2), c & uu$2(o,k$2+2,k$3),uu$2(o,k$2-2,k$3), c & uu$3(o,k$2,k$3-2),uu$3(o,k$2,k$3),uu$3(o,k$2,k$3+2),uu$3(o,k$2+2,k$3),uu$3(o,k$2+1,k$3), c & uu$3(o,k$2-1,k$3),uu$3(o,k$2-2,k$3), c & diag(o,k$2,k$3), c & rhonu(o),pnu(o),u$1nu(o), u$2nu(o), u$3nu(o), diagnu(o), c & dtime8,cournt8,gamma8, SMLRHO,SMALLE,SMALLP,SMALLU, c & difcon8, shkjmp8, wavhaf8, n$1todo, nbdy, nflopcounts ) cournt= cournt8 c Note all (floating point) arguments are real*8 c subroutine ppmvec(vvv1,rhozbb,vvv2,rhoztt,rhozff,rhoznn, c & pzbb,vvv3,pztt,pzff,pznn, c & uxzbb,vvv4,uxztt,uxzff,uxznn, c & uyzbb,uyzb,vvv5,uyzt,uyztt,uyzff,uyznn, c & uzzbb,vvv6,uzztt,uzzff,uzzf,uzzn,uzznn, c & fwlzbb,fwllzb,vvv7,fwllzt,fwlztt, c & fwlzff,fwllzf,fwllzn,fwlznn, c & rhonu,pnu,uxnu,uynu,uznu, c & ccc1,ccc2,ccc3,ccc4,ccc5,ccc6,ccc7, c & ccdifcon, ccshkjmp, ccwavhaf, iii1,iii2, c & nflopcounts) c>>) ifelse(ifppmlib,1,<< call ifelse(ifsngl,1,,d_)do_ppm_de0_3dles_gam ( & $1l4, rrho,pp,uu$1,uu$2,uu$3,diag, & rhonu,pnu,u$1nu,u$2nu,u$3nu,diagnu, & gamma, dtime, & smlrho, smallp, smallu, smalle, cournt, & n$1todo, nbdy, NROWS, k$2, k$3 ) c call ifelse(ifsngl,1,,d_)do_ppm_de0_3dles_gam ( c 1 $1l4(o), rrho(o,k$2-2,k$3), rrho(o,k$2,k$3), rrho(o,k$2+2,k$3), c 1 rrho(o,k$2,k$3-2), rrho(o,k$2,k$3+2), c 1 pp(o,k$2-2,k$3), pp(o,k$2,k$3), pp(o,k$2+2,k$3), c 1 pp(o,k$2,k$3-2), pp(o,k$2,k$3+2), c 1 uu$1(o,k$2-2,k$3), uu$1(o,k$2,k$3), uu$1(o,k$2+2,k$3), c 1 uu$1(o,k$2,k$3-2), uu$1(o,k$2,k$3+2), c 1 uu$2(o,k$2-2,k$3), uu$2(o,k$2-1,k$3), uu$2(o,k$2,k$3), c 1 uu$2(o,k$2+1,k$3), uu$2(o,k$2+2,k$3), uu$2(o,k$2,k$3-2), c 1 uu$2(o,k$2,k$3+2), c 1 uu$3(o,k$2,k$3-2), uu$3(o,k$2,k$3-1), uu$3(o,k$2,k$3), c 1 uu$3(o,k$2,k$3+1), uu$3(o,k$2,k$3+2), uu$3(o,k$2-2,k$3), c 1 uu$3(o,k$2+2,k$3), c 1 rhonu(o), pnu(o), u$1nu(o), u$2nu(o), u$3nu(o), gamma, dtime, c 1 smlrho, smallp, smallu, smalle, cournt, c 1 n$1todo, nbdy) c>>) if ( cournt .le. 0 .or. cournt .gt. 1. ) then write(0,*) "===================calchyd$1========================" write(0,*) "i$3start, i$3stop, noff$3=", i$3start, i$3stop, noff$3 write(0,*) "i$2start, i$2stop, noff$2=", i$2start, i$2stop, noff$2 write(0,*) "i$1start, i$1stop, noff$1=", i$1start, i$1stop, noff$1 write(0,*) "&" write(0,*) "noff$3+j$3start, j$3stop=", noff$3+j$3start, noff$3+j$3stop write(0,*) "noff$2+j$2start, j$2stop=", noff$2+j$2start, noff$2+j$2stop write(0,*) "noff$1+j$1start, j$1stop=", noff$1+j$1start, noff$1+j$1stop write(0,*) "nvar, nbdy, nbdy2, nbdyf=", nvar,nbdy,nbdy2,nbdyf write(0,*) "===================calchyd$1========================" write(0,*) "o,pencil,nbdy ",o, i$2pen, i$3pen, nbdy write(0,*) "**********Ack: $1sweep k$2,k$3=",k$2,k$3,"cournt=",cournt write(0,*) " *********dtime,gamma=",dtime,gamma write(0,2) "i","xl", "rho", "p","uux","uuy","uuz" 2 format(a5,1x,6a12) do i= 1-nbdy, n$1todo+nbdy write(0,3) i,$1l(i),rrho(i,k$2,k$3),pp(i,k$2,k$3), & uux(i,k$2,k$3),uuy(i,k$2,k$3),uuz(i,k$2,k$3) enddo 3 format(i5,1x,6e12.5) stop endif do i= i$1start+noff$1, i$1stop+noff$1 dddnupen(1,i, k$2, k$3 ) = rhonu(i) dddnupen(2,i, k$2, k$3 ) = pnu(i) dddnupen(3,i, k$2, k$3 ) = uxnu(i) dddnupen(4,i, k$2, k$3 ) = uynu(i) dddnupen(5,i, k$2, k$3 ) = uznu(i) dddnupen(6,i, k$2, k$3 ) = diagnu(i) dens= rhonu(i) velx= uxnu(i) vely= uynu(i) velz= uznu(i) psumkntc = psumkntc + dens * (velx*velx + vely*vely + velz*velz ) psumprs = psumprs + pnu(i) c psumkntc = psumkntc + rhonu(i) * c & ( uxnu(i)*uxnu(i) + uynu(i)*uynu(i) + uznu(i)*uznu(i) ) c psumprs = psumprs + pnu(i) enddo 1100 continue 1200 continue stopppm= getMillisec() if ( idosum .ne. 0 ) then sumkntc(mythread)= sumkntc(mythread)+psumkntc sumprs(mythread)= sumprs(mythread)+psumprs endif courmxs(mythread)= max( cournt, courmxs(mythread) ) timeppm(mythread)= timeppm(mythread) + (stopppm-startppm) ifelse(ifppmvec,1,<< nflops(mythread)= nflops(mythread) + nflopcounts(1) c>>) c ---------------------------------------------------------------- c Store the pencil all at once to best use the cache. do iz = izstart, izstop jz= iz+noffz do iy = iystart, iystop jy= iy+noffy do ix = ixstart, ixstop jx= ix + noffx c unrolled c do k=1,nvar dddnew(1, ix, iy, iz) = dddnupen(1, j$1,j$2,j$3 ) dddnew(2, ix, iy, iz) = dddnupen(2, j$1,j$2,j$3 ) dddnew(3, ix, iy, iz) = dddnupen(3, j$1,j$2,j$3 ) dddnew(4, ix, iy, iz) = dddnupen(4, j$1,j$2,j$3 ) dddnew(5, ix, iy, iz) = dddnupen(5, j$1,j$2,j$3 ) dddnew(6, ix, iy, iz) = dddnupen(6, j$1,j$2,j$3 ) c enddo d if ( dddnew(1,ix,iy,iz) .eq. 0. ) then d write(0,*) "STORED ZERO RHO at ix,iy,iz",ix,iy,iz d write(0,*) "noff$1$2$3=", noff$1, noff$2, noff$3 d write(0,*) "i$3start, i$3stop, noff$3=", i$3start, i$3stop, noff$3 d write(0,*) "i$2start, i$2stop, noff$2=", i$2start, i$2stop, noff$2 d write(0,*) "i$1start, i$1stop, noff$1=", i$1start, i$1stop, noff$1 d endif enddo enddo enddo c 2100 continue c if ( mythread.eq.1 ) write(0,*) "pencil done" c goto 1000 c ) end n$3pen loop 2000 continue !$omp end do c if ( mythread.eq.1 ) write(0,*) "done, freeing" c ---------------------------------------------------------------- 9999 continue ifelse(ifmema,1,<< call t_lock(7) c$omp critical deallocate( dddnupen ) deallocate( rrho ) deallocate( pp ) deallocate( uux ) deallocate( uuy ) deallocate( uuz ) deallocate( diag ) deallocate(diagnu) deallocate(pnu) deallocate(rhonu) deallocate(uxnu) deallocate(uynu) deallocate(uznu) call t_unlock(7) c$omp end critical c>>) c write(0,*) mythread, "end $1 sweep courmx=", courmxs(mythread) call ABARRIER(nthreads) return end cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc c) end of sweep macro c >>) SWEEP(x,y,z) SWEEP(y,x,z) SWEEP(z,x,y) c c To perform a SINGLE timestep update, the 'master' thread should c set /conestep/ goaway, itx, ity, courmx[nthreads] c (setting courmx to zero, and the tilexy to the tilenumber to update). c c is the current timestep number, and controls the order of the c sweeps: (x,y,z) if nstep is even, (z,y,x) if nstep is odd. c c The assisting threads, 2..nthreads do not exit this routine, but c loop and sit on the semaphore'd barrier. c c Note that the result is in ddd1() for a set of 3 sweeps. c subroutine onestep( mythread ) implicit none c My thread number, 1.. integer mythread INCLUDE 'iq.h' INCLUDE 'context.h' INCLUDE 'globals.h' INCLUDE 'thingftn.h' integer nid,myid c The tile xy position in the 2-D decomposition integer itx, ity, goaway c The returned max courant number globally and per thread real courmx, courmxs(MAXTHREADS) common /conestep/ goaway, itx, ity, courmxs real*8 sumkntc(32), sumprs(32) common /stats/ sumkntc, sumprs c integer i real cournt integer ox,oy,oz, nofx,nofy,nofz integer ix_first, ix_last integer iy_first, iy_last integer iz_first, iz_last integer nbdyl_pass, nbdyt_pass, nbdy_total integer starttime, stoptime, nzones, getMillisec external getMillisec real elapsed, flops INCLUDE 'myomp.h' c Thread loop counters kept here. Eventually we will use this information c to remove some of the barriers - for now, there is an explicit barrier c between sweeps. c integer izypen1, izxpen2, iyxpen3, iyxpen4, izxpen5, izypen6 c volatile izypen1, izxpen2, iyxpen3, iyxpen4, izxpen5, izypen6 c common /sweepc/ izypen1, izxpen2, iyxpen3, iyxpen4, izxpen5, izypen6 c Thread loop counters are now internal to the 'fetch' routines. real elapsedppm1,elapsedppm2 integer timeppm(32) integer*8 nflops(32) common /ppmtimer/ timeppm, nflops logical first save first data first/.true./ c ---------------- c write(0,*) "onesteps thread,nthreads,nstep ",mythread,nthreads,nstep if ( first ) then call setstack c call omp_set_dynamic(.false.) c call omp_set_nested(.false.) c call omp_set_num_threads(nthreads) c write(0,*) "set num threads to ",omp_get_num_threads() c ,omp_get_num_procs() ? first= .false. endif c reset loop counters c if ( mythread .eq. 1 ) then call fetchset( 1, 0 ) call fetchset( 2, 0 ) call fetchset( 3, 0 ) call fetchset( 4, 0 ) call fetchset( 5, 0 ) call fetchset( 6, 0 ) c izypen1= 0 c izxpen2= 0 c iyxpen3= 0 c iyxpen4= 0 c izxpen5= 0 c izypen6= 0 starttime= getMillisec() endif nzones= 0 1111 continue call ABARRIER(nthreads) c Check termination flag if ( goaway.ne.0 ) return c if ( mythread.eq.1 ) c write(0,*) mythread,' running update',itx,ity, izypen1,izxpen2, c & iyxpen3, iyxpen4, izxpen5, izypen6 c 'o?' are the origins implied by the tile's position globally, for c use in indexing the coord. array ox = (itx)-1 + (1-nbdyf) oy = (ity)-1 + (1-nbdyf) oz = (1-nbdyf) C**** C***** The 'first/last' zones are the zones to update in the sweep. C**** It is understood that +/- nbdy zones for the sweep direction, C**** (and +/- nbdy2 zones for the transverse direction) are valid input. c**** define(IMODD,<<(iand($1,1).ne.0)>>) if ( IMODD(nstep) ) then iz_first = 1-nbdyf + nbdy iz_last = npz+nbdyf - nbdy iy_first = 1-nbdyf + nbdy2 iy_last = npy+nbdyf - nbdy2 ix_first = 1-nbdyf + nbdy2 ix_last = npx+nbdyf - nbdy2 else ix_first = 1-nbdyf + nbdy ix_last = npx+nbdyf - nbdy iy_first = 1-nbdyf + nbdy2 iy_last = npy+nbdyf - nbdy2 iz_first = 1-nbdyf + nbdy2 iz_last = npz+nbdyf - nbdy2 endif nofx= ix_last - ix_first + 1 nofy= iy_last - iy_first + 1 nofz= iz_last - iz_first + 1 nzones= nzones + nofx*nofy*nofz courmxs(mythread) = 0.0 timeppm(mythread) = 0 nflops(mythread)= 0 if ( IMODD( nstep ) ) goto 2000 call xsweep( xxl(ox), ddd0, ddd1, gamma, dtime, & ix_first, ix_last, iy_first, iy_last, iz_first, iz_last, & nofx, nofy, nofz, 1, nthreads,nstep, mythread,0 ) c call ABARRIER(nthreads) c c Reserve needed boundaries for the next sweep c ix_first = ix_first + nbdy2 ix_last = ix_last - nbdy2 iy_first = iy_first + nbdy iy_last = iy_last - nbdy iz_first = iz_first + nbdy2 iz_last = iz_last - nbdy2 c nofx= ix_last - ix_first + 1 nofy= iy_last - iy_first + 1 nofz= iz_last - iz_first + 1 nzones= nzones + nofx*nofy*nofz c c NOTE ddd0 and ddd1 are exchanged. This pattern is repeated.. call ysweep( yyl(oy), ddd1, ddd0, gamma, dtime, & ix_first, ix_last, iy_first, iy_last, iz_first, iz_last, & nofx, nofy, nofz, 2, nthreads,nstep, mythread,0 ) c call ABARRIER(nthreads) ix_first = ix_first + nbdy2 ix_last = ix_last - nbdy2 iy_first = iy_first + nbdy2 iy_last = iy_last - nbdy2 iz_first = iz_first + nbdy iz_last = iz_last - nbdy nofx= ix_last - ix_first + 1 nofy= iy_last - iy_first + 1 nofz= iz_last - iz_first + 1 nzones= nzones + nofx*nofy*nofz call zsweep( zzl(oz), ddd0, ddd1, gamma, dtime, & ix_first, ix_last, iy_first, iy_last, iz_first, iz_last, & nofx, nofy, nofz, 3, nthreads,nstep,mythread,1 ) c call ABARRIER(nthreads) c ix_first = ix_first + nbdy2 c ix_last = ix_last - nbdy2 c iy_first = iy_first + nbdy2 c iy_last = iy_last - nbdy2 c iz_first = iz_first + nbdy c iz_last = iz_last - nbdy c c nofx= ix_last - ix_first + 1 c nofy= iy_last - iy_first + 1 c nofz= iz_last - iz_first + 1 c nzones= nzones + nofx*nofy*nofz goto 3000 c - odd timestep, use other sweeps order 2000 continue call zsweep( zzl(oz), ddd0, ddd1, gamma, dtime, & ix_first, ix_last, iy_first, iy_last, iz_first, iz_last, & nofx, nofy, nofz, 4, nthreads,nstep,mythread,0 ) c call ABARRIER(nthreads) ix_first = ix_first + nbdy2 ix_last = ix_last - nbdy2 iy_first = iy_first + nbdy iy_last = iy_last - nbdy iz_first = iz_first + nbdy2 iz_last = iz_last - nbdy2 nofx= ix_last - ix_first + 1 nofy= iy_last - iy_first + 1 nofz= iz_last - iz_first + 1 nzones= nzones + nofx*nofy*nofz call ysweep( yyl(oy), ddd1, ddd0, gamma, dtime, & ix_first, ix_last, iy_first, iy_last, iz_first, iz_last, & nofx, nofy, nofz, 5, nthreads,nstep,mythread,0 ) c call ABARRIER(nthreads) ix_first = ix_first + nbdy ix_last = ix_last - nbdy iy_first = iy_first + nbdy2 iy_last = iy_last - nbdy2 iz_first = iz_first + nbdy2 iz_last = iz_last - nbdy2 nofx= ix_last - ix_first + 1 nofy= iy_last - iy_first + 1 nofz= iz_last - iz_first + 1 nzones= nzones + nofx*nofy*nofz call xsweep( xxl(ox), ddd0, ddd1, gamma, dtime, & ix_first, ix_last, iy_first, iy_last, iz_first, iz_last, & nofx, nofy, nofz, 6, nthreads,nstep,mythread,1 ) c call ABARRIER(nthreads) 3000 continue if ( mythread.ne.1 ) goto 1111 do i = 2, nthreads courmxs(1)= max( courmxs(1), courmxs(i) ) c timeppm(1) = max( timeppm(1), timeppm(i) ) nflops(1) = nflops(1) + nflops(i) enddo stoptime= getMillisec() elapsed= (stoptime-starttime)/1000.0 elapsedppm1= timeppm(1)/1000.0 elapsedppm2= timeppm(2)/1000.0 if ( mythread.eq.1 ) then nid = getnodecount_thing() myid= getid_thing() c if ( myid.eq. nid/2 ) then flops= nflops(1) / (elapsed*1.0e+06) if ( ipadme.eq.1 ) then write(0,42) nstep, elapsed, elapsedppm1,elapsedppm2, & nzones/elapsed,flops endif 42 format(i6,f8.3,2f8.2,2x,1pe12.6,0p,f8.2, & " <=nstep, elapsed, ppm1, ppm2, zones/sec, Mflops") c 42 format("cm: elapsed time: ",f8.3," ppm only: ",2f8.2, c & " (sec), zones/sec=",1pe12.6,0p," Mflops=",f7.2) endif return end