c -- cm.f -- c c Rudimentary prototypical compute manager. It has a simple context creation c phase phollowed by an update loop c subroutine cm( myid ) implicit none integer myid c include 'iq.h' include 'context.h' include 'task.h' include 'globals.h' include 'thingftn.h' integer workType c This is the node 'asked' for work. -1 indicates no message will be sent, c we will just wait for an assignment. integer tm integer maxsize(9) integer ierr, i c 'goaway' is all this node needs to tell the worker (update_tile has more c to say). integer goaway common /conestep/ goaway external onestep save threadcounter integer threadcounter(MAXTHREADS) c -------------------------------- -------------------------------- c write(0,*) "cm: ================ Compute manager started", myid c write(6,*) "cm: ================ Compute manager started", myid c write(0,*) "cm: rootDir=", rootDir tm= 0 goaway= 0 c Spawn helper compute thread if required. (not needed for omp version) if ( nthreads .gt. 1 ) then do i= 2, nthreads threadcounter(i)= i call create( threadcounter(i), onestep, ierr ) c write(0,*) "cm: !!!!!!!!! spawned helper thread", i, ierr enddo endif if ( restart .ne. 0 ) call checkin(myid) 1000 continue ierr= getWork_thing( tm, task, ntask, FOREVER ) workType= task(3) c write(0,fmt="('cm:',i3,' received work type ',i2)") Myid, workType if ( ierr.lt.0 ) goto 1000 goto (1001,1002,1003,1004,1005,1006, 1007,1666), workType 1001 continue c // WAKEUP_TASK - the only thing to do is reply done task(1)= 0 task(2)= 0 ierr= workDone_thing( 0, task, ntask ) tm= 0 goto 1000 1002 continue c // SLEEP_TASK - no reply, just wait again WITHOUT sending a c work request message to the tm. tm= -1 goto 1000 1003 continue c // UPDATE_TASK ipadme= myid call update_tile(myid) ierr= workDone_thing( 0, task, ntask ) c write(0,*) "cm: ",myid," updated task ",task(1),task(2), ierr goto 1000 1004 continue c // INIT_TASK - fill things with physics call initialize_tile(myid) c write(0,*) "cm:",myid," initialized task ",task(1),task(2), ierr ierr= workDone_thing( 0, task, ntask ) goto 1000 1005 continue c // SYNC_TASK - after the init_task, to ensure all nodes know about all things. c Enters 'sleep' mode afterwards c call synchAllCreated_thing() task(1)= 0 task(2)= 0 ierr= workDone_thing( 0, task, ntask ) c write(0,*) "cm:",myid," syncd task ",task(1),task(2), ierr goto 1000 1006 continue c // CREATE_TASK c Make list of expected maximum size of each object (in reals): maxsize(1)= (npx*npy*nvar) * (nz+2*nbdyf) maxsize(2)= (npx*nbdyf*nvar) * (nz+2*nbdyf) maxsize(3)= maxsize(2) maxsize(4)= (npy*nbdyf*nvar) * (nz+2*nbdyf) maxsize(5)= maxsize(4) maxsize(6)= (nbdyf*nbdyf*nvar) * (nz+2*nbdyf) maxsize(7)= maxsize(6) maxsize(8)= maxsize(7) maxsize(9)= maxsize(8) c write(0,*) "cm:",Myid," creating tile ",task(1), task(2) call create_tile( task, maxsize ) ierr= workDone_thing( 0, task, ntask ) goto 1000 1007 continue c // PING - just ask for work task(1)= 0 task(2)= 0 ierr= workDone_thing( 0, task, ntask ) write(0,*) "cm",myid," pinged task ",task(1),task(2), ierr goto 1000 1666 continue c // TERMN8_TASK - this, I can do. c c Kill compute threads goaway= 1 if ( nthreads .gt. 1 ) call ABARRIER(nthreads) ierr= workDone_thing( 0, task, ntask ) return end c c c If restarting, we examine the list of things for a complete set of c tile records, and having found same, inform the TM of its presence. c Incomplete tile records should generate an error message. c c A complete tile record consists of 9 files named: c d-txty-(ddd,bxl,bxr,byb,ybt,bxlyb,bxlyt,bxryt,bxryb) c c where 'd' is 0 if nstep/ncycles is even and 1 otherwise c txty is the tile location in the 2d decomposition, (1..,1..) c subroutine checkin(myid) implicit none c integer myid include 'iq.h' include 'context.h' include 'task.h' include 'globals.h' include 'thingftn.h' integer ntx, nty, ntiles, homenode integer iset, ierr, i, ity,itx character*256 name character*1 set c ------- c write(0,*) "cm: checkin ",myid c if ( SharedFileSystem .and. myid.ne.1 ) return c Only CM 1 does the checkin, as it wil have all the names courtesy c of start_thing c if ( myid .ne.1 ) return ntx= (nx+npx-1)/npx nty= (ny+npy-1)/npy ntiles= ntx*nty iset= mod( nstep/ncycles, 2 ) set= '1' if ( iset.eq. 0 ) set= '0' c Call thing_info(), which returns things one at a time. The name is c compared with the checklist, and when all 9 files are in a checkin message c is sent to the TM. c c Actually, for now just look for files of the form 'iset-txty-ddd' eg c "0-11-ddd". If the required boundary files aren't there, we'll hear about it c pretty soon. c c call list_thing() i= 0 1000 continue i= i+1 ierr= getnameof_thing( i, name, homenode ) if ( ierr .gt. 0 ) then if ( ierr.ne.10 ) goto 1000 if ( name(1:1).ne.set .or. name(8:10) .ne. 'ddd' ) goto 1000 itx= ichar(name(3:3)) - ichar('0') itx= itx*10 + (ichar(name(4:4)) - ichar('0')) ity= ichar(name(5:5)) - ichar('0') ity= ity*10 + (ichar(name(6:6)) - ichar('0')) c write(0,*) "cm",myid," checked in thing: ", name(1:ierr), itx,ity task(1)= itx task(2)= ity task(4)= homenode ierr= workDone_thing( 0, task, ntask ) goto 1000 endif return end c c ---------------------------------------------------------------- c Initialize the problem state variables. c c Argument(s) are work vector cast in various types: c c (1) IN: tile x c (2) IN: tile y c (3) IN: tasktype - UPDATE_TASK, here c (4) IN: write cdump if <> 0 c (5) IN: nstep c (6) IN: dtime (real), OUT: courmx (real) c (7-8) OUT: dtask(4) sumkntc (double) c (9-10) OUT: dtask(5) sumprs (double) c subroutine initialize_tile(myid) implicit none integer myid include 'thingftn.h' include 'iq.h' include 'context.h' include 'globals.h' include 'task.h' integer k, iz, iz0,iz1,zupdate logical first data first/.true./ real*8 sumkntc,sumprs, sumkntc2,sumprs2 real dmin(nvar), dmax(nvar) integer idump integer itx,ity,itz, tileOffset(3), pen0(3), pen1(3) integer status_back(NWSTATUS_THING,27) integer writeBack, nwrites external writeBack common /fubar1/ status_back c ---- c write(0,*) "CM initialize_tile: ", task if ( first ) then first= .false. call setupturbglobals c write(0,*) "turb globals setup" endif c // consider 'pencils' to be whole z-planes do k= 1, nvar dmin(k)= 1.0e+06 dmax(k)= -dmin(k) enddo nwrites= 0 sumkntc= 0 sumkntc2= 0 sumprs= 0 sumprs2= 0 itx= task(1) ity= task(2) itz= 1 idump = task(4) nstep= task(5)+ncycles tileOffset(1)= npx * (itx-1) tileOffset(2)= npy * (ity-1) pen0(1)= 1 pen0(2)= 1 pen1(1)= npx pen1(2)= npy c Since the tower may consist of many ddd sized contexts, each npz high: zupdate= 1 1000 continue iz0= zupdate iz1= zupdate + npz - 1 if ( iz1 .gt. nz ) iz1= nz tileOffset(3)= zupdate do iz= 1, iz1-iz0+1 c write (0,*) iz pen0(3)= iz pen1(3)= iz call setupturbpencil( pen0,pen1, tileOffset, sumkntc2,sumprs2, & dmin, dmax) c call loadturbpencil( pen0,pen1, tileOffset, sumkntc2,sumprs2, c & dmin, dmax) enddo c write(0,*) "setupturb sumkntc, sumprs= ",sumkntc2, sumprs2 c tileOffset: Global coords for this tile c pen0,pen1 Limits in tile coords for this pencil c sumkntc,prs returned energy sums (accumulated w/input values) c dmin,dmax variable extrema returned (composite w/input) c c subroutine setupturbpencil( pen0,pen1, c & tileOffset, sumkntc,sumprs,dmin,dmax ) c // ddd0() is now set up. Write it to the '0' set c ccc call waitOn( status_back, nwrites ) c // write back, extracting to ccc1 and boundaries, c // writing set 0 given zrange nwrites= writeBack( ddd0, 0, 0, itx, ity, iz0, iz1, status_back ) c now returned from setup (or hydrostep) sumkntc= sumkntc2 sumprs= sumprs2 c write(0,*) "... about to cdump" if ( idump .ge. 0 ) call cdump( ddd0, idump, itx,ity,itz ) c write(0,*) "$$$$$$$$$$$cm ",myid," waiting for writeback", nwrites call waitOn( myid, status_back, nwrites ) c write(0,*) "$$$$$$$$$$$cm ",myid," finished writeback" zupdate= zupdate + npz itz = itz + 1 if ( zupdate .le. nz ) goto 1000 task(6)= 0 dtask(4)= sumkntc dtask(5)= sumprs c write(0,*) "**** Done with initialize_tile ", task(1), task(2) c write(0,*) "writeBack sums: ", sumkntc, sumprs c write(0,*) "setup sums: ", sumkntc2, sumprs2 return end c c c ---------------------------------------------------------------- c Argument is work assignment vector (see task.h) c c (1) IN: tile x c (2) IN: tile y c (3) IN: tasktype - UPDATE_TASK, here c (4) IN: write cdump if <> 0 c (5) IN: nstep c (6) IN: dtime (real), OUT: courmx (real) c (7-8) OUT: dtask(4) sumkntc (double) c (9-10) OUT: dtask(5) sumprs (double) c subroutine update_tile(myid) implicit none integer myid include 'thingftn.h' include 'iq.h' include 'context.h' include 'globals.h' include 'task.h' integer idump integer i,j, mythread integer ix0,iy0,iz0, itx, ity, itz integer goaway real courmxs(MAXTHREADS) common /conestep/ goaway,itx,ity, courmxs integer z0,z1, inset, outset, outsetcheck integer zprefetch, zupdate, zwrite, zfirst,zlast integer status_prefetch(NWSTATUS_THING,9) integer status_writeback(NWSTATUS_THING,54) integer writeBack, nwrites external writeBack real courmx real*8 sumkntc(32), sumprs(32) common /stats/ sumkntc, sumprs real rho(MX_AXIS), prs(MX_AXIS), uux(MX_AXIS), uuy(MX_AXIS), uuz(MX_AXIS), diag(MX_AXIS) integer ix,iy,iz c ---- itx= task(1) ity= task(2) idump = task(4) c // nstep is the CURRENT time step (copied for onestep) nstep= task(5) itz= 1 dtime= rtask(6) time= rtask(7) courmx= 0 do i= 1, nthreads courmxs(i)= 0 sumkntc(i)= 0 sumprs(i)= 0 enddo c The plan to update a 'tower' of blocks specified by the 2-D c domain decomposition suggests pipelining the updates, overlapping the c prefetch, update and writeback of each segment. c c This is especially appropriate since the largest problem sizes considered c result in towerss far larger than the available processor memory. c c So, calling the chunks z1, z2... the pipeline looks like this: c c prefetch(z1) c c update(z1) prefetch(z2) c c writeBack (z1) update(z2) prefetch(z3) c c writeBack (z2) update(z3) c c writeBack (z3) c zfirst= 1 zlast= nz c We read from the of interior and boundary zones, where inset is c zero or one. The is, of course, 1-inset, and the two flip on each c timestep pair. inset= 0 if ( mod(nstep/ncycles,2) .ne. 0 ) inset= 1 outset= 1-inset c During writeback, we also may write a non-local checkpoint set outsetcheck= 0 if ( ncheckpoint.ge.0 .and. mod(nstep,ncheckpoint) .eq. 0 ) then outsetcheck= 2 if ( mod(nstep,2*ncheckpoint) .eq. 0 ) outsetcheck= 3 endif zprefetch = zfirst zupdate = zprefetch - npz zwrite = zupdate - npz 1000 continue c write(0,*) "cm update_tile: inset, zpre,zupd,zwrite=", inset, zprefetch, zupdate,zwrite c // while ( zwrite .le. zlast ) if ( zupdate.ge.zfirst .and. zupdate .le.zlast ) then call copyIn( myid, zupdate, min(zlast,zupdate+npz-1), status_prefetch ) endif if ( zprefetch.ge.zfirst .and. zprefetch.le.zlast ) then z0= zprefetch z1= min(zlast, z0-1+npz ) call prefetch( inset, itx,ity, z0,z1, status_prefetch ) endif if ( zwrite.ge.zfirst .and. zwrite.le.zlast ) then c write(0,*) "&&&&&&&cm ",myid," waiting for writeback" call waitOn( myid, status_writeback, nwrites ) c write(0,*) "&&&&&&&cm ",myid," finished writeback" endif if ( zupdate.ge.zfirst .and. zupdate.le.zlast ) then z0= zupdate z1= min(zlast,zupdate+npz-1) c // portion of global coordinates for this tile ix0= ((itx-1) * npx ) - nbdyf iy0= ((ity-1) * npy ) - nbdyf iz0= z0 - nbdyf c ========= c Scan all of ddd0() looking for zero density d do iz= 1-nbdyf, npz+nbdyf d do iy= 1-nbdyf, npy+nbdyf d do ix= 1-nbdyf, npx+nbdyf d if ( ddd0(1,ix,iy,iz) .eq. 0 ) then d write(0,*) "0000000000000000000000000000000 at",ix,iy,iz d write(0,2000) "i","xl", "rho", "p","uux","uuy","uuz" d 2000 format(a5,1x,6a12) d d do i= 1-nbdyf, nx+nbdyf d write(0,2010) i,xxl(i),(ddd0(j,i,iy,iz),j=1,5) d enddo d 2010 format(i5,1x,6e12.5) d stop d endif d enddo d enddo d enddo c ========= c no more 1-d strips! c if ( ity.eq.1 .and. itz.eq.1 ) then c do i= 1, npx c rho(i)= ddd0(1,i,1,1) c prs(i)= ddd0(2,i,1,1) c uux(i)= ddd0(3,i,1,1) c uuy(i)= ddd0(4,i,1,1) c uuz(i)= ddd0(5,i,1,1) c diag(i)= ddd0(6,i,1,1) c enddo c call stripout( 'x',npx, itx,ity,itz, nstep, time, rho,prs,uux,uuy,uuz,diag ) c endif c c if ( itx.eq.1 .and. itz.eq.1 ) then c do i= 1, npy c rho(i)= ddd0(1,1,i,1) c prs(i)= ddd0(2,1,i,1) c uux(i)= ddd0(3,1,i,1) c uuy(i)= ddd0(4,1,i,1) c uuz(i)= ddd0(5,1,i,1) c diag(i)= ddd0(6,1,i,1) c enddo c call stripout( 'y',npy, itx,ity,itz, nstep, time, rho,prs,uux,uuy,uuz,diag ) c endif c c if ( itx.eq.1 .and. ity.eq.1 ) then c do i= 1, npz c rho(i)= ddd0(1,1,1,i) c prs(i)= ddd0(2,1,1,i) c uux(i)= ddd0(3,1,1,i) c uuy(i)= ddd0(4,1,1,i) c uuz(i)= ddd0(5,1,1,i) c diag(i)= ddd0(6,1,1,i) c enddo c call stripout( 'z',npz, itx,ity,itz, nstep, time, rho,prs,uux,uuy,uuz,diag ) c endif c write(0,*) "doing onestep", itx,ity, itz, nstep mythread= 1 call onestep(mythread) cc // These parameters now passed in /do2steps/ to make ||ism easier (spawning) cc call twostep( mythread ) courmx= max( courmx, courmxs(1) ) if ( idump .ge. 0 ) then c // for the cd, use the step number we just computed with twosteps nstep= task(5)+ncycles call cdump( ddd1, idump, itx,ity,itz) endif nwrites= writeBack( ddd1, outset, outsetcheck, itx,ity, z0, z1, status_writeback ) itz = itz + 1 endif c // if ( zupdate.. ) zprefetch = zprefetch + npz zupdate = zupdate + npz zwrite = zwrite + npz if ( zwrite .le. nz ) goto 1000 c // At this point there is nothing left to do, even pipeline cleanup. c -- say we found courno max rtask(6)= courmx c -- and these energy totals do i = 2, nthreads sumkntc(1)= sumkntc(1) + sumkntc(i) sumprs (1)= sumprs (1) + sumprs (i) enddo dtask(4)= sumkntc(1) dtask(5)= sumprs(1) c write(0,*) "CM: tile",itx,ity," returning sums ", sumkntc(1), sumprs(1), c & courmx return end c ---------------------------------------------------------------- c Wait until writeBack (of previous iteration) is complete subroutine waitOn( myid, status, nwrites ) implicit none integer myid include 'thingftn.h' integer status(NWSTATUS_THING,27) integer i, ierr, nwrites integer nbytes, nbytesRIO, thisbytes, thisremote real elapsed, elapsedRIO, thistime if ( nwrites .eq. 0 ) return elapsed= 0 elapsedRIO= 0 nbytes= 0 nbytesRIO= 0 do i= 1, nwrites ierr= wait_thing( status(1,i), FOREVER ) call getIOstatistics_thing( status(1,i), thistime, thisbytes, thisremote ) if ( thisremote .ne. 0 ) then nbytesRIO = nbytesRIO + thisbytes elapsedRIO= elapsedRIO + thistime else nbytes = nbytes + thisbytes elapsed= elapsed + thistime endif enddo if ( nbytes.eq.0 ) elapsed= 1 if ( nbytesRIO.eq.0 ) elapsedRIO= 1 if ( myid.eq.1 ) & write(0,*) "cm",myid," writeback local ",nbytes, nbytes/elapsed/1.0e6, & " remote ",nbytesRIO, nbytesRIO/elapsedRIO/1.0e6 return end c c ---------------------------------------------------------------- c Start reads for all boundary info & interior c needed to bdrysin and ccc0. z0 and z1 ranges will be extended c to cover needed z fake zones. c subroutine prefetch( iset,itx,ity, iz0, iz1, status ) implicit none include 'thingftn.h' include 'iq.h' include 'bdrysin.h' integer iset, itx, ity integer iz0, iz1, status(NWSTATUS_THING,9) integer zoff, z0,z1, nreals, nofz c This is a series of calls to read_stuff, which is in C for ease c in composing the thing names. c z0= iz0-nbdyf z1= iz1+nbdyf nofz = z1-z0+1 c write(0,*) "entering prefetch, z0/1 iset=",z0,z1, iset c Figure the offset in the files (reals) and read count (reals) for c each boundary element. We use reals instead of bytes for some 2^32 c headroom. zoff = npx*npy*nvar * (z0-(1-nbdyf)) nreals= npx*npy*nvar * nofz c write(0,*) "read_stuff zoff,nreals=", zoff, nreals call read_stuff( 1,itx,ity,iset, zoff,nreals, ccc0, status(1,1) ) zoff = nbdyf*npy*nvar * (z0-(1-nbdyf)) nreals= nbdyf*npy*nvar * nofz call read_stuff( 2,itx,ity,iset, zoff,nreals, xl, status(1,2) ) call read_stuff( 3,itx,ity,iset, zoff,nreals, xr, status(1,3) ) zoff = npx*nbdyf*nvar * (z0-(1-nbdyf)) nreals= npx*nbdyf*nvar * nofz call read_stuff( 4,itx,ity,iset, zoff,nreals, yb, status(1,4) ) call read_stuff( 5,itx,ity,iset, zoff,nreals, yt, status(1,5) ) zoff = nbdyf*nbdyf*nvar * (z0-(1-nbdyf)) nreals= nbdyf*nbdyf*nvar * nofz call read_stuff( 6,itx,ity,iset, zoff,nreals, xlyb, status(1,6) ) call read_stuff( 7,itx,ity,iset, zoff,nreals, xlyt, status(1,7) ) call read_stuff( 8,itx,ity,iset, zoff,nreals, xryt, status(1,8) ) call read_stuff( 9,itx,ity,iset, zoff,nreals, xryb, status(1,9) ) return end c stub for sppm's use (breaking optimizer loop combiner flow) subroutine dummy return end