c program main c call cmain c end c c -- tm.f -- c c Prototypical task manger. It has a simple context creation c phase phollowed by an update loop c subroutine tm( myid ) implicit none integer myid c include "iq.h" include "context.h" include "task.h" include "globals.h" include "thingftn.h" c tile & worker state arrays include "tminfo.h" c mpi numbering integer nodes c ==== tile state tracking for timestep loop; c logical imrestarting c ==== timestep control, statistics real courmx, dtimenext real*8 ek,eh,et, dvol c various scratch integer i, k, it, itx, ity,nper,iper,icm integer n,ierr, status(NWSTATUS_THING) integer mystep c ------------------------------------------- nodes = getNodeCount_thing() ncm= nodes-1 write(0,fmt="('================ Task manager (restart=',i1,') with',i3,' computers')") & restart, ncm write(6,fmt="('================ Task manager (restart=',i1,') with',i3,' computers')") & restart, ncm c Tile layout is regular: c ntx= nx/npx nty= ny/npy ntiles= ntx*nty nper= ntiles/ncm if ( nper.eq.0 ) nper= 1 write(0,fmt="('tm: =================ntxy=',2i3,', ntiles=',i3,', nper=',i2)") & ntx,nty,ntiles, nper imrestarting = (restart.ne.0) call opentrace call printtraceheader if ( imrestarting ) goto 100 c // initialization follows. c Once ALL contexts exist, the 'sync_task' causes a nil work done c message to be sent to the TM. The first task cycle will set up c the problem (init_task), as keyed by nstep<0 c --------------------------------------------------------- write(0,fmt="('tm: ================ creating contexts')") nstep= -ncycles time= 0 c Create all contexts, returning global energy sums. c call tm_create_contexts(nper) call writeGlobals(0) c call synchAllCreated_thing() numTilesDone= 0 numTilesAssigned= 0 c write(0,*) "After creation, tileInfo(State,:)= ", c & (tileInfo(STATE,it),it=1,ntiles) c write(0,*) ".. and workerState(:)=", (workerState(icm),icm=1,ncm) courmx= 0 ek= 0 eh= 0 goto 1000 c ------------------------------------------ c imrestarting: 100 continue c For restarting, I need something that creates the tileInfo() data c from the list of things. The TM will have a complete list as c soon as all CMs have checked in with pseudo task completion messages c call tm_checkin_contexts numTilesDone= 0 numTilesAssigned= 0 courmx= 0 ek= 0 eh= 0 c ------------------------------------------ 1000 continue c c update loop icm= waitForWorkDone_thing( task, ntask, FOREVER ) if ( icm .le. 0 ) then write(0,*) "tm: error return from waitforwork (?!)",icm goto 1000 endif goto 1020 c Collect work completion messages until there are no more 1015 continue icm= waitForWorkDone_thing( task, ntask, 60000 ) c If no one has finished work for 60seconds, something is wrong. if ( icm .le. 0 ) goto 1040 1020 continue c --- completed work ------------ numTilesDone = numTilesDone + 1 workerState(icm)= FREE itx= task(1) ity= task(2) c write(0,*) "tm: cm ",icm," sent work done, txy=", itx,ity write(0,1021) icm,itx,ity, ntiles-numTilesDone 1021 format("tm: cm ",i3," sent work done, txy=",2i6," work remaining:",i3) c write(0,*) "tm:",icm," says ek,eh,et=",dtask(4),dtask(5),dtask(4)+dtask(5) c // 0 tile number means this CM is completing its 'synch' if ( itx.eq.0 .or. ity.eq.0 ) then if ( numTilesDone .lt. ncm ) goto 1000 write(0,*) "tm: all cm namespaces synched-----------" numTilesDone= 0 c call list_thing() c write(0,*) "tm: ------------------------------------" goto 1030 endif it = itx + (ity-1)*ntx tileInfo(STATE,it)= DONE courmx= max( courmx, rtask(6) ) ek = ek + dtask(4) eh = eh + dtask(5) et = et + dtask(4) + dtask(5) if ( numTilesDone .eq. ntiles ) then c ---- complete update cycle ------------- c NOW computes a single timestep, not a pair. dvol= (xxl(2) - xxl(1)) * (yyl(2)-yyl(1)) * (zzl(2)-zzl(1)) c write(0,666) dvol,ek,eh,gamma ek= dvol * ek/2 eh= dvol * eh / (gamma-1.0) c write(0,666) dvol,ek,eh,gamma c 666 format('dvol,ek,eh,gamma=',4e14.6) c write(0,667) xxl(1),xxl(2),yyl(1),yyl(2),zzl(1),zzl(2) c 667 format('xxl(1),xxl(2),yyl(1),yyl(2),zzl(1),zzl(2)='/,6f10.6) nstep= nstep+ncycles c write "x-restart" file. Note to start (manually) the file "restart" had c better be the most current "x-restart". c i= 0 if ( mod(nstep/ncycles,2) .ne. 0 ) i= 1 call writeGlobals( i ) i= 0 if ( ncheckpoint.ge.0 .and. mod(nstep,ncheckpoint) .eq. 0 ) then c // write names of the form "2-restart" or "3-restart" which match the c // non-local object names. i= 2 if ( mod(nstep/ncycles,ncheckpoint) .ne. 0 ) i= 3 call writeGlobals(i) endif c nstep= nstep + ncycles call printtrace( nstep,time,dtime,courmx,ek,eh ) c // If we instructed the CMs to write a compressed dump, advance c the counters: if ( nstep.ge.0 .and. time .ge. tdump ) then c // all dumps are complete, create a signifier file 'doneXXXX' call cddone( homeDir, ndump ) tdump = tdump + dtdump ndump = ndump + 1 endif c Advance to next timestep if ( nstep .gt. 0 ) then time = time + ncycles*dtime c write(0,*) "dtime,safety,courmx=", dtime, safety, courmx c write(0,*) "dtmin,dtmax= ", dtmin, dtmax, nstep if ( (mod(nstep,2).eq.0 ) ) then c // adjust dtime for next cycle dtimenext= dtime * (safety / courmx) if ( dtimenext/dtime .gt. 1.1 ) dtimenext= 1.1*dtime dtime= dtimenext if ( dtime .gt. dtmax ) dtime= dtmax if ( dtime .lt. dtmin ) dtime= dtmin endif endif numTilesDone= 0 numTilesAssigned= 0 do i = 1, ntiles tileInfo(STATE,i)= READY enddo courmx= 0 ek= 0 eh= 0 et= 0 if ( nstep .ge. stop_step .or. time .ge. stop_time) then 6000 continue c // shut them down write(0,*) "tm: shutting down" task(3)= TERMN8_TASK do icm= 1, ncm ierr= assignWork_thing( icm, task, ntask ) enddo do icm= 1, ncm ierr= waitForWorkDone_thing(task,ntask, FOREVER ) enddo goto 2000 endif endif c goto 1015 1030 continue c ================================= c Worker has announced it has completed a task - c Make work assignments, 1st to the owning CM (no such concept for SFS) c if ( .not. SharedFileSystem .and. numTilesAssigned .lt. ntiles ) then c c // For all tiles needing update, if the owner is ready, dispatch it do it= 1, ntiles icm= tileInfo(HOME,it) if ( tileInfo(STATE,it).eq.READY .and. workerState(icm).eq.FREE ) then call assign_task( icm, it ) endif enddo endif c write(0,*) "numassigned=",numTilesAssigned c // if there are still matches to be made (nonlocal), do them if ( numTilesAssigned .lt. ntiles ) then c look in reverse order to scramble things do icm= ncm, 1, -1 if ( workerState(icm) .eq. FREE ) then c // find a tile ready to update - any ready one. it= 0 1050 continue it= it+1 if ( it.le.ntiles .and. tileInfo(STATE,it).ne.READY ) goto 1050 if ( it.le.ntiles ) then call assign_task( icm, it ) endif endif enddo endif goto 1000 1040 continue write(0,*) "??work completion timer expired with ",numTilesDone," tiles done", & numTilesAssigned," assigned" write(0,*) "dump of worker state: FREE=:",FREE do i = 1, ncm write(0,*) i,workerState(i) enddo write(0,*) "dump of tile states: RDY=",READY do i= 1, ntiles write(0,*) i,tileInfo(STATE,i) enddo goto 1030 c ======================= c Exit 2000 continue call closetrace call writeGlobals(1) return end c c ----------------------------------------------------------------- c Create contexts, synchronizing and asking for work at the end. subroutine tm_create_contexts( nper ) implicit none integer ntiles, nper, ntx, nty, ncm include 'iq.h' include 'task.h' include 'tminfo.h' include 'globals.h' include 'thingftn.h' integer iper,k,icm, n,itx,ity, ierr, it, i, jty c ------- c write(0,*) "tm: create contexts nper,ncm=", nper, ncm n= 0 iper= 0 1000 iper= iper + 1 k= 0 do icm= 1, ncm n= n+1 c n = (icm-1)*nper + iper if ( n .le. ntiles ) then ity= (n-1) / ntx + 1 itx= n - (ity-1) * ntx c // jty is redundancy direction jty = ity+1 if ( jty .gt.nty ) jty= 1 task(1)= itx task(2)= ity task(3)= CREATE_TASK task(4)= jty task(5)= nstep rtask(7)= time ierr= assignWork_thing( icm, task, ntask ) k = k+1 c write(0,*) "tm: ======================= k=",k c write(0,fmt="('tm: ====assign create',i2,' tile ',2i3,' to cm ',i3)") c & n, task(1),task(2),icm tileInfo(HOME,n)= icm tileInfo(STATE,n)= READY tileInfo(TX,n)= itx tileInfo(TY,n)= ity endif enddo c write(0,*) "TM: assigned ",k," create tasks" c --- caution: this loop fails with Intel's efc compiler with any optimization c turned on at all. Something about the loop bound 'k'. c // wait for tiles to be done do i= 1, k c write(0,*) "tm: waiting loop first i,k=", i,k icm= waitForWorkDone_thing( task, ntask, FOREVER ) c write(0,*) "tm: waiting loop i=", i,k if ( icm .lt. 0 ) stop "urk 3" c write(0,*) "tm: waiting loop i=", i,k write(0,fmt="('tm: cm ',i2,' CREATED TILE',2i2)") & icm, task(1),task(2) c write(0,*) "tm: waiting loop i=", i,k c it = task(1) + (task(2)-1)*ntx c tileInfo(STATE,it)= READY c write(0,*) "tm: waiting loop finally i=", i,k enddo iper= iper + 1 if ( n .lt. ntiles ) goto 1000 c call sleep(5000) c write(0,*) "TM: IMPOSSIBLY, received ",k," tile dones" c --------------------- task(1)= 0 task(2)= 0 task(3)= SYNC_TASK do icm = 1, ncm ierr= assignWork_thing( icm, task, ntask ) if ( ierr.lt.0 ) write(0,*) "tm: sync assign failed!" workerState(icm)= FREE enddo return end c c--------------------------------------------------------------------- c Wait for all tiles to appear in 'checkin' messages from cms. c When this is complete, send 'sync' tasks to all cms, which just cause c them to 'ask for work' c subroutine tm_checkin_contexts implicit none include 'iq.h' include 'task.h' include 'tminfo.h' include 'thingftn.h' integer icm, icheckedin, itx,ity, ihome integer it,ierr icheckedin= 0 write(0,*) "tm: waiting for context checkin, ntiles=", ntiles 1010 continue icm= waitForWorkDone_thing( task, ntask, FOREVER ) c --- completed work ------------ workerState(icm)= FREE itx= task(1) ity= task(2) ihome= task(4) write(0,*) "tm: got work done, txy=", itx,ity, ihome c // This is a checkin from restart. write(0,"('tm: tile ',i2,'/',i2,' x:',i2,' y:',i2,' checked in by worker ',i3)") & icheckedin+1, ntiles, itx,ity, icm it = itx + (ity-1)*ntx tileInfo(HOME,it)= ihome tileInfo(STATE,it)= READY tileInfo(TX,it)= itx tileInfo(TY,it)= ity c // when all tiles have been checked in, send a 'ping' to all c // workers instructing them to go ahead and ask for work icheckedin= icheckedin + 1 if ( icheckedin .lt. ntiles ) goto 1010 write(0,*) "tm: checkin complete" task(3)= SYNC_TASK do icm= 1, ncm ierr= assignWork_thing( icm, task, ntask ) enddo return end c c ------------------------------------------------------------------- c Assign update/init task to CM number icm c subroutine assign_task( icm, it ) implicit none integer icm, it include 'iq.h' include 'tminfo.h' include 'task.h' include 'globals.h' include 'thingftn.h' integer ierr, itx, ity c ----- ity= (it-1)/ntx itx= it - ity*ntx ity= ity+1 task(1)= itx task(2)= ity task(3)= UPDATE_TASK if ( nstep .lt. 0 ) task(3)= INIT_TASK task(4)= -1 if ( time .ge. tdump ) task(4)= ndump task(5)= nstep rtask(6)= dtime rtask(7)= time write(0,fmt="('tm: assigning task ',i3,' type ',i2,', txy=',2i3,' to cm ',i2)") & numTilesAssigned+1, task(3), task(1), task(2), icm ierr= assignWork_thing( icm, task, ntask ) if ( ierr .lt. 0 ) write(0,*) "tm: error return from assignment to ",icm numTilesAssigned= numTilesAssigned + 1 workerState(icm)= it c // that is, ASSIGNED tileInfo(STATE,it)= it return end c c ------------------------------------------------------------------- c subroutine printtraceheader character*100 line c open(unit=traceunit,name='trace', form='FORMATTED',position='APPEND') write(line,1700) call writetrace(line) write(line,1701) call writetrace(line) write(6,1700) write(6,1701) return 1700 format("nstep",5x,"Time",4x,"Dtime",3x,"Cour", & 7x,"Kinetic",10x,"Heat",9x,"Tot E") 1701 format( & "_____ ________ ________ ______", & " _____________ _____________ _____________") end c c ------------------------------------------------------------------- subroutine printtrace( nstep,time,dtime,courmx,ek,eh ) implicit none integer nstep real time,dtime,courmx real*8 et, ek,eh character*100 line c open(unit=traceunit,name='trace', form='FORMATTED',position='APPEND') et= ek+eh write(line,1710) nstep,time,dtime,courmx,ek,eh,et call writetrace(line) write(6,1710) nstep,time,dtime,courmx,ek,eh,et return 1710 format(i5, f9.4, f9.6, f7.4, 3f14.9 ) end c c ---------------------------------------------------------------- c subroutine stuffinCommon( c_glob, c_runName, c_rootDir, c_globalsName, c_homeDir, & c_xxl, c_yyl, c_zzl ) implicit none include 'iq.h' include 'globals.h' integer c_glob(22), glob(22) character*20 c_runName character*256 c_rootDir, c_globalsName, c_homeDir real*8 c_xxl(1-nbdyf:nx+nbdyf+1), c_yyl(1-nbdyf:ny+nbdyf+1), c_zzl(1-nbdyf:nz+nbdyf+1) integer i equivalence (glob(1),restart) do i= 1, 22 glob(i)= c_glob(i) enddo runName= c_runName rootDir= c_rootDir globalsName= c_globalsName homeDir= c_homeDir do i= 1-nbdyf, nx+nbdyf+1 xxl(i)= c_xxl(i) enddo do i= 1-nbdyf, ny+nbdyf+1 yyl(i)= c_yyl(i) enddo do i= 1-nbdyf, nz+nbdyf+1 zzl(i)= c_zzl(i) enddo return end c c----------------------------------------------------------------------------------- c Has to be in fortran to get /globals/ common block c Writes "prefix"-restart file, where prefix is assumed to be the digit 0,1,2 or 3 c subroutine writeGlobals( prefix ) implicit none integer prefix include 'iq.h' include 'globals.h' integer binopen integer iunit, n, nwritten character*256 name nwritten= 0 iunit= 66 name= char(ichar('0') + prefix)//'-'//globalsName iunit= binopen( name, len(name), iunit ) n = 23*4 + 20 + 3*256 call binwrite( iunit, restart, n ) nwritten= nwritten + n n = (2*nbdyf+MX_AXIS+1)*8 * 3 + MAXCM*16 call binwrite( iunit, xxl, n ) nwritten= nwritten + n call binclose(iunit) c n= 0 c 10 continue c n = n + 1 c if ( n+1.gt.len(globalsName)+2 ) goto 20 c if ( globalsName(n+1:n+1) .ne. char(0) ) goto 10 c 20 continue c c write(0,1) nwritten,name(1:n) c 1 format("tm: wrote",i6," bytes checkpoint globals ",a) return end