These timings suggest that parallel FFTs should be implemented using hybrid MPI-OpenMP code, and the large DGEMMs should use GPUs. This is somewhat justified, since the cost of parallel FFTs is mostly due to data movement, i.e. FFTs are memory bound rather than computationally bound. However, we need to test the competitiveness of pipelining FFT data to GPUs, and using Stockham FFT kernels (https://github.com/ebylaska/PWDFT/tree/master/Miscellaneous/programfft), versus an MPI-only algorithm.
cmake -Bbuild_cuda -DNWPW_CUDA=ON ./Nwpw -DCMAKE_CUDA_ARCHITECTURES=80
cd build_cuda
make -j
4. Job submission script via `qsub polaris_submit.pbs`. Please note that it needs an additional bash script to bind the GPU IDs to MPI ranks.
The script below is the MPI-ranks with GPUs affinity script:
#!/bin/bash
num_gpus=4
# need to assign GPUs in reverse order due to topology
PWDFT
Plane-Wave Density Functional Theory development for NWChemEx
Web location:
🔗 https://ebylaska.github.io/PWDFT/
Autonomous Plane-Wave DFT Engine for Exascale Materials Discovery
🧰 CMake — Generate a Project Buildsystem
Standard cmake build commands
Timings
laptop timings
These timings suggest that parallel FFTs should be implemented using hybrid MPI-OpenMP code, and the large DGEMMs should use GPUs. This is somewhat justified, since the cost of parallel FFTs is mostly due to data movement, i.e. FFTs are memory bound rather than computationally bound. However, we need to test the competitiveness of pipelining FFT data to GPUs, and using Stockham FFT kernels (https://github.com/ebylaska/PWDFT/tree/master/Miscellaneous/programfft), versus an MPI-only algorithm.
Compiling and Running Instructions
Build instructions on ALCF Aurora/Sunspot
Required Modules
Getting the code and building instructions
Running
Instructions for OLCF Frontier
Toggle for Details
sbatch job_submit.sbatch

#!/bin/bash
#SBATCH -A
#SBATCH -J
#SBATCH -o %x-%j.out
#SBATCH -t 00:15:00
#SBATCH -N 1
#SBATCH -C nvme
#SBATCH --mail-user=
#SBATCH --mail-type=END
module load amd-mixed
module list
export MPICH_GPU_SUPPORT_ENABLED=0
export OMP_NUM_THREADS=1
export CRAYPE_LINK_TYPE=dynamic
date
NNODES=1
NRANKS_PER_NODE=8
NTOTRANKS=$(( NNODES * NRANKS_PER_NODE ))
PWDFT_EXE= PWDFT_INPUT=
srun -N ${NNODES} -n ${NTOTRANKS} -c 1 --ntasks-per-gpu=1 --gpus-per-node=8 --gpu-bind=closest ${PWDFT_EXE} ${PWDFT_INPUT}
cmake -H. -Bbuild_cuda -DNWPW_CUDA=ON ./Nwpw -DCMAKE_CUDA_ARCHITECTURES=80 -DCUDA_cublas_LIBRARY=${CRAY_CUDATOOLKIT_DIR}/../../math_libs/${CRAY_CUDATOOLKIT_VERSION#*_}/lib64/libcublas.so -DCUDA_cufft_LIBRARY=${CRAY_CUDATOOLKIT_DIR}/../../math_libs/${CRAY_CUDATOOLKIT_VERSION#*_}/lib64/libcufft.so -DCUDA_cusolver_LIBRARY=${CRAY_CUDATOOLKIT_DIR}/../../math_libs/${CRAY_CUDATOOLKIT_VERSION#*_}/lib64/libcusolver.so
cd build_cuda
make
module purge
module load cmake/3.20.2
module load PrgEnv-intel
module load craype-haswell
module load openmpi
mkdir build
cd build
cmake -DCMAKE_CXX_COMPILER=CC ../Nwpw/
salloc --nodes 1 --qos interactive --time 01:00:00 --constraint haswell
srun -n -c a.out
srun -n -c pwdft
salloc --nodes 1 --qos interactive --time 01:00:00 --constraint haswell
cd PWDFT/QA/C2_steepest_descent
srun -n 24 ../../build/pwdft c2-sd.nw
ssh polaris.alcf.anl.gov
module load PrgEnv-gnu cudatoolkit-standalone cmake
cmake -Bbuild_cuda -DNWPW_CUDA=ON ./Nwpw -DCMAKE_CUDA_ARCHITECTURES=80
cd build_cuda
make -j
#!/bin/bash
num_gpus=4
# need to assign GPUs in reverse order due to topology
See Polaris Device Affinity Information https://www.alcf.anl.gov/support/user-guides/polaris/hardware-overview/machine-overview/index.html
gpu=$(( ${num_gpus} - 1 - ${PMI_LOCAL_RANK} % ${num_gpus} ))
unset CUDA_VISIBLE_DEVICES
if [ ${PMI_LOCAL_RANK} -ne 4 ]; then
  export CUDA_VISIBLE_DEVICES=${gpu}
fi
#echo "RANK=${PMI_RANK} LOCAL_RANK=${PMI_LOCAL_RANK} gpu=${CUDA_VISIBLE_DEVICES}"
exec "$@"
#!/bin/bash
#PBS -N develop
#PBS -l select=8:system=polaris
#PBS -l place=scatter
#PBS -l walltime=01:00:00
#PBS -l filesystems=home:eagle
#PBS -A
#PBS -q workq
module load PrgEnv-gnu cudatoolkit-standalone
module list
export MPICH_GPU_SUPPORT_ENABLED=0
export CRAYPE_LINK_TYPE=dynamic
env
nvidia-smi topo -m
cd ${PBS_O_WORKDIR}
NNODES=`wc -l < $PBS_NODEFILE`
NRANKS_PER_NODE=4
NTHREADS=1
NTOTRANKS=$(( NNODES * NRANKS_PER_NODE ))
echo "NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS_PER_NODE} THREADS_PER_RANK=${NTHREADS}"
PWDFT_EXE= PWDFT_INPUT=
mpiexec -n ${NTOTRANKS} --ppn ${NRANKS_PER_NODE} --mem-bind list:0:1:2:3 --cpu-bind list:0-7:8-15:16-23:24-31 --env OMP_NUM_THREADS=${NTHREADS} ./gpu_bind_affinity.sh ${PWDFT_EXE} ${PWDFT_INPUT}
module unload impi
module load PrgEnv-intel
module load cmake
module load cudatoolkit
mkdir build_cuda
cd build_cuda
cmake -DNWPW_CUDA=ON ../Nwpw/
module load cgpu
salloc -C gpu -t 60 -c 10 -G 1 -q interactive -A
salloc -C gpu -t 60 -c 10 -G 1 -q interactive -A mp119
cmake ../Nwpw -DMAKE_LIBRARY=true
cmake ../Nwpw -DMAKE_LIBRARY=true -DCMAKE_POSITION_INDEPENDENT_CODE=ON
make
prompt% ls CMakeCache.txt Makefile cmake_install.cmake nwpwlib/ CMakeFiles/ NwpwConfig.h libpwdft.dylib* pspw/
#include <string>
#include "mpi.h"
namespace pwdft { using namespace pwdft;
extern char *util_date(); extern void seconds(double *); extern int cpsd(MPI_Comm, std::string&); extern int cpmd(MPI_Comm, std::string&); extern int pspw_minimizer(MPI_Comm, std::string&); extern int pspw_geovib(MPI_Comm, std::string&); }
ierr += pwdft::pspw_geovib(MPI_COMM_WORLD,nwinput);
mpic++ test.cpp ../build-shared/libpwdft.dylib
setenv DYLD_LIBRARY_PATH /Users/bylaska/Codes/PWDFT/build-shared
a.out
mpic++ test.cpp ../build-shared/libpwdft.so
setenv LD_LIBRARY_PATH /Users/bylaska/Codes/PWDFT/build-shared
a.out