Commit 18e8000d authored by Stefano de Gironcoli's avatar Stefano de Gironcoli
Browse files

Basic libraries updated to include some GPU stuff

Some unit-test stuff also added to UtilXlib
install/makedeps script updated
parent 532792e1
Pipeline #3729 failed with stages
in 14 seconds
......@@ -49,12 +49,4 @@ TEST0: test0.o libqefft.a
clean :
- /bin/rm -f *.o *.a *.d *.i *~ *_tmp.f90 *.mod *.L
# .PHONY forces execution of a rule irrespective of the presence of an
# updated file with the same name of the rule. In this way, the script
# that generates version.f90 always runs, updating the version if you
# execute "svn update". The update_version script takes care of not
# changing the file if the svn version did not change
.PHONY: all clean
include make.depend
# FFTXlib
Implements real space grid parallelization of FFT and task groups.
## Testing and Benchmarking
This library also provides a testing and timing code to assess the performance of your FFT, estimate the
scalability and the optimal parameters for your simulation.
To compile the test program, once you have properly configured QE within a parallel environment,
go inside the directory FFTXlib and type:
make TEST
Then you can run your FFT tests using command like:
mpirun -np 4 ./fft_test.x -ecutwfc 80 -alat 20 -nbnd 128 -ntg 4
Command line arguments:
-ecutwfc Plane wave energy cut off
-alat Lattice parameter (for hard coded lattice structure)
-nbnd Number of bands (fft cycles)
-ntg Number of task groups
-av1 x y z First lattice vector, in atomic units. N.B.: when using -av1, -alat is ignored!
-av2 x y z Second lattice vector, in atomic units. N.B.: when using -av2, -alat is ignored!
-av3 x y z Third lattice vector, in atomic units. N.B.: when using -av3, -alat is ignored!
-kmax kx ky kz Reciprocal lattice vector inside the BZ with maximum norm. Used to calculate max(|G+K|). (2pi/a)^2 units.
A python script to extract the parameters from an output file of pw.x is also available. Example usage:
$ python gen_test_params.py a_pw_output
To analyze performance, run with:
mpirun -np X ./fft_test.x -ntg Y -ecutwfc 36.7500 -ecutrho 147.0000 -av1 36.6048 0.0 0.0 -av2 -18.3024 31.70067192 0.0 -av3 0.0 0.0 18.3024 -nbnd 400 -gamma .true.
Replace `X` and `Y` with appropriate values for your simulation.
......@@ -350,21 +350,26 @@ CONTAINS
! c array: stores the Fourier expansion coefficients
! Loop for all local g-vectors (ngw)
IF( PRESENT(ca) ) THEN
do ig = 1, desc%ngm
psi( desc%nlm( ig ) ) = CONJG( c( ig ) ) + ci * conjg( ca( ig ))
psi( desc%nl( ig ) ) = c( ig ) + ci * ca( ig )
end do
ELSE
IF( desc%ngm == desc%ngl( desc%mype + 1 ) ) THEN
DO ig = 1, desc%ngm
psi( desc%nl( ig ) ) = c( ig )
END DO
IF( desc%lgamma ) THEN
do ig = 1, desc%ngm
psi( desc%nlm( ig ) ) = CONJG( c( ig ) ) + ci * conjg( ca( ig ))
psi( desc%nl( ig ) ) = c( ig ) + ci * ca( ig )
end do
ELSE
! Gamma symmetry
do ig = 1, desc%ngm
psi( desc%nl( ig ) ) = c( ig ) + ci * ca( ig )
end do
END IF
ELSE
IF( desc%lgamma ) THEN
do ig = 1, desc%ngm
psi( desc%nlm( ig ) ) = CONJG( c( ig ) )
psi( desc%nl( ig ) ) = c( ig )
end do
ELSE
DO ig = 1, desc%ngm
psi( desc%nl( ig ) ) = c( ig )
END DO
END IF
END IF
END SUBROUTINE
......
......@@ -77,7 +77,17 @@
LOGICAL, SAVE :: dfti_first = .TRUE.
INTEGER :: dfti_status = 0
!
CALL check_dims()
! Check dimensions and corner cases.
!
IF ( nsl <= 0 ) THEN
IF ( nsl < 0 ) CALL fftx_error__(" fft_scalar: cft_1z ", " nsl out of range ", nsl)
! Starting from MKL 2019 it is no longer possible to define "empty" plans,
! i.e. plans with 0 FFTs. Just return immediately in this case.
RETURN
END IF
!
! Here initialize table only if necessary
!
......@@ -118,12 +128,6 @@
CONTAINS !=------------------------------------------------=!
SUBROUTINE check_dims()
IF( nsl < 0 ) THEN
CALL fftx_error__(" fft_scalar: cft_1z ", " nsl out of range ", nsl)
END IF
END SUBROUTINE check_dims
SUBROUTINE lookup()
IF( dfti_first ) THEN
DO ip = 1, ndims
......
......@@ -143,6 +143,11 @@ end function allowed
INTEGER, OPTIONAL, INTENT(IN) :: np
INTEGER :: new
IF (PRESENT(np)) THEN
IF (np <= 0) &
CALL fftx_error__( ' good_fft_order ', ' invalid np ', new )
ENDIF
new = nr
IF( PRESENT( np ) ) THEN
DO WHILE( ( ( .NOT. allowed( new ) ) .OR. ( MOD( new, np ) /= 0 ) ) .AND. ( new <= nfftx ) )
......
......@@ -148,8 +148,7 @@ CONTAINS
SUBROUTINE fft_type_allocate( desc, at, bg, gcutm, comm, fft_fact, nyfft )
!
! routine that allocate arrays of fft_type_descriptor
! must be called before fft_type_init
! routine allocating arrays of fft_type_descriptor, called by fft_type_init
!
TYPE (fft_type_descriptor) :: desc
REAL(DP), INTENT(IN) :: at(3,3), bg(3,3)
......@@ -732,7 +731,7 @@ CONTAINS
!=----------------------------------------------------------------------------=!
SUBROUTINE fft_type_init( dfft, smap, pers, lgamma, lpara, comm, at, bg, gcut_in, dual_in, nyfft )
SUBROUTINE fft_type_init( dfft, smap, pers, lgamma, lpara, comm, at, bg, gcut_in, dual_in, fft_fact, nyfft )
USE stick_base
......@@ -746,6 +745,7 @@ CONTAINS
REAL(DP), INTENT(IN) :: bg(3,3)
REAL(DP), INTENT(IN) :: at(3,3)
REAL(DP), OPTIONAL, INTENT(IN) :: dual_in
INTEGER, INTENT(IN), OPTIONAL :: fft_fact(3)
INTEGER, INTENT(IN) :: nyfft
!
! Potential or dual
......@@ -793,7 +793,7 @@ CONTAINS
!write (*,*) 'FFT_TYPE_INIT pers, gkcut,gcut', pers, gkcut, gcut
IF( .NOT. ALLOCATED( dfft%nsp ) ) THEN
CALL fft_type_allocate( dfft, at, bg, gcut, comm, nyfft=nyfft )
CALL fft_type_allocate( dfft, at, bg, gcut, comm, fft_fact=fft_fact, nyfft=nyfft )
ELSE
IF( dfft%comm /= comm ) THEN
CALL fftx_error__(' fft_type_init ', ' FFT already allocated with a different communicator ', 1 )
......@@ -804,8 +804,8 @@ CONTAINS
dfft%lpara = lpara ! this descriptor can be either a descriptor for a
! parallel FFT or a serial FFT even in parallel build
CALL sticks_map_allocate( smap, lgamma, dfft%lpara, dfft%nproc2, dfft%iproc, dfft%iproc2, &
dfft%nr1, dfft%nr2, dfft%nr3, bg, dfft%comm )
CALL sticks_map_allocate( smap, lgamma, dfft%lpara, dfft%nproc2, &
dfft%iproc, dfft%iproc2, dfft%nr1, dfft%nr2, dfft%nr3, bg, dfft%comm )
dfft%lgamma = smap%lgamma ! .TRUE. if the grid has Gamma symmetry
......@@ -822,7 +822,7 @@ CONTAINS
CALL get_sticks( smap, gcut, nstp, sstp, st, nst, ngm )
CALL fft_type_set( dfft, nst, smap%ub, smap%lb, smap%idx, &
smap%ist(:,1), smap%ist(:,2), nstp, nstpw, sstp, sstpw, st, stw )
smap%ist(:,1), smap%ist(:,2), nstp, nstpw, sstp, sstpw, st, stw )
dfft%ngw = dfft%nwl( dfft%mype + 1 )
dfft%ngm = dfft%ngl( dfft%mype + 1 )
......
import re, sys

info = """
This script collects the relevant simulation parameters and generates a
set of input options for `fft_test.x` that replicate the execution of
vloc_psi done in the production run.
This simplifies the identification of the optimal fft tasking parameters.

Usage: python run_test.py pw_out_file
"""

# Patterns matching the relevant parameter lines of a pw.x output file.
match_alat    = re.compile(r'lattice parameter \(alat\)\s+=\s+([+-]?([0-9]*[.])?[0-9]+)')
match_nbnd    = re.compile(r'number of Kohn-Sham states=\s+([+-]?([0-9]*[.])?[0-9]+)')
match_ecutwfc = re.compile(r'kinetic-energy cutoff\s+=\s+([+-]?([0-9]*[.])?[0-9]+)')
match_ecutrho = re.compile(r'charge density cutoff\s+=\s+([+-]?([0-9]*[.])?[0-9]+)')
match_k       = re.compile(r'number of k points=\s+(\d+)')

if __name__ == "__main__":
    if len(sys.argv) <= 1:
        print(info)
    else:
        # Only the header of the pw.x output is needed: read at most 30 kB.
        with open(sys.argv[1], 'r') as f:
            data = f.read(30000)
        gamma = False
        maxk = ''
        alat = match_alat.findall(data)
        nbnd = match_nbnd.findall(data)
        ewfc = match_ecutwfc.findall(data)
        erho = match_ecutrho.findall(data)
        # BUGFIX: the previous check indexed alat[0] before verifying that the
        # regex matched at all (IndexError on an empty list) and kept running
        # after printing the error. Stop cleanly if any parameter is missing.
        if not (alat and nbnd and ewfc and erho):
            sys.exit("Could not parse file. Sorry.")
        alat = alat[0][0]; nbnd = nbnd[0][0]; ewfc = ewfc[0][0]; erho = erho[0][0]
        a1 = []
        a2 = []
        a3 = []
        lines = data.splitlines()
        for i, line in enumerate(lines):
            if 'gamma-point specific algorithms are used' in line:
                gamma = True
            if 'crystal axes' in line:
                # Lattice vectors are printed in units of alat: rescale to a.u.
                a1 = [float(x)*float(alat) for x in lines[i+1].split()[3:6]]
                a2 = [float(x)*float(alat) for x in lines[i+2].split()[3:6]]
                a3 = [float(x)*float(alat) for x in lines[i+3].split()[3:6]]
            if 'number of k points' in line:
                nk = int(match_k.findall(line)[0])
                if '2pi/alat' in lines[i+1]:
                    # Scan the k-point list for the point with the largest
                    # squared norm; it is used to estimate max(|G+K|).
                    nrm2 = 0.0
                    for k in range(nk):
                        kn, v = re.findall(r"\(([-\d\s\.]*)\)", lines[i+k+2])
                        v2 = sum(float(x)**2 for x in v.split())
                        if v2 > nrm2:
                            # BUGFIX: nrm2 was never updated, so maxk ended up
                            # being the last k point instead of the one with
                            # maximum norm.
                            nrm2 = v2
                            maxk = v
        print("To analyze performance run with:")
        buf = ("mpirun -np X ./fft_test.x -ntg Y -ecutwfc {ewfc} -ecutrho {erho} "
               "-av1 {av1} -av2 {av2} -av3 {av3} -nbnd {nbnd} -gamma {gamma}").format(
                   ewfc=ewfc, erho=erho,
                   av1=' '.join(str(x) for x in a1),
                   av2=' '.join(str(x) for x in a2),
                   av3=' '.join(str(x) for x in a3),
                   nbnd=nbnd, gamma=('.true.' if gamma else '.false.'))
        if maxk:
            buf += " -kmax " + maxk
        print(buf)
......@@ -34,12 +34,4 @@ TEST : la_test.x
clean :
- /bin/rm -f *.o *.a *.d *.i *~ *_tmp.f90 *.mod *.L *.x
# .PHONY forces execution of a rule irrespective of the presence of an
# updated file with the same name of the rule. In this way, the script
# that generates version.f90 always runs, updating the version if you
# execute "svn update". The update_version script takes care of not
# changing the file if the svn version did not change
.PHONY: all clean
include make.depend
......@@ -8,13 +8,7 @@
!
MODULE la_param
#if defined(__MPI)
#if defined(__MPI_MODULE)
USE mpi
#else
INCLUDE 'mpif.h'
#endif
#endif
use parallel_include
INTEGER, PARAMETER :: DP = selected_real_kind(14,200)
......
......@@ -79,7 +79,11 @@ SUBROUTINE rdiaghg( n, m, h, s, ldh, e, v )
!
! ... calculate all eigenvalues
!
v(:,:) = h(:,:)
!$omp parallel do
do i =1, n
v(1:ldh,i) = h(1:ldh,i)
end do
!$omp end parallel do
!
#if defined (__ESSL)
!
......@@ -118,6 +122,7 @@ SUBROUTINE rdiaghg( n, m, h, s, ldh, e, v )
!
! ... restore input H matrix from saved diagonal and lower triangle
!
!$omp parallel do
DO i = 1, n
h(i,i) = hdiag(i)
DO j = i + 1, n
......@@ -127,6 +132,7 @@ SUBROUTINE rdiaghg( n, m, h, s, ldh, e, v )
h(j,i) = 0.0_DP
END DO
END DO
!$omp end parallel do
!
DEALLOCATE( hdiag )
!
......@@ -144,6 +150,7 @@ SUBROUTINE rdiaghg( n, m, h, s, ldh, e, v )
! ... restore input S matrix from saved diagonal and lower triangle
!
!$omp parallel do
DO i = 1, n
s(i,i) = sdiag(i)
DO j = i + 1, n
......@@ -153,6 +160,7 @@ SUBROUTINE rdiaghg( n, m, h, s, ldh, e, v )
s(j,i) = 0.0_DP
END DO
END DO
!$omp end parallel do
!
DEALLOCATE( sdiag )
!
......@@ -214,6 +222,7 @@ SUBROUTINE prdiaghg( n, h, s, ldh, e, v, desc )
#if defined(__SCALAPACK)
INTEGER :: desch( 16 ), info
#endif
INTEGER :: i
!
CALL start_clock( 'rdiaghg' )
!
......@@ -227,8 +236,12 @@ SUBROUTINE prdiaghg( n, h, s, ldh, e, v, desc )
ALLOCATE( hh( nx, nx ) )
ALLOCATE( ss( nx, nx ) )
!
hh(1:nx,1:nx) = h(1:nx,1:nx)
ss(1:nx,1:nx) = s(1:nx,1:nx)
!$omp parallel do
do i=1,nx
hh(1:nx,i) = h(1:nx,i)
ss(1:nx,i) = s(1:nx,i)
end do
!$omp end parallel do
!
END IF
!
......
......@@ -142,7 +142,9 @@ program lax_test
write(6,*)
write(6,*) 'matrix size = ', n, ' x ', n
write(6,*) 'num. procs = ', npes
#if defined(_OPENMP)
write(6,*) 'thr x proc = ', omp_get_max_threads()
#endif
write(6,*)
endif
......
......@@ -6,13 +6,16 @@ include ../make.inc
MODFLAGS=$(MOD_FLAG).
UTIL = clocks_handler.o \
divide.o \
cuda_util.o \
divide.o \
data_buffer.o \
error_handler.o \
find_free_unit.o \
fletcher32_mod.o \
mem_counter.o \
mp.o \
mp_base.o \
mp_base_gpu.o \
mp_bands_util.o \
parallel_include.o \
util_param.o \
......@@ -28,12 +31,4 @@ libutil.a: $(UTIL)
clean :
- /bin/rm -f *.o *.a *.d *.i *~ *_tmp.f90 *.mod *.L *.x
# .PHONY forces execution of a rule irrespective of the presence of an
# updated file with the same name of the rule. In this way, the script
# that generates version.f90 always runs, updating the version if you
# execute "svn update". The update_version script takes care of not
# changing the file if the svn version did not change
.PHONY: all clean
include make.depend
# Makefile for DAVID
include ../make.inc
# location of needed modules and included files (if any)
MODFLAGS=$(MOD_FLAG).
DFLAGS=$$DFLAGMATRIX
UTIL = clocks_handler.o \
divide.o \
data_buffer.o \
error_handler.o \
find_free_unit.o \
fletcher32_mod.o \
mp.o \
mp_base.o \
mp_base_gpu.o \
mp_bands_util.o \
parallel_include.o \
util_param.o
all : libutil.a
libutil.a: $(UTIL)
$(AR) $(ARFLAGS) $@ $?
$(RANLIB) $@
clean :
- /bin/rm -f *.o *.a *.d *.i *~ *_tmp.f90 *.mod *.L *.x
# .PHONY forces execution of a rule irrespective of the presence of an
# updated file with the same name of the rule. In this way, the script
# that generates version.f90 always runs, updating the version if you
# execute "svn update". The update_version script takes care of not
# changing the file if the svn version did not change
.PHONY: all clean
include make.depend
UtilXlib
========
This library implements various basic tasks such as timing, tracing,
optimized memory accesses and an abstraction layer for the MPI subroutines.
The following pre-processor directives can be used to enable/disable some
features:
* `__MPI` : activates MPI support.
* `__TRACE` : activates verbose output for debugging purposes
* `__CUDA` : activates CUDA Fortran based interfaces.
* `__GPU_MPI` : use CUDA aware MPI calls instead of standard sync-send-update method (experimental).
Usage of wrapper interfaces for MPI
===================================
This library offers a number of interfaces to abstract the MPI APIs and
to optionally relax the dependency on an MPI library.
`mp_*` interfaces present in the library can only be called after the
initialization performed by the subroutine `mp_start` and before the
finalization done by `mp_end`.
All rules have exceptions and indeed subroutines `mp_count_nodes`,
`mp_type_create_column_section` and `mp_type_free` can also be called
outside the aforementioned window.
If CUDA Fortran support is enabled, almost all interfaces accept input
data declared with the `device` attribute. Note however that CUDA Fortran
support should be considered experimental.
CUDA specific notes
===================
All calls to message passing interfaces are synchronous with respect to
both MPI and CUDA streams. The code will synchronize the device before
starting the communication, also in those cases where communication
may be avoided (for example in serial version).
A different behaviour may be observed when the default stream
synchronization behaviour is overridden by the user (see `cudaStreamCreateWithFlags`).
Be careful when using CUDA-aware MPI. Some implementations are not
complete. The library will not check for the CUDA-aware MPI APIs during
the initialization, but may report failure codes during the execution.
If you encounter problems when adding the flag `__GPU_MPI` it might
be that the MPI library does not support some CUDA-aware APIs.
Known Issues
============
Owing to the use of the `source` option in data allocations,
PGI versions older than 17.10 may fail with arrays having initial index
different from 1.
Testing
=======
Partial unit testing is available in the `tests` sub-directory. See the
README in that directory for further information.
......@@ -321,6 +321,30 @@ SUBROUTINE print_clock( label )
END SUBROUTINE print_clock
!
!----------------------------------------------------------------------------
FUNCTION get_cpu_and_wall( n ) result (t)
   !--------------------------------------------------------------------------
   !
   ! Return both the CPU and the wall time accumulated so far by clock n:
   ! t(1) = CPU time, t(2) = wall time. If the clock is currently running,
   ! the time elapsed since it was last started is added to the stored totals.
   !
   USE util_param, ONLY : DP
   USE mytime, ONLY : clock_label, cputime, walltime, mpi_per_thread, &
                      notrunning, called, t0cpu, t0wall, f_wall, f_tcpu
   IMPLICIT NONE
   !
   ! index of the clock to query (intent added: n is read-only here)
   INTEGER, INTENT(IN) :: n
   REAL(DP) :: t(2)
   !
   IF ( t0cpu(n) == notrunning ) THEN
      ! clock is stopped: return the accumulated totals as they are
      t(1) = cputime(n)
      t(2) = walltime(n)
   ELSE
      ! clock is running: add the time elapsed since the last start
      t(1) = cputime(n) + f_tcpu() - t0cpu(n)
      t(2) = walltime(n) + f_wall() - t0wall(n)
   END IF
#if defined(PRINT_AVG_CPU_TIME_PER_THREAD)
   ! rescale the elapsed cpu time on a per-thread basis
   t(1) = t(1) * mpi_per_thread
#endif
END FUNCTION get_cpu_and_wall
!----------------------------------------------------------------------------
SUBROUTINE print_this_clock( n )
!----------------------------------------------------------------------------
!
......
!
! Copyright (C) 2002-2018 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
! Utility functions to perform memcpy and memset on the device with CUDA Fortran
! cuf_memXXX contains a CUF KERNEL to perform the selected operation
!
MODULE cuda_util
!
USE util_param, ONLY : DP
!
IMPLICIT NONE
!
PUBLIC :: cuf_memcpy, cuf_memset
!
INTERFACE cuf_memcpy
MODULE PROCEDURE &
cuf_memcpy_r1d, &
cuf_memcpy_r2d, &
cuf_memcpy_r3d, &
cuf_memcpy_c1d, &
cuf_memcpy_c2d, &
cuf_memcpy_c3d
END INTERFACE
!
INTERFACE cuf_memset
MODULE PROCEDURE &
cuf_memset_r1d, &
cuf_memset_r2d, &
cuf_memset_r3d, &
cuf_memset_c1d, &
cuf_memset_c2d, &
cuf_memset_c3d
END INTERFACE
!
CONTAINS
!
SUBROUTINE cuf_memcpy_r1d(array_out, array_in, range1 )
   !
   ! Copy the elements of array_in with indices range1(1):range1(2) into
   ! array_out. When compiled with __CUDA the arrays live on the device
   ! and the loop is executed as a CUF kernel.
   !
   IMPLICIT NONE
   !
   REAL(DP), INTENT(INOUT) :: array_out(:)
   REAL(DP), INTENT(IN) :: array_in(:)
   INTEGER, INTENT(IN) :: range1(2)
   !
#if defined(__CUDA)
   attributes(DEVICE) :: array_out, array_in
#endif
   !
   INTEGER :: i, lo, hi
   !
   ! bounds of the section to be copied
   lo = range1(1)
   hi = range1(2)
   !
   !$cuf kernel do(1)
   DO i = lo, hi
      array_out( i ) = array_in( i )
   ENDDO
   !
END SUBROUTINE cuf_memcpy_r1d
!
SUBROUTINE cuf_memcpy_r2d(array_out, array_in, range1, range2 )
!
IMPLICIT NONE
!