gemv.f90 Source File


Source Code

module mfi_blas_gemv
    use iso_fortran_env
    use f77_blas
#if defined(MFI_CUBLAS)
    use iso_c_binding
    use mfi_blas_cublas
#endif
#if defined(MFI_EXTENSIONS)
    use mfi_blas_extensions
#endif
    implicit none

!> Generic modern interface for GEMV.
!> Supports s, d, c, z.
!> See also:
!> [[f77_gemv:sgemv]], [[f77_gemv:dgemv]], [[f77_gemv:cgemv]], [[f77_gemv:zgemv]].
interface mfi_gemv
    module procedure :: mfi_sgemv
    module procedure :: mfi_dgemv
    module procedure :: mfi_cgemv
    module procedure :: mfi_zgemv
end interface

contains

!> Modern interface for [[f77_gemv:f77_gemv]].
!> See also: [[mfi_gemv]], [[f77_gemv]].
pure subroutine mfi_sgemv(a, x, y, trans, alpha, beta, incx, incy)
    integer, parameter :: wp = REAL32
    real(REAL32), intent(in), target :: a(:,:), x(:)
    real(REAL32), intent(inout), target :: y(:)
    character, intent(in), optional :: trans
    character :: local_trans
    real(REAL32), intent(in), optional :: alpha
    real(REAL32) :: local_alpha
    real(REAL32), intent(in), optional :: beta
    real(REAL32) :: local_beta
    integer, intent(in), optional :: incx
    integer :: local_incx
    integer, intent(in), optional :: incy
    integer :: local_incy
    integer :: m, n, lda
    if (present(trans)) then
        local_trans = trans
    else
        local_trans = 'N'
    end if
    if (present(alpha)) then
        local_alpha = alpha
    else
        local_alpha = 1.0_wp
    end if
    if (present(beta)) then
        local_beta = beta
    else
        local_beta = 0.0_wp
    end if
    if (present(incx)) then
        local_incx = incx
    else
        local_incx = 1
    end if
    if (present(incy)) then
        local_incy = incy
    else
        local_incy = 1
    end if
    m = size(a,1)
    n = size(a,2)
    lda = max(1,m)
#if defined(MFI_EXTENSIONS) && defined(MFI_CUBLAS)
    if (mfi_cublas_is_active()) then
        block
    integer(c_int) :: cuda_allocation_status
        type(c_ptr) :: device_a
        type(c_ptr) :: device_x
        type(c_ptr) :: device_y
        integer(c_int) :: op, cublas_stat
        real(REAL32), target :: alpha_target, beta_target
        type(c_ptr) :: handle
        handle = mfi_cublas_handle_get()
        call cuda_malloc(device_a, &
                              int(size(a) * storage_size(a)/8, c_size_t), &
                              cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMalloc failed allocating a'
        call cuda_malloc(device_x, &
                              int(size(x) * storage_size(x)/8, c_size_t), &
                              cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMalloc failed allocating x'
        call cuda_malloc(device_y, &
                              int(size(y) * storage_size(y)/8, c_size_t), &
                              cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMalloc failed allocating y'
        if (local_trans == 'N' .or. local_trans == 'n') then
            op = CUBLAS_OP_N
        else if (local_trans == 'T' .or. local_trans == 't') then
            op = CUBLAS_OP_T
        else
            op = CUBLAS_OP_C
        end if
        alpha_target = local_alpha
        beta_target = local_beta
        call cudaMemcpy(device_a, c_loc(a), &
                        int(size(a) * storage_size(a)/8, c_size_t), &
                        cudaMemcpyHostToDevice, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMemcpy (H->D) failed for a'
        call cudaMemcpy(device_x, c_loc(x), &
                        int(size(x) * storage_size(x)/8, c_size_t), &
                        cudaMemcpyHostToDevice, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMemcpy (H->D) failed for x'
        call cudaMemcpy(device_y, c_loc(y), &
                        int(size(y) * storage_size(y)/8, c_size_t), &
                        cudaMemcpyHostToDevice, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMemcpy (H->D) failed for y'
        cublas_stat = cublasSgemv(handle, op, &
                 int(m,c_int), int(n,c_int), &
                 c_loc(alpha_target), device_a, int(lda,c_int), &
                 device_x, int(local_incx,c_int), &
                 c_loc(beta_target), device_y, int(local_incy,c_int))
        if (cublas_stat /= 0) call mfi_cublas_error(cublas_stat, 'Sgemv')
        call cudaMemcpy(c_loc(y), device_y, &
                        int(size(y) * storage_size(y)/8, c_size_t), &
                        cudaMemcpyDeviceToHost, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMemcpy (D->H) failed for y'
        call cuda_free(device_a, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaFree failed deallocating a'
        call cuda_free(device_x, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaFree failed deallocating x'
        call cuda_free(device_y, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaFree failed deallocating y'
        end block
        return
    end if
#endif
    call f77_gemv(local_trans,m,n,local_alpha,a,lda,x,local_incx,local_beta,y,local_incy)
end subroutine
!> Modern interface for [[f77_gemv:f77_gemv]].
!> See also: [[mfi_gemv]], [[f77_gemv]].
pure subroutine mfi_dgemv(a, x, y, trans, alpha, beta, incx, incy)
    integer, parameter :: wp = REAL64
    real(REAL64), intent(in), target :: a(:,:), x(:)
    real(REAL64), intent(inout), target :: y(:)
    character, intent(in), optional :: trans
    character :: local_trans
    real(REAL64), intent(in), optional :: alpha
    real(REAL64) :: local_alpha
    real(REAL64), intent(in), optional :: beta
    real(REAL64) :: local_beta
    integer, intent(in), optional :: incx
    integer :: local_incx
    integer, intent(in), optional :: incy
    integer :: local_incy
    integer :: m, n, lda
    if (present(trans)) then
        local_trans = trans
    else
        local_trans = 'N'
    end if
    if (present(alpha)) then
        local_alpha = alpha
    else
        local_alpha = 1.0_wp
    end if
    if (present(beta)) then
        local_beta = beta
    else
        local_beta = 0.0_wp
    end if
    if (present(incx)) then
        local_incx = incx
    else
        local_incx = 1
    end if
    if (present(incy)) then
        local_incy = incy
    else
        local_incy = 1
    end if
    m = size(a,1)
    n = size(a,2)
    lda = max(1,m)
#if defined(MFI_EXTENSIONS) && defined(MFI_CUBLAS)
    if (mfi_cublas_is_active()) then
        block
    integer(c_int) :: cuda_allocation_status
        type(c_ptr) :: device_a
        type(c_ptr) :: device_x
        type(c_ptr) :: device_y
        integer(c_int) :: op, cublas_stat
        real(REAL64), target :: alpha_target, beta_target
        type(c_ptr) :: handle
        handle = mfi_cublas_handle_get()
        call cuda_malloc(device_a, &
                              int(size(a) * storage_size(a)/8, c_size_t), &
                              cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMalloc failed allocating a'
        call cuda_malloc(device_x, &
                              int(size(x) * storage_size(x)/8, c_size_t), &
                              cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMalloc failed allocating x'
        call cuda_malloc(device_y, &
                              int(size(y) * storage_size(y)/8, c_size_t), &
                              cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMalloc failed allocating y'
        if (local_trans == 'N' .or. local_trans == 'n') then
            op = CUBLAS_OP_N
        else if (local_trans == 'T' .or. local_trans == 't') then
            op = CUBLAS_OP_T
        else
            op = CUBLAS_OP_C
        end if
        alpha_target = local_alpha
        beta_target = local_beta
        call cudaMemcpy(device_a, c_loc(a), &
                        int(size(a) * storage_size(a)/8, c_size_t), &
                        cudaMemcpyHostToDevice, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMemcpy (H->D) failed for a'
        call cudaMemcpy(device_x, c_loc(x), &
                        int(size(x) * storage_size(x)/8, c_size_t), &
                        cudaMemcpyHostToDevice, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMemcpy (H->D) failed for x'
        call cudaMemcpy(device_y, c_loc(y), &
                        int(size(y) * storage_size(y)/8, c_size_t), &
                        cudaMemcpyHostToDevice, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMemcpy (H->D) failed for y'
        cublas_stat = cublasDgemv(handle, op, &
                 int(m,c_int), int(n,c_int), &
                 c_loc(alpha_target), device_a, int(lda,c_int), &
                 device_x, int(local_incx,c_int), &
                 c_loc(beta_target), device_y, int(local_incy,c_int))
        if (cublas_stat /= 0) call mfi_cublas_error(cublas_stat, 'Dgemv')
        call cudaMemcpy(c_loc(y), device_y, &
                        int(size(y) * storage_size(y)/8, c_size_t), &
                        cudaMemcpyDeviceToHost, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMemcpy (D->H) failed for y'
        call cuda_free(device_a, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaFree failed deallocating a'
        call cuda_free(device_x, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaFree failed deallocating x'
        call cuda_free(device_y, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaFree failed deallocating y'
        end block
        return
    end if
#endif
    call f77_gemv(local_trans,m,n,local_alpha,a,lda,x,local_incx,local_beta,y,local_incy)
end subroutine
!> Modern interface for [[f77_gemv:f77_gemv]].
!> See also: [[mfi_gemv]], [[f77_gemv]].
pure subroutine mfi_cgemv(a, x, y, trans, alpha, beta, incx, incy)
    integer, parameter :: wp = REAL32
    complex(REAL32), intent(in), target :: a(:,:), x(:)
    complex(REAL32), intent(inout), target :: y(:)
    character, intent(in), optional :: trans
    character :: local_trans
    complex(REAL32), intent(in), optional :: alpha
    complex(REAL32) :: local_alpha
    complex(REAL32), intent(in), optional :: beta
    complex(REAL32) :: local_beta
    integer, intent(in), optional :: incx
    integer :: local_incx
    integer, intent(in), optional :: incy
    integer :: local_incy
    integer :: m, n, lda
    if (present(trans)) then
        local_trans = trans
    else
        local_trans = 'N'
    end if
    if (present(alpha)) then
        local_alpha = alpha
    else
        local_alpha = 1.0_wp
    end if
    if (present(beta)) then
        local_beta = beta
    else
        local_beta = 0.0_wp
    end if
    if (present(incx)) then
        local_incx = incx
    else
        local_incx = 1
    end if
    if (present(incy)) then
        local_incy = incy
    else
        local_incy = 1
    end if
    m = size(a,1)
    n = size(a,2)
    lda = max(1,m)
#if defined(MFI_EXTENSIONS) && defined(MFI_CUBLAS)
    if (mfi_cublas_is_active()) then
        block
    integer(c_int) :: cuda_allocation_status
        type(c_ptr) :: device_a
        type(c_ptr) :: device_x
        type(c_ptr) :: device_y
        integer(c_int) :: op, cublas_stat
        complex(REAL32), target :: alpha_target, beta_target
        type(c_ptr) :: handle
        handle = mfi_cublas_handle_get()
        call cuda_malloc(device_a, &
                              int(size(a) * storage_size(a)/8, c_size_t), &
                              cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMalloc failed allocating a'
        call cuda_malloc(device_x, &
                              int(size(x) * storage_size(x)/8, c_size_t), &
                              cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMalloc failed allocating x'
        call cuda_malloc(device_y, &
                              int(size(y) * storage_size(y)/8, c_size_t), &
                              cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMalloc failed allocating y'
        if (local_trans == 'N' .or. local_trans == 'n') then
            op = CUBLAS_OP_N
        else if (local_trans == 'T' .or. local_trans == 't') then
            op = CUBLAS_OP_T
        else
            op = CUBLAS_OP_C
        end if
        alpha_target = local_alpha
        beta_target = local_beta
        call cudaMemcpy(device_a, c_loc(a), &
                        int(size(a) * storage_size(a)/8, c_size_t), &
                        cudaMemcpyHostToDevice, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMemcpy (H->D) failed for a'
        call cudaMemcpy(device_x, c_loc(x), &
                        int(size(x) * storage_size(x)/8, c_size_t), &
                        cudaMemcpyHostToDevice, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMemcpy (H->D) failed for x'
        call cudaMemcpy(device_y, c_loc(y), &
                        int(size(y) * storage_size(y)/8, c_size_t), &
                        cudaMemcpyHostToDevice, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMemcpy (H->D) failed for y'
        cublas_stat = cublasCgemv(handle, op, &
                 int(m,c_int), int(n,c_int), &
                 c_loc(alpha_target), device_a, int(lda,c_int), &
                 device_x, int(local_incx,c_int), &
                 c_loc(beta_target), device_y, int(local_incy,c_int))
        if (cublas_stat /= 0) call mfi_cublas_error(cublas_stat, 'Cgemv')
        call cudaMemcpy(c_loc(y), device_y, &
                        int(size(y) * storage_size(y)/8, c_size_t), &
                        cudaMemcpyDeviceToHost, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMemcpy (D->H) failed for y'
        call cuda_free(device_a, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaFree failed deallocating a'
        call cuda_free(device_x, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaFree failed deallocating x'
        call cuda_free(device_y, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaFree failed deallocating y'
        end block
        return
    end if
#endif
    call f77_gemv(local_trans,m,n,local_alpha,a,lda,x,local_incx,local_beta,y,local_incy)
end subroutine
!> Modern interface for [[f77_gemv:f77_gemv]].
!> See also: [[mfi_gemv]], [[f77_gemv]].
pure subroutine mfi_zgemv(a, x, y, trans, alpha, beta, incx, incy)
    integer, parameter :: wp = REAL64
    complex(REAL64), intent(in), target :: a(:,:), x(:)
    complex(REAL64), intent(inout), target :: y(:)
    character, intent(in), optional :: trans
    character :: local_trans
    complex(REAL64), intent(in), optional :: alpha
    complex(REAL64) :: local_alpha
    complex(REAL64), intent(in), optional :: beta
    complex(REAL64) :: local_beta
    integer, intent(in), optional :: incx
    integer :: local_incx
    integer, intent(in), optional :: incy
    integer :: local_incy
    integer :: m, n, lda
    if (present(trans)) then
        local_trans = trans
    else
        local_trans = 'N'
    end if
    if (present(alpha)) then
        local_alpha = alpha
    else
        local_alpha = 1.0_wp
    end if
    if (present(beta)) then
        local_beta = beta
    else
        local_beta = 0.0_wp
    end if
    if (present(incx)) then
        local_incx = incx
    else
        local_incx = 1
    end if
    if (present(incy)) then
        local_incy = incy
    else
        local_incy = 1
    end if
    m = size(a,1)
    n = size(a,2)
    lda = max(1,m)
#if defined(MFI_EXTENSIONS) && defined(MFI_CUBLAS)
    if (mfi_cublas_is_active()) then
        block
    integer(c_int) :: cuda_allocation_status
        type(c_ptr) :: device_a
        type(c_ptr) :: device_x
        type(c_ptr) :: device_y
        integer(c_int) :: op, cublas_stat
        complex(REAL64), target :: alpha_target, beta_target
        type(c_ptr) :: handle
        handle = mfi_cublas_handle_get()
        call cuda_malloc(device_a, &
                              int(size(a) * storage_size(a)/8, c_size_t), &
                              cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMalloc failed allocating a'
        call cuda_malloc(device_x, &
                              int(size(x) * storage_size(x)/8, c_size_t), &
                              cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMalloc failed allocating x'
        call cuda_malloc(device_y, &
                              int(size(y) * storage_size(y)/8, c_size_t), &
                              cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMalloc failed allocating y'
        if (local_trans == 'N' .or. local_trans == 'n') then
            op = CUBLAS_OP_N
        else if (local_trans == 'T' .or. local_trans == 't') then
            op = CUBLAS_OP_T
        else
            op = CUBLAS_OP_C
        end if
        alpha_target = local_alpha
        beta_target = local_beta
        call cudaMemcpy(device_a, c_loc(a), &
                        int(size(a) * storage_size(a)/8, c_size_t), &
                        cudaMemcpyHostToDevice, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMemcpy (H->D) failed for a'
        call cudaMemcpy(device_x, c_loc(x), &
                        int(size(x) * storage_size(x)/8, c_size_t), &
                        cudaMemcpyHostToDevice, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMemcpy (H->D) failed for x'
        call cudaMemcpy(device_y, c_loc(y), &
                        int(size(y) * storage_size(y)/8, c_size_t), &
                        cudaMemcpyHostToDevice, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMemcpy (H->D) failed for y'
        cublas_stat = cublasZgemv(handle, op, &
                 int(m,c_int), int(n,c_int), &
                 c_loc(alpha_target), device_a, int(lda,c_int), &
                 device_x, int(local_incx,c_int), &
                 c_loc(beta_target), device_y, int(local_incy,c_int))
        if (cublas_stat /= 0) call mfi_cublas_error(cublas_stat, 'Zgemv')
        call cudaMemcpy(c_loc(y), device_y, &
                        int(size(y) * storage_size(y)/8, c_size_t), &
                        cudaMemcpyDeviceToHost, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaMemcpy (D->H) failed for y'
        call cuda_free(device_a, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaFree failed deallocating a'
        call cuda_free(device_x, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaFree failed deallocating x'
        call cuda_free(device_y, cuda_allocation_status)
        if (cuda_allocation_status /= 0) error stop 'cudaFree failed deallocating y'
        end block
        return
    end if
#endif
    call f77_gemv(local_trans,m,n,local_alpha,a,lda,x,local_incx,local_beta,y,local_incy)
end subroutine
end module