!{\src2tex{textfont=tt}}
!!****f* ABINIT/nonlop_htor
!! nonlop_htor
!!
!! NAME
!! nonlop_htor
!!
!! FUNCTION
!! * Compute application of a nonlocal operator Vnl in order to get:
!!    - a function in reciprocal space (|out> = Vnl|in>)
!!   Operator Vnl, as the following form:
!!    $Vnl=sum_{R,lmn,l''m''n''} {|P_{Rlmn}> Enl^{R}_{lmn,l''m''n''} <P_{Rl''m''n''}|}$
!!   Operator Vnl is -- in the typical case -- the nonlocal potential.
!!   - With norm-conserving pseudopots, $Enl^{R}_{lmn,l''m''n''}$ is the
!!     Kleinmann-Bylander energy $Ekb^{R}_{ln}$.
!!   - The |P_{Rlmn}> are the projector functions.
!! * This routine uses Legendre polynomials Pl to express Vnl.
!!
!! COPYRIGHT
!! Copyright (C) 1998-2007 ABINIT group (DCA, XG, GMR, GZ, MT, FF, DRH)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt.
!!
!! INPUTS
!!  choice: chooses possible output:
!!    choice=1 => a non-local energy contribution
!!          =2 => a gradient with respect to atomic position(s)
!!          =3 => a gradient with respect to strain(s)
!!          =23=> a gradient with respect to atm. pos. and strain(s)
!!          =4 => a gradient and 2nd derivative with respect to atomic pos.
!!          =5 => a gradient with respect to k wavevector
!!          =6 => 2nd derivatives with respect to strain
!!  dimekb1,dimekb2=dimensions of ekb (see ekb)
!!  dimffnlin=second dimension of ffnlin (1+number of derivatives)
!!  dimffnlout=second dimension of ffnlout (1+number of derivatives)
!!  ekb(dimekb1,dimekb2)= (Real) Kleinman-Bylander energies (hartree)
!!                        dimekb1=lmnmax  -  dimekp2=ntypat
!!  ffnlin(npwin,dimffnlin,lmnmax,ntypat)=nonlocal form factors to be used
!!          for the application of the nonlocal operator to the |in> vector
!!  ffnlout(npwout,dimffnlout,lmnmax,ntypat)=nonlocal form factors to be used
!!          for the application of the nonlocal operator to the |out> vector
!!  gmet(3,3)=metric tensor for G vecs (in bohr**-2)
!!  gprimd(3,3)=dimensional reciprocal space primitive translations
!!   (bohr^-1)
!!  idir=direction of the - atom to be moved in the case (choice=2,signs=2),
!!                        - k point direction in the case (choice=5,signs=2)
!!                        - strain component (1:6) in the case (choice=2,signs=2) or (choice=6,signs=1)
!!  indlmn(6,i,ntypat)= array giving l,m,n,lm,ln,s for i=ln
!!  istwf_k=option parameter that describes the storage of wfs
!!  kgin(3,npwin)=integer coords of planewaves in basis sphere, for the |in> vector
!!  kgout(3,npwout)=integer coords of planewaves in basis sphere, for the |out> vector
!!  kpgin(npw,npkgin)= (k+G) components and related data, for the |in> vector
!!  kpgout(npw,nkpgout)=(k+G) components and related data, for the |out> vector
!!  kptin(3)=k point in terms of recip. translations, for the |in> vector
!!  kptout(3)=k point in terms of recip. translations, for the |out> vector
!!  lmnmax=max. number of (l,m,n) components over all types of atoms
!!  matblk=dimension of the arrays ph3din and ph3dout
!!  mgfft=maximum size of 1D FFTs
!!  mpi_enreg=informations about MPI parallelization
!!  natom=number of atoms in cell
!!  nattyp(ntypat)=number of atoms of each type
!!  ngfft(18)=contain all needed information about 3D FFT, see ~ABINIT/doc/input_variables/vargs.htm#ngfft
!!  nkpgin,nkpgout=second sizes of arrays kpgin/kpgout
!!  nloalg(5)=governs the choice of the algorithm for nonlocal operator
!!  nnlout=dimension of enlout: choice=1=>nnlout=1   choice=2=>nnlout=3*natom
!!                              choice=3=>nnlout=6   choice=4=>nnlout=6*natom
!!                              choice=5=>nnlout=1   choice=6=>nnlout=6*(3*natom+6)
!!                              choice=23=>nnlout=6+3*natom
!!  npwin=number of planewaves for given k point, for the |in> vector
!!  npwout=number of planewaves for given k point, for the |out> vector
!!  nspinor=number of spinorial components of the wavefunctions
!!  ntypat=number of types of atoms in cell
!!  phkxredin(2,natom)=phase factors exp(2 pi kptin.xred)
!!  phkxredout(2,natom)=phase factors exp(2 pi kptout.xred)
!!  ph1d(2,3*(2*mgfft+1)*natom)=1D structure factors phase information
!!  ph3din(2,npwin,matblk)=3D structure factors, for each atom and plane wave (in)
!!  ph3dout(2,npwout,matblk)=3-dim structure factors, for each atom and plane wave (out)
!!  pspso(ntypat)=spin-orbit characteristic for each atom type
!!  signs= if 1, get contracted elements (energy, forces, stress, ...)
!!         if 2, applies the non-local operator to a function in reciprocal space
!!  ucvol=unit cell volume (bohr^3)
!!  vectin(2,nspinor*npwin)=input cmplx wavefunction coefficients <G|Cnk>
!!
!! OUTPUT
!!  ==== if (signs==1) ====
!!     enlout(nnlout)= contribution of this state to the nl part
!!                     of the following properties:
!!       if choice=1 : enlout(1)               -> the energy
!!  ==== if (signs==2) ====
!!     vectout(2,nspinor*npwout)= result of the aplication of the nl operator
!!                                or one of its derivative to the input vect.:
!!       if choice=1 : Vnl |vectin>
!!
!! NOTES
!! In the case signs=1, the array vectout is not used, nor modified
!! so that the same array as vectin can be used as a dummy argument;
!! the same is true for the pairs npwin-npwout, ffnlin-ffnlout,
!! kgin-kgout, ph3din-ph3dout, phkredin-phkxredout).
!!
!! Calculation includes contributions to grads of Etot wrt coord and
!! wrt strains for l=0,1,2,3.
!!
!! WARNINGS
!!  - Warning 1: This routine is in a transient state, during the
!!    time of the implementation of the spin-orbit coupling...
!!    In particular, the OMP parallelisation is still missing,
!!    but it matters here only when nspinor==2.
!!  - Warning 2: the order of atoms is governed by atindx
!!
!! PARENTS
!!      cgwf_htor
!!
!! CHILDREN
!!      leave_new,metcon,mkffkg3_htor,mpi_allreduce,wrtout
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

subroutine nonlop_htor(choice,dimekb1,dimekb2,dimffnlin,dimffnlout,ekb,enlout,&
&                     ffnlin,ffnlout,gmet,gprimd,idir,indlmn,istwf_k,kgin,kgout,kpgin,kpgout,&
&                     kptin,kptout,lmnmax,matblk,mgfft,mpi_enreg,mpsang,mpssoang,&
&                     natom,nattyp,ngfft,nkpgin,nkpgout,nloalg,nnlout,npwin,npwout,nspinor,ntypat,&
&                     phkxredin,phkxredout,ph1d,ph3din,ph3dout,pspso,signs,&
&                     ucvol,vectin,vectout)

 use defs_basis
 use defs_datatypes

!This section has been created automatically by the script Abilint (TD). Do not modify these by hand.
#ifdef HAVE_FORTRAN_INTERFACES
 use interfaces_01manage_mpi
 use interfaces_13nonlocal
#endif
!End of the abilint section

 implicit none

#if defined MPI
 include 'mpif.h'
#endif

!Arguments ------------------------------------
!This type is defined in defs_mpi
!The (inout) classification below is misleading; mpi_enreg is temporarily
! changed but reset to its initial condition before exiting.
!scalars
 integer,intent(in) :: choice,dimekb1,dimekb2,dimffnlin,dimffnlout,idir,istwf_k
 integer,intent(in) :: lmnmax,matblk,mgfft,mpsang,mpssoang,natom,nkpgin,nkpgout
 integer,intent(in) :: nnlout,npwin,npwout,nspinor,ntypat,signs
 real(dp),intent(in) :: ucvol
 type(MPI_type),intent(inout) :: mpi_enreg
!arrays
 integer,intent(in) :: indlmn(6,lmnmax,ntypat),kgin(3,npwin),kgout(3,npwout)
 integer,intent(in) :: nattyp(ntypat),ngfft(18),nloalg(5),pspso(ntypat)
 real(dp),intent(in) :: ekb(dimekb1,dimekb2)
 real(dp),intent(in) :: ffnlin(npwin,dimffnlin,lmnmax,ntypat)
 real(dp),intent(in) :: ffnlout(npwout,dimffnlout,lmnmax,ntypat),gmet(3,3)
 real(dp),intent(in) :: gprimd(3,3),kpgin(npwin,nkpgin),kpgout(npwout,nkpgout),kptin(3),kptout(3)
 real(dp),intent(in) :: ph1d(2,3*(2*mgfft+1)*natom),ph3din(2,npwin,matblk)
 real(dp),intent(in) :: ph3dout(2,npwout,matblk),phkxredin(2,natom)
 real(dp),intent(in) :: phkxredout(2,natom),vectin(2,nspinor*npwin)
 real(dp),intent(out) :: enlout(nnlout),vectout(2,nspinor*npwout)

!Local variables-------------------------------
!mlang is the maximum number of different angular momenta
!(mlang=4 means s,p,d,f)
! Note : in a future version, one should adjust mlang to mpsang.
!mlang2 is the maximum number of unique tensor components for a tensor
!of rank (mlang-1) with index range 1-3
!mlang3 is the maximum number of unique tensor components summed over
!all tensors of rank 0 through mlang-1.
!mlang4 is the total number of additional unique tensor components
!related to strain gradients, ranks 2 through mlang+1.
!mlang6 is the total number of additional unique tensor components
!related to strain 2nd derivaives, ranks 4 through mlang+3.
!mlang1 is the total number of certain additional unique tensor components
!related to internal strain, ranks 1 through mlang
!mlang5 is the total number of other additional unique tensor components
!related to internal strain, ranks 1 through mlang
!scalars
 integer,parameter :: mlang=4
 integer,save :: mlang1=((mlang+1)*(mlang+2)*(mlang+3))/6-1
 integer,save :: mlang2=(mlang*(mlang+1))/2
 integer,save :: mlang3=(mlang*(mlang+1)*(mlang+2))/6
 integer,save :: mlang4=((mlang+2)*(mlang+3)*(mlang+4))/6-4
 integer,save :: mlang5=((mlang+3)*(mlang+4)*(mlang+5))/6-10
 integer,save :: mlang6=((mlang+4)*(mlang+5)*(mlang+6))/6-20
 integer :: compact,ia,ia1,ia2,ia3,ia4,ia5,ierr,iest,ig,ii,ilang,ilang2,ilmn
 integer :: iln,iln0,indx,iproj,ipsang,ipssoang,ishift,ispin,ispinor,ispinp
 integer :: istr,istr1,istr2,iterm,itypat,jj,jjk,jjs,jjs1,jjs2,jjs3,jjs4,jjstr
 integer :: lpsang,mincat,mproj,mu,mumax,n1,n2,n3,nincat,nlang
 integer :: nlangso,nproj,nspinso,old_paral_level,rank,shift1,shift2,shift3
 integer :: sign,spaceComm
 real(dp) :: e2nl,e2nldd,ph12i,ph12r,ph1i,ph1r,ph2i,ph2r,ph3i,ph3r
 character(len=500) :: message
!arrays
 integer,allocatable :: jproj(:)
 real(dp) :: amet(2,3,3,2,2),amet_lo(3,3),e2nl_tmp(6),eisnl(3),rank2(6)
 real(dp) :: rank2c(2,6),strsnl(6),strsso(6,3),strssoc(6),trace(2),tsec(2)
 real(dp),allocatable :: d2gxdis(:,:,:,:,:),d2gxdis_s(:,:,:,:)
 real(dp),allocatable :: d2gxds2(:,:,:,:,:),d2gxds2_s(:,:,:,:)
 real(dp),allocatable :: dgxdis(:,:,:,:,:),dgxdis_s(:,:,:,:)
 real(dp),allocatable :: dgxds_s(:,:,:,:)
 real(dp),allocatable :: dgxdt_s(:,:,:,:,:)
 real(dp),allocatable :: ekb_s(:,:),gxa(:,:,:,:)
 real(dp),allocatable :: gxa_s(:,:,:,:),gxafac(:,:,:,:),pauli(:,:,:,:)
 real(dp),allocatable :: temp(:,:),tmpfac(:,:),vectin_s(:,:),vectout_s(:,:)
 real(dp),allocatable :: wt(:,:)
 ! htor for opernl4a
 integer :: mblkpw, iffkg, ipw, ipw1, ipw2, iaph3d, nincpw, nffkg, ntens
 integer,allocatable :: parity(:)
 real(dp) :: ai, ar
 real(dp),allocatable :: ffkg(:,:),kpgx(:,:),scalars(:,:)
 integer :: gmin, gmax, gnpw, gmpw


! **********************************************************************
#if defined MPI
gmin=mpi_enreg%gmin
gmax=mpi_enreg%gmax
gnpw=gmax-gmin+1
gmpw=mpi_enreg%mgblk
#else
gmin=1
gmax=npwin
gnpw=gmax-gmin+1
gmpw=npwin
#endif


 ! Define dimension of work arrays.
 ! mincat=min(nloalg(4),maxval(nattyp))
 ! this wastes memory because we need space in all arrays to store the meximum
 ! amount of atoms of all tom types
 mincat=maxval(nattyp) ! htor set this to the maximum to erase a loop
 mproj=maxval(indlmn(3,:,:))
 allocate(temp(2,mlang4),tmpfac(2,mlang4))
 allocate(wt(mlang,mproj),jproj(mlang))
 allocate(ekb_s(mlang,mproj))

 ! Allocate array gxa (contains projected scalars).
 allocate(gxa(2,mlang3,mincat,mproj))
 allocate(gxafac(2,mlang3,mincat,mproj))
 gxa(:,:,:,:)=zero

 ! Zero out some arrays
 enlout(1)=0.0_dp
 vectout(:,:)=0.0_dp

 ! Big loop on atom types.
 ia1=1
 do itypat=1,ntypat
  ! Get atom loop indices for different types:
  ia2=ia1+nattyp(itypat)-1

  ekb_s(:,:)=zero
  wt(:,:)=zero
  iln0=0
  jproj(:)=0
  nlang=0
  do ilmn=1,lmnmax
   if(1/=indlmn(6,ilmn,itypat))cycle
   iln =indlmn(5,ilmn,itypat)
   if (iln>iln0) then
    iln0=iln
    ipsang=indlmn(1,ilmn,itypat)+1
    iproj=indlmn(3,ilmn,itypat)
    ekb_s(ipsang,iproj)=ekb(iln,itypat)
    wt(ipsang,iproj)=4._dp*pi/ucvol*dble(2*ipsang-1)*ekb_s(ipsang,iproj)
    jproj(ipsang)=max(jproj(ipsang),iproj)
    if(iproj>0) nlang=max(nlang,ipsang)
   end if
  end do ! ilmn


  ! If nlang is not 0, then some non-local part is to be applied for
  ! that type of atom.
  if (nlang/=0) then

   ! Only the first spinorial component of vectin is taken into account first
   !!!!!!!!!!!!!!!!!!!!!!!!! opernl4a
   !mblkpw=nloalg(3)
   mblkpw=gnpw

   ! Set up dimension of kpgx and allocate
   ! ntens sets the maximum number of independent tensor components
   ! over all allowed angular momenta; need 20 for spdf for tensors
   ! up to rank 3; to handle stress tensor, need up to rank 5
   ntens=1
   if(nlang>=2)ntens=4
   if(nlang>=3)ntens=10
   if(nlang>=4)ntens=20

   ! Set up second dimension of ffkg array, and allocate
   nffkg=0
   do ilang=1,nlang
    ! Get the number of projectors for that angular momentum
    nffkg=nffkg+jproj(ilang)*(ilang*(ilang+1))/2
   end do

   allocate(ffkg(nffkg,mblkpw),parity(nffkg))
   allocate(kpgx(mblkpw,ntens))
   allocate(scalars(2,nffkg))

   gxa(:,:,:,:)=0.0_dp

   ! this can be done for every block of the G's in parallel by setting ipw1 to the start band
   ! and nincpw to the block-length (currently: ipw=1, nincpw=npw)
   ! so this is ideal for parallelization over G's
   ipw1=gmin
   nincpw=gnpw

   ! Initialize kpgx and ffkg arrays related to tensors defined below
   call mkffkg3_htor(choice,ffkg,ffnlin,gmet,idir,indlmn,ipw1,1,itypat,&
   &  kgin,kpgin,kpgx,kptin,lmnmax,mblkpw,0,nffkg,dimffnlin,nincpw,nkpgin,nlang,nloalg,&
   &  npwin,ntens,ntypat,parity)


   ! loop on number of atoms
   do ia=1,nattyp(itypat)
    ! Compute the shift eventually needed to get the phases in ph3d
    iaph3d=ia+ia1-1

    scalars(:,:)=0.0_dp;

    ! ******* Entering the first time-consuming part of the routine *******
    ! htor's treatment - extremely slow ... :)
    ig=ipw1
    do ipw=1,nincpw
     ar=vectin(1,ipw)*ph3din(1,ig,iaph3d)-vectin(2,ipw)*ph3din(2,ig,iaph3d)
     ai=vectin(2,ipw)*ph3din(1,ig,iaph3d)+vectin(1,ipw)*ph3din(2,ig,iaph3d)
     ig=ig+1
     do iffkg=1,nffkg
      scalars(1,iffkg)=scalars(1,iffkg)+ar*ffkg(iffkg,ipw)
      scalars(2,iffkg)=scalars(2,iffkg)+ai*ffkg(iffkg,ipw)
     end do
    end do

    ! ******* Leaving the critical part *********************************

    iffkg=0
    ! angular momentum loop
    do ilang=1,nlang
     ! ilang2 is the number of independent tensor components
     ! for symmetric tensor of rank ilang-1
     ilang2=(ilang*(ilang+1))/2
     ! Loop over projectors
     do iproj=1,jproj(ilang)
      ! Multiply by the k+G factors (tensors of various rank)
      do ii=1,ilang2
       ! Get the starting address for the relevant tensor
       jj=ii+((ilang-1)*ilang*(ilang+1))/6
       iffkg=iffkg+1
       gxa(1,jj,ia,iproj)=gxa(1,jj,ia,iproj)+scalars(1,iffkg)
       gxa(2,jj,ia,iproj)=gxa(2,jj,ia,iproj)+scalars(2,iffkg)
      end do
     end do
    end do
   end do ! End loop on atoms

#if defined MPI && defined MPI_EXT
   call MPI_ALLREDUCE(MPI_IN_PLACE, gxa, 2*mlang3*mincat*mproj, MPI_DOUBLE_PRECISION, MPI_SUM, &
   &                  mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif

   !!!!!!!!!!!!!!!!!!!!!!!!! /opernl4a
   ! Perform contractions for the various tensors (d)gx?, producing the
   ! contracted tensors (d)gx?fac to be passed back to opernl:
   do ia=1,nattyp(itypat)
    do ilang=1,nlang
     nproj=jproj(ilang)
     if(nproj/=0) then
      ilang2=(ilang*(ilang+1))/2
      do iproj=1,nproj
       ! The rank of the tensor gxa equals l:
       rank=ilang-1
       ! jjs gives the starting address of the relevant components
       jjs=1+((ilang-1)*ilang*(ilang+1))/6
       if (ilang>4) then
         write(message, '(a,a,a,a,i8,a)' )ch10, ' nonlop: BUG -',ch10,&
         & '  ilang must fall in range [1..4] but value is ',ilang,'.'
         call wrtout(06,message,'PERS')
         call leave_new('PERS')
       end if

       ! Metric & spinorial contraction from gxa to gxafac. The treatment
       ! is different for the scalar-relativistic and spin-orbit parts.
       ! ------ Scalar-Relativistic ------
       temp(:,1:((rank+1)*(rank+2))/2)= gxa(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)
       call metcon(rank,gmet,temp,tmpfac)
       gxafac(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)= wt(ilang,iproj)*tmpfac(:,1:((rank+1)*(rank+2))/2)

       ! ---  Accumulate the nonlocal energy.
       do ii=1,ilang2
        jj=ii-1+jjs
        enlout(1)=enlout(1)+(gxafac(1,jj,ia,iproj)*gxa(1,jj,ia,iproj)+gxafac(2,jj,ia,iproj)*gxa(2,jj,ia,iproj) )
       end do

       ! End loop over iproj:
      end do
      ! End condition of existence of a reference state:
     end if
     ! End loop over ilang:
    end do
    ! End loop over ia:
   end do

   ! Operate with the non-local potential on the projected scalars,
   ! in order to get matrix element
   !!!!!!!!!!!!!!!!!!!!! opernl4b
   !Loop on subsets of plane waves (blocking)
   ipw1=gmin
   nincpw=gnpw

   ! Initialize kpgx array related to tensors defined below
   call mkffkg3_htor(choice,ffkg,ffnlout,gmet,idir,indlmn,ipw1,1,itypat,&
   &  kgout,kpgout,kpgx,kptout,lmnmax,mblkpw,0,nffkg,dimffnlout,nincpw,nkpgout,nlang,nloalg,&
   &  npwout,ntens,ntypat,parity)

   ! Application of non-local part from projected scalars
   ! back to reciprocal space ...
   ! [this section merely computes terms which add to <G|Vnl|C>;
   ! nothing here is needed when various gradients are being computed]

   ! Loop on atoms
   do ia=1,nattyp(itypat)

    ! Compute the shift needed to get the phases in ph3d
    iaph3d=ia+ia1-1

    ! Transfer gxa in scalars with different indexing
    iffkg=0
    do ilang=1,nlang
     nproj=jproj(ilang)
     if (nproj>0) then
      ilang2=(ilang*(ilang+1))/2
      do iproj=1,nproj
       do ii=1,ilang2
        jj=ii+((ilang-1)*ilang*(ilang+1))/6
        iffkg=iffkg+1
        scalars(1,iffkg)=gxafac(1,jj,ia,iproj)
        scalars(2,iffkg)=gxafac(2,jj,ia,iproj)
       end do
      end do
     end if
    end do

    ! ******* Entering the second critical part ****************************
    ! htor's processing
    ig=ipw1
    do ipw=1,nincpw
     ar=0 ; ai=0
     do iffkg=1,nffkg
      ar=ar+ffkg(iffkg,ipw)*scalars(1,iffkg)
      ai=ai+ffkg(iffkg,ipw)*scalars(2,iffkg)
     end do
     vectout(1,ipw)=vectout(1,ipw)+ar*ph3dout(1,ig,iaph3d)+ai*ph3dout(2,ig,iaph3d)
     vectout(2,ipw)=vectout(2,ipw)+ai*ph3dout(1,ig,iaph3d)-ar*ph3dout(2,ig,iaph3d)
     ig=ig+1
    end do
    ! ******* Leaving the critical part *********************************
    !  End loop on atoms
   end do

   deallocate(ffkg,kpgx,parity,scalars)
   !!!!!!!!!!!!!!!!!!!!! /opernl4b

   ! End condition of existence of a non-local part for that type of atom:
  end if

  !End atom type loop, over itypat:
  ia1=ia2+1
 end do

 ! De-allocate temporary space.
 deallocate(ekb_s,gxa,gxafac,wt,jproj,temp,tmpfac)

 if ((choice<1 .or. choice>6) .and. choice/=23 ) then
  write(message, '(a,a,a,a,i4,a)' )ch10, ' nonlop : BUG -',ch10,&
  & '  Does not support this choice=',choice,'.'
  call wrtout(06,message,'PERS')
  call leave_new('PERS')
 end if

end subroutine nonlop_htor
!!***
