! rbmatmod -- Copyright 2000 Paolo Luchini
! http://CPLcode.net/Applications/Numerical/rbmatmod/
!
! parallel implementation of banded LU decomposition
! performs the solution of a banded linear system by the same algorithm used
! in rbmat.cpl. Each node in sequence solves a portion of the system and passes
! boundary data to the next node.

!( Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE. !)


SUBROUTINE LUdecompStep(POINTER TO ARRAY(*,-2..2) OF REAL A)
  IF last THEN A(HI)=0; A(HI-1)=0; A(HI-2,1..2)=0; A(HI-3,2)=0 ELSE
    READ BINARY FROM next A(HI-1),A(HI)
  END IF
  REAL piv
  LOOP FOR i=A.HI1-A.HI2 DOWN TO A.LO1
    LOOP FOR k=A.HI2 DOWN TO 1
      POINTER INTO A(i,*+k),A(i+k,*) j=0
      piv=A(i,j+k)
      DO DEC j; A(i,j+k)=~-piv*A(i+k,j) FOR -A.LO2 TIMES
    REPEAT LOOP
    piv=1/A(i,0); A(i,0)=piv
    LOOP FOR j INTO A(i,A.LO2..-1): A(i,j)=~*piv
  REPEAT LOOP
  IF first THEN A(LO,-2..-1)=0; A(LO+1,-2)=0 ELSE
    WRITE BINARY TO prev A(LO),A(LO+1)
  END IF
END LUdecompStep

SUBROUTINE LeftLUDivStep1(POINTER TO ARRAY(*) OF REAL x; ARRAY(*,-2..2) OF REAL A; ARRAY(*) OF REAL t)
  IF NOT last THEN READ BINARY FROM next x(HI-1),x(HI)
  LOOP FOR i=A.HI1-A.HI2 DOWN TO A.LO1
    POINTER INTO A(i,*),x(i+*) j=A.HI2
    REAL sum=t(i)
    LOOP FOR A.HI2 TIMES: sum=~-A(i,j)*x(i+j); DEC j
    x(i+j)=sum*A(i,j)
  REPEAT LOOP
  IF NOT first THEN WRITE BINARY TO prev x(LO+2),x(LO+3)
END LeftLUDivStep1

INLINE SUBROUTINE FlushStep1 = IF NOT first THEN FLUSH prev

SUBROUTINE LeftLUDivStep2(POINTER TO ARRAY(*) OF REAL x; ARRAY(*,-2..2) OF REAL A)
  IF NOT first THEN READ BINARY FROM prev x(LO),x(LO+1)
  LOOP FOR i=A.LO1 TO A.HI1
    POINTER INTO A(i,*),x(i+*) j=A.LO2
    REAL sum=0
    LOOP FOR -A.LO2 TIMES: sum=~+A(i,j)*x(i+j); INC j
    x(i+j)=~-sum
  REPEAT LOOP
  IF NOT last THEN WRITE BINARY TO next x(HI-3),x(HI-2)
END LeftLUDivStep2

INLINE SUBROUTINE FlushStep2 = IF NOT last THEN FLUSH next

SUBROUTINE RightLUDivStep1(ARRAY(*) OF REAL x^,t; ARRAY(*,-2..2) OF REAL A)
  IF NOT last THEN READ BINARY FROM next x(HI-1),x(HI)
  LOOP FOR i=A.HI1-A.HI2 DOWN TO A.LO1
    REAL sum=t(i)
    DO sum=~-A(i-j,j)*x(i-j) FOR j=A.LO2 TO -1
    x(i)=sum
  REPEAT LOOP
  IF NOT first THEN WRITE BINARY TO prev x(LO+2),x(LO+3)
END RightLUDivStep1

SUBROUTINE RightLUDivStep2(ARRAY(*) OF REAL x^; ARRAY(*,-2..2) OF REAL A)
  IF NOT first THEN READ BINARY FROM prev x(LO),x(LO+1),x(LO+2),x(LO+3)
  LOOP FOR i=A.LO1 TO A.HI1-A.HI2
    POINTER INTO A(i,*),x(i+*) j=0
    p=x(i+j)*A(i,0)
    x(i+j)=p
    DO INC j; x(i+j)=~-A(i,j)*p FOR A.HI2 TIMES
  REPEAT LOOP
  IF NOT last THEN
    WRITE BINARY TO next x(HI-3),x(HI-2),x(HI-1),x(HI)
    x(HI-1)=~*A(HI-1,0)
    x(HI)=(~-A(HI-1,1)*x(HI-1))*A(HI,0)
  END IF
END RightLUDivStep2