Amesos Package Browser (Single Doxygen Collection) Development
amesos_klu_tsolve.c
Go to the documentation of this file.
```00001 /* ========================================================================== */
00002 /* === KLU_tsolve =========================================================== */
00003 /* ========================================================================== */
00004
00005 /* Solve A'x=b using the symbolic and numeric objects from KLU_analyze
00006  * (or KLU_analyze_given) and KLU_factor.  Note that no iterative refinement is
00007  * performed.  Uses Numeric->Xwork as workspace (undefined on input and output),
00008  * of size 4n Entry's (note that columns 2 to 4 of Xwork overlap with
00009  * Numeric->Iwork).
00010  */
00011
00012 #include "amesos_klu_internal.h"
00013
00014 Int KLU_tsolve
00015 (
00016     /* inputs, not modified */
00017     KLU_symbolic *Symbolic,
00018     KLU_numeric *Numeric,
00019     Int d,        /* leading dimension of B */
00020     Int nrhs,       /* number of right-hand-sides */
00021
00022     /* right-hand-side on input, overwritten with solution to Ax=b on output */
00023     double B [ ],     /* size n*nrhs, in column-oriented form, with
00024            * leading dimension d. */
00025 #ifdef COMPLEX
00026     Int conj_solve,     /* TRUE for conjugate transpose solve, FALSE for
00027            * array transpose solve.  Used for the complex
00028            * case only. */
00029 #endif
00030     /* --------------- */
00031     KLU_common *Common
00032 )
00033 {
00034     Entry x [4], offik, s ;
00035     double rs, *Rs ;
00036     Entry *Offx, *X, *Bz, *Udiag ;
00037     Int *Q, *R, *Pnum, *Offp, *Offi, *Lip, *Uip, *Llen, *Ulen ;
00038     Unit **LUbx ;
00039     Int k1, k2, nk, k, block, pend, n, p, nblocks, chunk, nr, i ;
00040 #ifdef KLU_ENABLE_OPENMP
00041     Entry *X1 ;
00042     int tid ;
00043 #endif
00044
00045     /* ---------------------------------------------------------------------- */
00046     /* check inputs */
00047     /* ---------------------------------------------------------------------- */
00048
00049     if (Common == NULL)
00050     {
00051   return (FALSE) ;
00052     }
00053     if (Numeric == NULL || Symbolic == NULL || d < Symbolic->n || nrhs < 0 ||
00054   B == NULL)
00055     {
00056   Common->status = KLU_INVALID ;
00057   return (FALSE) ;
00058     }
00059     Common->status = KLU_OK ;
00060
00061     /* ---------------------------------------------------------------------- */
00062     /* get the contents of the Symbolic object */
00063     /* ---------------------------------------------------------------------- */
00064
00065     Bz = (Entry *) B ;
00066     n = Symbolic->n ;
00067     nblocks = Symbolic->nblocks ;
00068     Q = Symbolic->Q ;
00069     R = Symbolic->R ;
00070
00071     /* ---------------------------------------------------------------------- */
00072     /* get the contents of the Numeric object */
00073     /* ---------------------------------------------------------------------- */
00074
00075     ASSERT (nblocks == Numeric->nblocks) ;
00076     Pnum = Numeric->Pnum ;
00077     Offp = Numeric->Offp ;
00078     Offi = Numeric->Offi ;
00079     Offx = (Entry *) Numeric->Offx ;
00080
00081     Lip  = Numeric->Lip ;
00082     Llen = Numeric->Llen ;
00083     Uip  = Numeric->Uip ;
00084     Ulen = Numeric->Ulen ;
00085     LUbx = (Unit **) Numeric->LUbx ;
00086     Udiag = Numeric->Udiag ;
00087
00088     Rs = Numeric->Rs ;
00089 #ifdef KLU_ENABLE_OPENMP
00090     X1 = (Entry *) Numeric->Xwork ;
00091 #else
00092     X = (Entry *) Numeric->Xwork ;
00093 #endif
00094     ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
00095
00096     /* ---------------------------------------------------------------------- */
00097     /* solve in chunks of 4 columns at a time */
00098     /* ---------------------------------------------------------------------- */
00099
00100 #ifdef KLU_ENABLE_OPENMP
00101 #pragma omp parallel for schedule(guided) private(tid, nr, k, block, k1, k2, nk, pend, p, s, i, Bz, X, x, offik, rs)
00102 #endif
00103     for (chunk = 0 ; chunk < nrhs ; chunk += 4)
00104     {
00105 #ifdef KLU_ENABLE_OPENMP
00106         Bz  = ((Entry *) B) + d*chunk ;
00108         X = X1 + (tid * n * 4);
00109 #endif
00110
00111
00112   /* ------------------------------------------------------------------ */
00113   /* get the size of the current chunk */
00114   /* ------------------------------------------------------------------ */
00115
00116   nr = MIN (nrhs - chunk, 4) ;
00117
00118   /* ------------------------------------------------------------------ */
00119   /* permute the right hand side, X = Q'*B */
00120   /* ------------------------------------------------------------------ */
00121
00122   switch (nr)
00123   {
00124
00125       case 1:
00126
00127     for (k = 0 ; k < n ; k++)
00128     {
00129         X [k] = Bz  [Q [k]] ;
00130     }
00131     break ;
00132
00133       case 2:
00134
00135     for (k = 0 ; k < n ; k++)
00136     {
00137         i = Q [k] ;
00138         X [2*k    ] = Bz [i      ] ;
00139         X [2*k + 1] = Bz [i + d  ] ;
00140     }
00141     break ;
00142
00143       case 3:
00144
00145     for (k = 0 ; k < n ; k++)
00146     {
00147         i = Q [k] ;
00148         X [3*k    ] = Bz [i      ] ;
00149         X [3*k + 1] = Bz [i + d  ] ;
00150         X [3*k + 2] = Bz [i + d*2] ;
00151     }
00152     break ;
00153
00154       case 4:
00155
00156     for (k = 0 ; k < n ; k++)
00157     {
00158         i = Q [k] ;
00159         X [4*k    ] = Bz [i      ] ;
00160         X [4*k + 1] = Bz [i + d  ] ;
00161         X [4*k + 2] = Bz [i + d*2] ;
00162         X [4*k + 3] = Bz [i + d*3] ;
00163     }
00164     break ;
00165
00166   }
00167
00168   /* ------------------------------------------------------------------ */
00169   /* solve X = (L*U + Off)'\X */
00170   /* ------------------------------------------------------------------ */
00171
00172   for (block = 0 ; block < nblocks ; block++)
00173   {
00174
00175       /* -------------------------------------------------------------- */
00176       /* the block of size nk is from rows/columns k1 to k2-1 */
00177       /* -------------------------------------------------------------- */
00178
00179       k1 = R [block] ;
00180       k2 = R [block+1] ;
00181       nk = k2 - k1 ;
00182       PRINTF (("tsolve %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
00183
00184       /* -------------------------------------------------------------- */
00185       /* block back-substitution for the off-diagonal-block entries */
00186       /* -------------------------------------------------------------- */
00187
00188       if (block > 0)
00189       {
00190     switch (nr)
00191         {
00192
00193         case 1:
00194
00195       for (k = k1 ; k < k2 ; k++)
00196       {
00197           pend = Offp [k+1] ;
00198           for (p = Offp [k] ; p < pend ; p++)
00199           {
00200 #ifdef COMPLEX
00201         if (conj_solve)
00202         {
00203             MULT_SUB_CONJ (X [k], X [Offi [p]],
00204               Offx [p]) ;
00205         }
00206         else
00207 #endif
00208         {
00209             MULT_SUB (X [k], Offx [p], X [Offi [p]]) ;
00210         }
00211           }
00212       }
00213       break ;
00214
00215         case 2:
00216
00217       for (k = k1 ; k < k2 ; k++)
00218       {
00219           pend = Offp [k+1] ;
00220           x [0] = X [2*k    ] ;
00221           x [1] = X [2*k + 1] ;
00222           for (p = Offp [k] ; p < pend ; p++)
00223           {
00224         i = Offi [p] ;
00225 #ifdef COMPLEX
00226         if (conj_solve)
00227         {
00228             CONJ (offik, Offx [p]) ;
00229         }
00230         else
00231 #endif
00232         {
00233             offik = Offx [p] ;
00234         }
00235         MULT_SUB (x [0], offik, X [2*i]) ;
00236         MULT_SUB (x [1], offik, X [2*i + 1]) ;
00237           }
00238           X [2*k    ] = x [0] ;
00239           X [2*k + 1] = x [1] ;
00240       }
00241       break ;
00242
00243         case 3:
00244
00245       for (k = k1 ; k < k2 ; k++)
00246       {
00247           pend = Offp [k+1] ;
00248           x [0] = X [3*k    ] ;
00249           x [1] = X [3*k + 1] ;
00250           x [2] = X [3*k + 2] ;
00251           for (p = Offp [k] ; p < pend ; p++)
00252           {
00253         i = Offi [p] ;
00254 #ifdef COMPLEX
00255         if (conj_solve)
00256         {
00257             CONJ (offik, Offx [p]) ;
00258         }
00259         else
00260 #endif
00261         {
00262             offik = Offx [p] ;
00263         }
00264         MULT_SUB (x [0], offik, X [3*i]) ;
00265         MULT_SUB (x [1], offik, X [3*i + 1]) ;
00266         MULT_SUB (x [2], offik, X [3*i + 2]) ;
00267           }
00268           X [3*k    ] = x [0] ;
00269           X [3*k + 1] = x [1] ;
00270           X [3*k + 2] = x [2] ;
00271       }
00272       break ;
00273
00274         case 4:
00275
00276       for (k = k1 ; k < k2 ; k++)
00277       {
00278           pend = Offp [k+1] ;
00279           x [0] = X [4*k    ] ;
00280           x [1] = X [4*k + 1] ;
00281           x [2] = X [4*k + 2] ;
00282           x [3] = X [4*k + 3] ;
00283           for (p = Offp [k] ; p < pend ; p++)
00284           {
00285         i = Offi [p] ;
00286 #ifdef COMPLEX
00287         if (conj_solve)
00288         {
00289             CONJ(offik, Offx [p]) ;
00290         }
00291         else
00292 #endif
00293         {
00294             offik = Offx [p] ;
00295         }
00296         MULT_SUB (x [0], offik, X [4*i]) ;
00297         MULT_SUB (x [1], offik, X [4*i + 1]) ;
00298         MULT_SUB (x [2], offik, X [4*i + 2]) ;
00299         MULT_SUB (x [3], offik, X [4*i + 3]) ;
00300           }
00301           X [4*k    ] = x [0] ;
00302           X [4*k + 1] = x [1] ;
00303           X [4*k + 2] = x [2] ;
00304           X [4*k + 3] = x [3] ;
00305       }
00306       break ;
00307         }
00308       }
00309
00310       /* -------------------------------------------------------------- */
00311       /* solve the block system */
00312       /* -------------------------------------------------------------- */
00313
00314       if (nk == 1)
00315       {
00316 #ifdef COMPLEX
00317     if (conj_solve)
00318     {
00319         CONJ (s, Udiag [k1]) ;
00320     }
00321     else
00322 #endif
00323     {
00324         s = Udiag [k1] ;
00325     }
00326     switch (nr)
00327     {
00328
00329         case 1:
00330       DIV (X [k1], X [k1], s) ;
00331       break ;
00332
00333         case 2:
00334       DIV (X [2*k1], X [2*k1], s) ;
00335       DIV (X [2*k1 + 1], X [2*k1 + 1], s) ;
00336       break ;
00337
00338         case 3:
00339       DIV (X [3*k1], X [3*k1], s) ;
00340       DIV (X [3*k1 + 1], X [3*k1 + 1], s) ;
00341       DIV (X [3*k1 + 2], X [3*k1 + 2], s) ;
00342       break ;
00343
00344         case 4:
00345       DIV (X [4*k1], X [4*k1], s) ;
00346       DIV (X [4*k1 + 1], X [4*k1 + 1], s) ;
00347       DIV (X [4*k1 + 2], X [4*k1 + 2], s) ;
00348       DIV (X [4*k1 + 3], X [4*k1 + 3], s) ;
00349       break ;
00350
00351     }
00352       }
00353       else
00354       {
00355     KLU_utsolve (nk, Uip + k1, Ulen + k1, LUbx [block],
00356       Udiag + k1, nr,
00357 #ifdef COMPLEX
00358       conj_solve,
00359 #endif
00360       X + nr*k1) ;
00361     KLU_ltsolve (nk, Lip + k1, Llen + k1, LUbx [block], nr,
00362 #ifdef COMPLEX
00363       conj_solve,
00364 #endif
00365       X + nr*k1) ;
00366       }
00367   }
00368
00369   /* ------------------------------------------------------------------ */
00370   /* scale and permute the result, Bz  = P'(R\X) */
00371   /* ------------------------------------------------------------------ */
00372
00373   if (Rs == NULL)
00374   {
00375
00376       /* no scaling */
00377       switch (nr)
00378       {
00379
00380     case 1:
00381
00382         for (k = 0 ; k < n ; k++)
00383         {
00384       Bz  [Pnum [k]] = X [k] ;
00385         }
00386         break ;
00387
00388     case 2:
00389
00390         for (k = 0 ; k < n ; k++)
00391         {
00392       i = Pnum [k] ;
00393       Bz  [i      ] = X [2*k    ] ;
00394       Bz  [i + d  ] = X [2*k + 1] ;
00395         }
00396         break ;
00397
00398     case 3:
00399
00400         for (k = 0 ; k < n ; k++)
00401         {
00402       i = Pnum [k] ;
00403       Bz  [i      ] = X [3*k    ] ;
00404       Bz  [i + d  ] = X [3*k + 1] ;
00405       Bz  [i + d*2] = X [3*k + 2] ;
00406         }
00407         break ;
00408
00409     case 4:
00410
00411         for (k = 0 ; k < n ; k++)
00412         {
00413       i = Pnum [k] ;
00414       Bz  [i      ] = X [4*k    ] ;
00415       Bz  [i + d  ] = X [4*k + 1] ;
00416       Bz  [i + d*2] = X [4*k + 2] ;
00417       Bz  [i + d*3] = X [4*k + 3] ;
00418         }
00419         break ;
00420       }
00421
00422   }
00423   else
00424   {
00425
00426       switch (nr)
00427       {
00428
00429     case 1:
00430
00431         for (k = 0 ; k < n ; k++)
00432         {
00433       SCALE_DIV_ASSIGN (Bz [Pnum [k]], X [k], Rs [k]) ;
00434         }
00435         break ;
00436
00437     case 2:
00438
00439         for (k = 0 ; k < n ; k++)
00440         {
00441       i = Pnum [k] ;
00442       rs = Rs [k] ;
00443       SCALE_DIV_ASSIGN (Bz [i], X [2*k], rs) ;
00444       SCALE_DIV_ASSIGN (Bz [i + d], X [2*k + 1], rs) ;
00445         }
00446         break ;
00447
00448     case 3:
00449
00450         for (k = 0 ; k < n ; k++)
00451         {
00452       i = Pnum [k] ;
00453       rs = Rs [k] ;
00454       SCALE_DIV_ASSIGN (Bz [i], X [3*k], rs) ;
00455       SCALE_DIV_ASSIGN (Bz [i + d], X [3*k + 1], rs) ;
00456       SCALE_DIV_ASSIGN (Bz [i + d*2], X [3*k + 2], rs) ;
00457         }
00458         break ;
00459
00460     case 4:
00461
00462         for (k = 0 ; k < n ; k++)
00463         {
00464       i = Pnum [k] ;
00465       rs = Rs [k] ;
00466       SCALE_DIV_ASSIGN (Bz [i], X [4*k], rs) ;
00467       SCALE_DIV_ASSIGN (Bz [i + d], X [4*k + 1], rs) ;
00468       SCALE_DIV_ASSIGN (Bz [i + d*2], X [4*k + 2], rs) ;
00469       SCALE_DIV_ASSIGN (Bz [i + d*3], X [4*k + 3], rs) ;
00470         }
00471         break ;
00472       }
00473   }
00474
00475   /* ------------------------------------------------------------------ */
00476   /* go to the next chunk of B */
00477   /* ------------------------------------------------------------------ */
00478
00479 #ifndef KLU_ENABLE_OPENMP
00480   Bz  += d*4 ;
00481 #endif
00482     }
00483     return (TRUE) ;
00484 }
```