|
Anasazi Version of the Day
|
00001 // @HEADER 00002 // *********************************************************************** 00003 // 00004 // Anasazi: Block Eigensolvers Package 00005 // Copyright (2010) Sandia Corporation 00006 // 00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive 00008 // license for use of this work by or on behalf of the U.S. Government. 00009 // 00010 // This library is free software; you can redistribute it and/or modify 00011 // it under the terms of the GNU Lesser General Public License as 00012 // published by the Free Software Foundation; either version 2.1 of the 00013 // License, or (at your option) any later version. 00014 // 00015 // This library is distributed in the hope that it will be useful, but 00016 // WITHOUT ANY WARRANTY; without even the implied warranty of 00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00018 // Lesser General Public License for more details. 00019 // 00020 // You should have received a copy of the GNU Lesser General Public 00021 // License along with this library; if not, write to the Free Software 00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 00023 // USA 00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 00025 // 00026 // *********************************************************************** 00027 // @HEADER 00028 00029 #ifndef __TSQR_Tsqr_SequentialCholeskyQR_hpp 00030 #define __TSQR_Tsqr_SequentialCholeskyQR_hpp 00031 00032 #include <Tsqr_MatView.hpp> 00033 #include <Tsqr_Blas.hpp> 00034 #include <Tsqr_Lapack.hpp> 00035 #include <Tsqr_CacheBlockingStrategy.hpp> 00036 #include <Tsqr_CacheBlocker.hpp> 00037 #include <Tsqr_ScalarTraits.hpp> 00038 #include <Tsqr_Util.hpp> 00039 00040 #include <string> 00041 #include <utility> 00042 #include <vector> 00043 00046 00047 namespace TSQR { 00048 00049 template< class LocalOrdinal, class Scalar > 00050 class SequentialCholeskyQR { 00051 private: 00052 typedef MatView< LocalOrdinal, Scalar > mat_view; 00053 typedef ConstMatView< LocalOrdinal, Scalar > const_mat_view; 00054 00055 public: 00056 typedef Scalar scalar_type; 00057 typedef LocalOrdinal ordinal_type; 00058 // Here, FactorOutput is just a minimal object whose value is 00059 // irrelevant, so that the static interface looks like that of 00060 // SequentialTSQR. 00061 typedef int FactorOutput; 00062 00064 size_t 00065 cache_block_size () const { return strategy_.cache_block_size(); } 00066 00073 SequentialCholeskyQR (const size_t cache_block_size = 0) : 00074 strategy_ (cache_block_size) 00075 {} 00076 00080 bool QR_produces_R_factor_with_nonnegative_diagonal () const { 00081 return true; 00082 } 00083 00088 FactorOutput 00089 factor (const LocalOrdinal nrows, 00090 const LocalOrdinal ncols, 00091 const Scalar A[], 00092 const LocalOrdinal lda, 00093 Scalar R[], 00094 const LocalOrdinal ldr, 00095 const bool contiguous_cache_blocks = false) 00096 { 00097 CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_); 00098 LAPACK< LocalOrdinal, Scalar > lapack; 00099 BLAS< LocalOrdinal, Scalar > blas; 00100 std::vector< Scalar > work (ncols); 00101 Matrix< LocalOrdinal, Scalar > ATA (ncols, ncols, Scalar(0)); 00102 FactorOutput retval (0); 00103 00104 if (contiguous_cache_blocks) 00105 { 00106 // Compute ATA := A^T * A, by iterating through the cache 00107 // blocks of A from top to bottom. 00108 // 00109 // We say "A_rest" because it points to the remaining part of 00110 // the matrix left to process; at the beginning, the "remaining" 00111 // part is the whole matrix, but that will change as the 00112 // algorithm progresses. 00113 mat_view A_rest (nrows, ncols, A, lda); 00114 // This call modifies A_rest (but not the actual matrix 00115 // entries; just the dimensions and current position). 00116 mat_view A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); 00117 // Process the first cache block: ATA := A_cur^T * A_cur 00118 blas.GEMM ("T", "N", ncols, ncols, A_cur.nrows(), 00119 Scalar(1), A_cur.get(), A_cur.lda(), A_cur.get(), A_cur.lda(), 00120 Scalar(0), ATA.get(), ATA.lda()); 00121 // Process the remaining cache blocks in order. 00122 while (! A_rest.empty()) 00123 { 00124 A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); 00125 // ATA := ATA + A_cur^T * A_cur 00126 blas.GEMM ("T", "N", ncols, ncols, A_cur.nrows(), 00127 Scalar(1), A_cur.get(), A_cur.lda(), A_cur.get(), A_cur.lda(), 00128 Scalar(1), ATA.get(), ATA.lda()); 00129 } 00130 } 00131 else 00132 // Compute ATA := A^T * A, using a single BLAS call. 00133 blas.GEMM ("T", "N", ncols, ncols, nrows, 00134 Scalar(1), A, lda, A, lda, 00135 Scalar(0), ATA.get(), ATA.lda()); 00136 00137 // Compute the Cholesky factorization of ATA in place, so that 00138 // A^T * A = R^T * R, where R is ncols by ncols upper 00139 // triangular. 00140 int info = 0; 00141 lapack.POTRF ("U", ncols, ATA.get(), ATA.lda(), &info); 00142 // FIXME (mfh 22 June 2010) The right thing to do here would be 00143 // to resort to a rank-revealing factorization, as Stathopoulos 00144 // and Wu (2002) do with their CholeskyQR + symmetric 00145 // eigensolver factorization. 00146 if (info != 0) 00147 throw std::runtime_error("Cholesky factorization failed"); 00148 00149 // Copy out the R factor 00150 fill_matrix (ncols, ncols, R, ldr, Scalar(0)); 00151 copy_upper_triangle (ncols, ncols, R, ldr, ATA.get(), ATA.lda()); 00152 00153 // Compute A := A * R^{-1}. We do this in place in A, using 00154 // BLAS' TRSM with the R factor (form POTRF) stored in the upper 00155 // triangle of ATA. 00156 { 00157 mat_view A_rest (nrows, ncols, A, lda); 00158 // This call modifies A_rest. 00159 mat_view A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); 00160 00161 // Compute A_cur / R (Matlab notation for A_cur * R^{-1}) in place. 00162 blas.TRSM ("R", "U", "N", "N", A_cur.nrows(), ncols, 00163 Scalar(1), ATA.get(), ATA.lda(), A_cur.get(), A_cur.lda()); 00164 00165 // Process the remaining cache blocks in order. 00166 while (! A_rest.empty()) 00167 { 00168 A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); 00169 blas.TRSM ("R", "U", "N", "N", A_cur.nrows(), ncols, 00170 Scalar(1), ATA.get(), ATA.lda(), A_cur.get(), A_cur.lda()); 00171 } 00172 } 00173 00174 return retval; 00175 } 00176 00177 00180 void 00181 explicit_Q (const LocalOrdinal nrows, 00182 const LocalOrdinal ncols_Q, 00183 const Scalar Q[], 00184 const LocalOrdinal ldq, 00185 const FactorOutput& factor_output, 00186 const LocalOrdinal ncols_C, 00187 Scalar C[], 00188 const LocalOrdinal ldc, 00189 const bool contiguous_cache_blocks = false) 00190 { 00191 if (ncols_Q != ncols_C) 00192 throw std::logic_error("SequentialCholeskyQR::explicit_Q() " 00193 "does not work if ncols_C != ncols_Q"); 00194 const LocalOrdinal ncols = ncols_Q; 00195 00196 if (contiguous_cache_blocks) 00197 { 00198 CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_); 00199 mat_view C_rest (nrows, ncols, C, ldc); 00200 const_mat_view Q_rest (nrows, ncols, Q, ldq); 00201 00202 mat_view C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks); 00203 const_mat_view Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks); 00204 00205 while (! C_rest.empty()) 00206 Q_cur.copy (C_cur); 00207 } 00208 else 00209 { 00210 mat_view C_view (nrows, ncols, C, ldc); 00211 C_view.copy (const_mat_view (nrows, ncols, Q, ldq)); 00212 } 00213 } 00214 00215 00217 void 00218 cache_block (const LocalOrdinal nrows, 00219 const LocalOrdinal ncols, 00220 Scalar A_out[], 00221 const Scalar A_in[], 00222 const LocalOrdinal lda_in) const 00223 { 00224 CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_); 00225 blocker.cache_block (nrows, ncols, A_out, A_in, lda_in); 00226 } 00227 00228 00230 void 00231 un_cache_block (const LocalOrdinal nrows, 00232 const LocalOrdinal ncols, 00233 Scalar A_out[], 00234 const LocalOrdinal lda_out, 00235 const Scalar A_in[]) const 00236 { 00237 CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_); 00238 blocker.un_cache_block (nrows, ncols, A_out, lda_out, A_in); 00239 } 00240 00242 void 00243 fill_with_zeros (const LocalOrdinal nrows, 00244 const LocalOrdinal ncols, 00245 Scalar A[], 00246 const LocalOrdinal lda, 00247 const bool contiguous_cache_blocks = false) 00248 { 00249 CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_); 00250 blocker.fill_with_zeros (nrows, ncols, A, lda, contiguous_cache_blocks); 00251 } 00252 00260 template< class MatrixViewType > 00261 MatrixViewType 00262 top_block (const MatrixViewType& C, 00263 const bool contiguous_cache_blocks = false) const 00264 { 00265 // The CacheBlocker object knows how to construct a view of the 00266 // top cache block of C. This is complicated because cache 00267 // blocks (in C) may or may not be stored contiguously. If they 00268 // are stored contiguously, the CacheBlocker knows the right 00269 // layout, based on the cache blocking strategy. 00270 CacheBlocker< LocalOrdinal, Scalar > blocker (C.nrows(), C.ncols(), strategy_); 00271 00272 // C_top_block is a view of the topmost cache block of C. 00273 // C_top_block should have >= ncols rows, otherwise either cache 00274 // blocking is broken or the input matrix C itself had fewer 00275 // rows than columns. 00276 MatrixViewType C_top_block = blocker.top_block (C, contiguous_cache_blocks); 00277 if (C_top_block.nrows() < C_top_block.ncols()) 00278 throw std::logic_error ("C\'s topmost cache block has fewer rows than " 00279 "columns"); 00280 return C_top_block; 00281 } 00282 00283 private: 00284 CacheBlockingStrategy< LocalOrdinal, Scalar > strategy_; 00285 }; 00286 00287 } // namespace TSQR 00288 00289 #endif // __TSQR_Tsqr_SequentialCholeskyQR_hpp
1.7.4