|
Anasazi Version of the Day
|
00001 // @HEADER 00002 // *********************************************************************** 00003 // 00004 // Anasazi: Block Eigensolvers Package 00005 // Copyright (2010) Sandia Corporation 00006 // 00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive 00008 // license for use of this work by or on behalf of the U.S. Government. 00009 // 00010 // This library is free software; you can redistribute it and/or modify 00011 // it under the terms of the GNU Lesser General Public License as 00012 // published by the Free Software Foundation; either version 2.1 of the 00013 // License, or (at your option) any later version. 00014 // 00015 // This library is distributed in the hope that it will be useful, but 00016 // WITHOUT ANY WARRANTY; without even the implied warranty of 00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00018 // Lesser General Public License for more details. 00019 // 00020 // You should have received a copy of the GNU Lesser General Public 00021 // License along with this library; if not, write to the Free Software 00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 00023 // USA 00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 00025 // 00026 // *********************************************************************** 00027 // @HEADER 00028 00029 #ifndef __TSQR_Tsqr_hpp 00030 #define __TSQR_Tsqr_hpp 00031 00032 #include <Tsqr_ApplyType.hpp> 00033 #include <Tsqr_Matrix.hpp> 00034 #include <Tsqr_MessengerBase.hpp> 00035 #include <Tsqr_DistTsqr.hpp> 00036 #include <Tsqr_SequentialTsqr.hpp> 00037 #include <Tsqr_ScalarTraits.hpp> 00038 #include <Tsqr_Util.hpp> 00039 00042 00043 namespace TSQR { 00044 00077 template< class LocalOrdinal, 00078 class Scalar, 00079 class NodeTsqrType = SequentialTsqr< LocalOrdinal, Scalar >, 00080 class DistTsqrType = DistTsqr< LocalOrdinal, Scalar > > 00081 class Tsqr { 00082 public: 00083 typedef MatView< LocalOrdinal, Scalar > matview_type; 00084 typedef ConstMatView< LocalOrdinal, Scalar > const_matview_type; 00085 typedef Matrix< LocalOrdinal, Scalar > matrix_type; 00086 00087 typedef Scalar scalar_type; 00088 typedef LocalOrdinal ordinal_type; 00089 typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type; 00090 00091 typedef NodeTsqrType node_tsqr_type; 00092 typedef DistTsqrType dist_tsqr_type; 00093 typedef typename Teuchos::RCP< node_tsqr_type > node_tsqr_ptr; 00094 typedef typename Teuchos::RCP< dist_tsqr_type > dist_tsqr_ptr; 00095 typedef typename dist_tsqr_type::rank_type rank_type; 00096 00097 typedef typename node_tsqr_type::FactorOutput NodeOutput; 00098 typedef typename dist_tsqr_type::FactorOutput DistOutput; 00099 typedef std::pair< NodeOutput, DistOutput > FactorOutput; 00100 00108 Tsqr (const node_tsqr_ptr& nodeTsqr, const dist_tsqr_ptr& distTsqr) : 00109 nodeTsqr_ (nodeTsqr), distTsqr_ (distTsqr) 00110 {} 00111 00120 size_t cache_block_size() const { return nodeTsqr_->cache_block_size(); } 00121 00128 bool QR_produces_R_factor_with_nonnegative_diagonal () const { 00129 return nodeTsqr_->QR_produces_R_factor_with_nonnegative_diagonal() && 00130 distTsqr_->QR_produces_R_factor_with_nonnegative_diagonal(); 00131 } 00132 00133 00134 void 00135 factorExplicit (matview_type& A, 00136 matview_type& Q, 00137 matview_type& R, 00138 const bool contiguousCacheBlocks = false) 00139 { 00140 // Sanity checks for matrix dimensions 00141 if (A.nrows() < A.ncols()) 00142 throw std::logic_error ("A must have at least as many rows as columns"); 00143 else if (A.nrows() != Q.nrows()) 00144 throw std::logic_error ("A and Q must have same number of rows"); 00145 else if (R.nrows() < R.ncols()) 00146 throw std::logic_error ("R must have at least as many rows as columns"); 00147 else if (A.ncols() != R.ncols()) 00148 throw std::logic_error ("A and R must have the same number of columns"); 00149 00150 // Checks for quick exit, based on matrix dimensions 00151 if (Q.ncols() == ordinal_type(0)) 00152 return; 00153 00154 R.fill (scalar_type (0)); 00155 NodeOutput nodeResults = 00156 nodeTsqr_->factor (A.nrows(), A.ncols(), A.get(), A.lda(), 00157 R.get(), R.lda(), contiguousCacheBlocks); 00158 nodeTsqr_->fill_with_zeros (Q.nrows(), Q.ncols(), Q.get(), Q.lda(), 00159 contiguousCacheBlocks); 00160 matview_type Q_top_block = 00161 nodeTsqr_->top_block (Q, contiguousCacheBlocks); 00162 if (Q_top_block.nrows() < R.ncols()) 00163 throw std::logic_error("Q has too few rows"); 00164 matview_type Q_top (R.ncols(), Q.ncols(), 00165 Q_top_block.get(), Q_top_block.lda()); 00166 distTsqr_->factorExplicit (R, Q_top); 00167 nodeTsqr_->apply (ApplyType::NoTranspose, 00168 A.nrows(), A.ncols(), A.get(), A.lda(), nodeResults, 00169 Q.ncols(), Q.get(), Q.lda(), contiguousCacheBlocks); 00170 } 00171 00172 00216 FactorOutput 00217 factor (const LocalOrdinal nrows_local, 00218 const LocalOrdinal ncols, 00219 Scalar A_local[], 00220 const LocalOrdinal lda_local, 00221 Scalar R[], 00222 const LocalOrdinal ldr, 00223 const bool contiguousCacheBlocks = false) 00224 { 00225 MatView< LocalOrdinal, Scalar > R_view (ncols, ncols, R, ldr); 00226 R_view.fill (Scalar(0)); 00227 NodeOutput nodeResults = 00228 nodeTsqr_->factor (nrows_local, ncols, A_local, lda_local, 00229 R_view.get(), R_view.lda(), 00230 contiguousCacheBlocks); 00231 DistOutput distResults = distTsqr_->factor (R_view); 00232 return std::make_pair (nodeResults, distResults); 00233 } 00234 00273 void 00274 apply (const std::string& op, 00275 const LocalOrdinal nrows_local, 00276 const LocalOrdinal ncols_Q, 00277 const Scalar Q_local[], 00278 const LocalOrdinal ldq_local, 00279 const FactorOutput& factor_output, 00280 const LocalOrdinal ncols_C, 00281 Scalar C_local[], 00282 const LocalOrdinal ldc_local, 00283 const bool contiguousCacheBlocks = false) 00284 { 00285 ApplyType applyType (op); 00286 00287 // This determines the order in which we apply the intranode 00288 // part of the Q factor vs. the internode part of the Q factor. 00289 const bool transposed = applyType.transposed(); 00290 00291 // View of this node's local part of the matrix C. 00292 matview_type C_view (nrows_local, ncols_C, C_local, ldc_local); 00293 00294 // Identify top ncols_C by ncols_C block of C_local. 00295 // top_block() will set C_top_view to have the correct leading 00296 // dimension, whether or not cache blocks are stored 00297 // contiguously. 00298 // 00299 // C_top_view is the topmost cache block of C_local. It has at 00300 // least as many rows as columns, but it likely has more rows 00301 // than columns. 00302 matview_type C_view_top_block = 00303 nodeTsqr_->top_block (C_view, contiguousCacheBlocks); 00304 00305 // View of the topmost ncols_C by ncols_C block of C. 00306 matview_type C_top_view (ncols_C, ncols_C, C_view_top_block.get(), 00307 C_view_top_block.lda()); 00308 00309 if (! transposed) 00310 { 00311 // C_top (small compact storage) gets a deep copy of the top 00312 // ncols_C by ncols_C block of C_local. 00313 matrix_type C_top (C_top_view); 00314 00315 // Compute in place on all processors' C_top blocks. 00316 distTsqr_->apply (applyType, C_top.ncols(), ncols_Q, C_top.get(), 00317 C_top.lda(), factor_output.second); 00318 00319 // Copy the result from C_top back into the top ncols_C by 00320 // ncols_C block of C_local. 00321 C_top_view.copy (C_top); 00322 00323 // Apply the local Q factor (in Q_local and 00324 // factor_output.first) to C_local. 00325 nodeTsqr_->apply (applyType, nrows_local, ncols_Q, 00326 Q_local, ldq_local, factor_output.first, 00327 ncols_C, C_local, ldc_local, 00328 contiguousCacheBlocks); 00329 } 00330 else 00331 { 00332 // Apply the (transpose of the) local Q factor (in Q_local 00333 // and factor_output.first) to C_local. 00334 nodeTsqr_->apply (applyType, nrows_local, ncols_Q, 00335 Q_local, ldq_local, factor_output.first, 00336 ncols_C, C_local, ldc_local, 00337 contiguousCacheBlocks); 00338 00339 // C_top (small compact storage) gets a deep copy of the top 00340 // ncols_C by ncols_C block of C_local. 00341 matrix_type C_top (C_top_view); 00342 00343 // Compute in place on all processors' C_top blocks. 00344 distTsqr_->apply (applyType, ncols_C, ncols_Q, C_top.get(), 00345 C_top.lda(), factor_output.second); 00346 00347 // Copy the result from C_top back into the top ncols_C by 00348 // ncols_C block of C_local. 00349 C_top_view.copy (C_top); 00350 } 00351 } 00352 00386 void 00387 explicit_Q (const LocalOrdinal nrows_local, 00388 const LocalOrdinal ncols_Q_in, 00389 const Scalar Q_local_in[], 00390 const LocalOrdinal ldq_local_in, 00391 const FactorOutput& factorOutput, 00392 const LocalOrdinal ncols_Q_out, 00393 Scalar Q_local_out[], 00394 const LocalOrdinal ldq_local_out, 00395 const bool contiguousCacheBlocks = false) 00396 { 00397 nodeTsqr_->fill_with_zeros (nrows_local, ncols_Q_out, Q_local_out, 00398 ldq_local_out, contiguousCacheBlocks); 00399 const rank_type myRank = distTsqr_->rank(); 00400 if (myRank == 0) 00401 { 00402 // View of this node's local part of the matrix Q_out. 00403 matview_type Q_out_view (nrows_local, ncols_Q_out, 00404 Q_local_out, ldq_local_out); 00405 00406 // View of the topmost cache block of Q_out. It is 00407 // guaranteed to have at least as many rows as columns. 00408 matview_type Q_out_top = 00409 nodeTsqr_->top_block (Q_out_view, contiguousCacheBlocks); 00410 00411 // Fill (topmost cache block of) Q_out with the first 00412 // ncols_Q_out columns of the identity matrix. 00413 for (ordinal_type j = 0; j < ncols_Q_out; ++j) 00414 Q_out_top(j, j) = Scalar(1); 00415 } 00416 apply ("N", nrows_local, 00417 ncols_Q_in, Q_local_in, ldq_local_in, factorOutput, 00418 ncols_Q_out, Q_local_out, ldq_local_out, 00419 contiguousCacheBlocks); 00420 } 00421 00426 void 00427 Q_times_B (const LocalOrdinal nrows, 00428 const LocalOrdinal ncols, 00429 Scalar Q[], 00430 const LocalOrdinal ldq, 00431 const Scalar B[], 00432 const LocalOrdinal ldb, 00433 const bool contiguousCacheBlocks = false) const 00434 { 00435 // This requires no internode communication. However, the work 00436 // is not redundant, since each MPI process has a different Q. 00437 nodeTsqr_->Q_times_B (nrows, ncols, Q, ldq, B, ldb, 00438 contiguousCacheBlocks); 00439 // We don't need a barrier after this method, unless users are 00440 // doing mean MPI_Get() things. 00441 } 00442 00450 LocalOrdinal 00451 reveal_R_rank (const LocalOrdinal ncols, 00452 Scalar R[], 00453 const LocalOrdinal ldr, 00454 Scalar U[], 00455 const LocalOrdinal ldu, 00456 const magnitude_type tol) const 00457 { 00458 // Forward the request to the intranode TSQR implementation. 00459 // This work is performed redundantly on all MPI processes. 00460 // 00461 // FIXME (mfh 26 Aug 2010) Could be a problem if your cluster is 00462 // heterogeneous, because then you might obtain different 00463 // answers (due to rounding error) on different processors. 00464 return nodeTsqr_->reveal_R_rank (ncols, R, ldr, U, ldu, tol); 00465 } 00466 00485 LocalOrdinal 00486 reveal_rank (const LocalOrdinal nrows, 00487 const LocalOrdinal ncols, 00488 Scalar Q[], 00489 const LocalOrdinal ldq, 00490 Scalar R[], 00491 const LocalOrdinal ldr, 00492 const magnitude_type tol, 00493 const bool contiguousCacheBlocks = false) const 00494 { 00495 // Take the easy exit if available. 00496 if (ncols == 0) 00497 return 0; 00498 00499 // 00500 // FIXME (mfh 16 Jul 2010) We _should_ compute the SVD of R (as 00501 // the copy B) on Proc 0 only. This would ensure that all 00502 // processors get the same SVD and rank (esp. in a heterogeneous 00503 // computing environment). For now, we just do this computation 00504 // redundantly, and hope that all the returned rank values are 00505 // the same. 00506 // 00507 matrix_type U (ncols, ncols, Scalar(0)); 00508 const ordinal_type rank = 00509 reveal_R_rank (ncols, R, ldr, U.get(), U.lda(), tol); 00510 00511 if (rank < ncols) 00512 { 00513 // cerr << ">>> Rank of R: " << rank << " < ncols=" << ncols << endl; 00514 // cerr << ">>> Resulting U:" << endl; 00515 // print_local_matrix (cerr, ncols, ncols, R, ldr); 00516 // cerr << endl; 00517 00518 // If R is not full rank: reveal_R_rank() already computed 00519 // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and 00520 // overwrote R with \f$\Sigma V^*\f$. Now, we compute \f$Q 00521 // := Q \cdot U\f$, respecting cache blocks of Q. 00522 Q_times_B (nrows, ncols, Q, ldq, U.get(), U.lda(), 00523 contiguousCacheBlocks); 00524 } 00525 return rank; 00526 } 00527 00532 void 00533 cache_block (const LocalOrdinal nrows_local, 00534 const LocalOrdinal ncols, 00535 Scalar A_local_out[], 00536 const Scalar A_local_in[], 00537 const LocalOrdinal lda_local_in) const 00538 { 00539 nodeTsqr_->cache_block (nrows_local, ncols, 00540 A_local_out, 00541 A_local_in, lda_local_in); 00542 } 00543 00548 void 00549 un_cache_block (const LocalOrdinal nrows_local, 00550 const LocalOrdinal ncols, 00551 Scalar A_local_out[], 00552 const LocalOrdinal lda_local_out, 00553 const Scalar A_local_in[]) const 00554 { 00555 nodeTsqr_->un_cache_block (nrows_local, ncols, 00556 A_local_out, lda_local_out, 00557 A_local_in); 00558 } 00559 00560 private: 00561 node_tsqr_ptr nodeTsqr_; 00562 dist_tsqr_ptr distTsqr_; 00563 }; // class Tsqr 00564 00565 } // namespace TSQR 00566 00567 #endif // __TSQR_Tsqr_hpp
1.7.4