|
Anasazi Version of the Day
|
00001 // @HEADER 00002 // *********************************************************************** 00003 // 00004 // Anasazi: Block Eigensolvers Package 00005 // Copyright (2010) Sandia Corporation 00006 // 00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive 00008 // license for use of this work by or on behalf of the U.S. Government. 00009 // 00010 // This library is free software; you can redistribute it and/or modify 00011 // it under the terms of the GNU Lesser General Public License as 00012 // published by the Free Software Foundation; either version 2.1 of the 00013 // License, or (at your option) any later version. 00014 // 00015 // This library is distributed in the hope that it will be useful, but 00016 // WITHOUT ANY WARRANTY; without even the implied warranty of 00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00018 // Lesser General Public License for more details. 00019 // 00020 // You should have received a copy of the GNU Lesser General Public 00021 // License along with this library; if not, write to the Free Software 00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 00023 // USA 00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 00025 // 00026 // *********************************************************************** 00027 // @HEADER 00028 00029 #ifndef __TSQR_Tsqr_SequentialTsqr_hpp 00030 #define __TSQR_Tsqr_SequentialTsqr_hpp 00031 00032 #include <Tsqr_ApplyType.hpp> 00033 #include <Tsqr_Matrix.hpp> 00034 #include <Tsqr_CacheBlockingStrategy.hpp> 00035 #include <Tsqr_CacheBlocker.hpp> 00036 #include <Tsqr_LocalVerify.hpp> 00037 #include <Tsqr_ScalarTraits.hpp> 00038 #include <Tsqr_Combine.hpp> 00039 #include <Tsqr_Util.hpp> 00040 00041 #include <algorithm> 00042 #include <limits> 00043 #include <sstream> 00044 #include <string> 00045 #include <utility> // std::pair 00046 #include <vector> 00047 00048 // #define TSQR_SEQ_TSQR_EXTRA_DEBUG 1 00049 00050 #ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG 00051 # include <iostream> 00052 using std::cerr; 00053 using std::endl; 00054 00055 template< class MatrixView > 00056 void view_print (const MatrixView& view) { 00057 cerr << view.nrows() << ", " << view.ncols() << ", " << view.lda(); 00058 } 00059 #endif // TSQR_SEQ_TSQR_EXTRA_DEBUG 00060 00063 00064 namespace TSQR { 00065 00066 template< class LocalOrdinal, class Scalar > 00067 class SequentialTsqr { 00068 private: 00069 typedef typename std::vector< std::vector< Scalar > >::const_iterator FactorOutputIter; 00070 typedef typename std::vector< std::vector< Scalar > >::const_reverse_iterator FactorOutputReverseIter; 00071 typedef MatView< LocalOrdinal, Scalar > mat_view; 00072 typedef ConstMatView< LocalOrdinal, Scalar > const_mat_view; 00073 typedef std::pair< mat_view, mat_view > block_pair_type; 00074 typedef std::pair< const_mat_view, const_mat_view > const_block_pair_type; 00075 00084 mat_view 00085 factor_first_block (Combine< LocalOrdinal, Scalar >& combine, 00086 mat_view& A_top, 00087 std::vector< Scalar >& tau, 00088 std::vector< Scalar >& work) 00089 { 00090 const LocalOrdinal ncols = A_top.ncols(); 00091 combine.factor_first (A_top.nrows(), ncols, A_top.get(), A_top.lda(), 00092 &tau[0], &work[0]); 00093 return mat_view(ncols, ncols, A_top.get(), A_top.lda()); 00094 } 00095 00100 void 00101 apply_first_block (Combine< LocalOrdinal, Scalar >& combine, 00102 const ApplyType& applyType, 00103 const const_mat_view& Q_first, 00104 const std::vector< Scalar >& tau, 00105 mat_view& C_first, 00106 std::vector< Scalar >& work) 00107 { 00108 const LocalOrdinal nrowsLocal = Q_first.nrows(); 00109 combine.apply_first (applyType, nrowsLocal, C_first.ncols(), 00110 Q_first.ncols(), Q_first.get(), Q_first.lda(), 00111 &tau[0], C_first.get(), C_first.lda(), &work[0]); 00112 } 00113 00114 void 00115 combine_apply (Combine< LocalOrdinal, Scalar >& combine, 00116 const ApplyType& apply_type, 00117 const const_mat_view& Q_cur, 00118 const std::vector< Scalar >& tau, 00119 mat_view& C_top, 00120 mat_view& C_cur, 00121 std::vector< Scalar >& work) 00122 { 00123 const LocalOrdinal nrows_local = Q_cur.nrows(); 00124 const LocalOrdinal ncols_Q = Q_cur.ncols(); 00125 const LocalOrdinal ncols_C = C_cur.ncols(); 00126 00127 combine.apply_inner (apply_type, 00128 nrows_local, ncols_C, ncols_Q, 00129 Q_cur.get(), C_cur.lda(), &tau[0], 00130 C_top.get(), C_top.lda(), 00131 C_cur.get(), C_cur.lda(), &work[0]); 00132 } 00133 00134 void 00135 combine_factor (Combine< LocalOrdinal, Scalar >& combine, 00136 mat_view& R, 00137 mat_view& A_cur, 00138 std::vector< Scalar >& tau, 00139 std::vector< Scalar >& work) 00140 { 00141 const LocalOrdinal nrows_local = A_cur.nrows(); 00142 const LocalOrdinal ncols = A_cur.ncols(); 00143 00144 combine.factor_inner (nrows_local, ncols, R.get(), R.lda(), 00145 A_cur.get(), A_cur.lda(), &tau[0], 00146 &work[0]); 00147 } 00148 00149 public: 00150 typedef Scalar scalar_type; 00151 typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type; 00152 typedef LocalOrdinal ordinal_type; 00153 typedef std::vector< std::vector< Scalar > > FactorOutput; 00154 00160 SequentialTsqr (const size_t cacheBlockSize = 0) : 00161 strategy_ (cacheBlockSize) 00162 {} 00163 00166 bool QR_produces_R_factor_with_nonnegative_diagonal () const { 00167 typedef Combine< LocalOrdinal, Scalar > combine_type; 00168 return combine_type::QR_produces_R_factor_with_nonnegative_diagonal(); 00169 } 00170 00172 size_t 00173 cache_block_size () const { return strategy_.cache_block_size(); } 00174 00184 FactorOutput 00185 factor (const LocalOrdinal nrows, 00186 const LocalOrdinal ncols, 00187 Scalar A[], 00188 const LocalOrdinal lda, 00189 const bool contiguous_cache_blocks = false) 00190 { 00191 CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_); 00192 Combine< LocalOrdinal, Scalar > combine; 00193 std::vector< Scalar > work (ncols); 00194 FactorOutput tau_arrays; 00195 00196 // We say "A_rest" because it points to the remaining part of 00197 // the matrix left to factor; at the beginning, the "remaining" 00198 // part is the whole matrix, but that will change as the 00199 // algorithm progresses. 00200 // 00201 // Note: if the cache blocks are stored contiguously, lda won't 00202 // be the correct leading dimension of A, but it won't matter: 00203 // we only ever operate on A_cur here, and A_cur's leading 00204 // dimension is set correctly by A_rest.split_top(). 00205 mat_view A_rest (nrows, ncols, A, lda); 00206 // This call modifies A_rest. 00207 mat_view A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); 00208 00209 // Factor the topmost block of A. 00210 std::vector< Scalar > tau_first (ncols); 00211 mat_view R_view = factor_first_block (combine, A_cur, tau_first, work); 00212 tau_arrays.push_back (tau_first); 00213 00214 while (! A_rest.empty()) 00215 { 00216 A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); 00217 std::vector< Scalar > tau (ncols); 00218 combine_factor (combine, R_view, A_cur, tau, work); 00219 tau_arrays.push_back (tau); 00220 } 00221 return tau_arrays; 00222 } 00223 00225 void 00226 extract_R (const LocalOrdinal nrows, 00227 const LocalOrdinal ncols, 00228 const Scalar A[], 00229 const LocalOrdinal lda, 00230 Scalar R[], 00231 const LocalOrdinal ldr, 00232 const bool contiguous_cache_blocks = false) 00233 { 00234 const_mat_view A_view (nrows, ncols, A, lda); 00235 00236 // Identify top cache block of A 00237 const_mat_view A_top = top_block (A_view, contiguous_cache_blocks); 00238 00239 // Fill R (including lower triangle) with zeros. 00240 fill_matrix (ncols, ncols, R, ldr, Scalar(0)); 00241 00242 // Copy out the upper triangle of the R factor from A into R. 00243 copy_upper_triangle (ncols, ncols, R, ldr, A_top.get(), A_top.lda()); 00244 } 00245 00255 FactorOutput 00256 factor (const LocalOrdinal nrows, 00257 const LocalOrdinal ncols, 00258 Scalar A[], 00259 const LocalOrdinal lda, 00260 Scalar R[], 00261 const LocalOrdinal ldr, 00262 const bool contiguous_cache_blocks = false) 00263 { 00264 CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_); 00265 Combine< LocalOrdinal, Scalar > combine; 00266 std::vector< Scalar > work (ncols); 00267 FactorOutput tau_arrays; 00268 00269 // We say "A_rest" because it points to the remaining part of 00270 // the matrix left to factor; at the beginning, the "remaining" 00271 // part is the whole matrix, but that will change as the 00272 // algorithm progresses. 00273 // 00274 // Note: if the cache blocks are stored contiguously, lda won't 00275 // be the correct leading dimension of A, but it won't matter: 00276 // we only ever operate on A_cur here, and A_cur's leading 00277 // dimension is set correctly by A_rest.split_top(). 00278 mat_view A_rest (nrows, ncols, A, lda); 00279 // This call modifies A_rest. 00280 mat_view A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); 00281 00282 // Factor the topmost block of A. 00283 std::vector< Scalar > tau_first (ncols); 00284 mat_view R_view = factor_first_block (combine, A_cur, tau_first, work); 00285 tau_arrays.push_back (tau_first); 00286 00287 while (! A_rest.empty()) 00288 { 00289 A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); 00290 std::vector< Scalar > tau (ncols); 00291 combine_factor (combine, R_view, A_cur, tau, work); 00292 tau_arrays.push_back (tau); 00293 } 00294 00295 // Copy the R factor resulting from the factorization out of 00296 // R_view (a view of the topmost cache block of A) into the R 00297 // output argument. 00298 fill_matrix (ncols, ncols, R, ldr, Scalar(0)); 00299 copy_upper_triangle (ncols, ncols, R, ldr, R_view.get(), R_view.lda()); 00300 return tau_arrays; 00301 } 00302 00303 00306 LocalOrdinal 00307 factor_num_cache_blocks (const LocalOrdinal nrows, 00308 const LocalOrdinal ncols, 00309 Scalar A[], 00310 const LocalOrdinal lda, 00311 const bool contiguous_cache_blocks = false) 00312 { 00313 CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_); 00314 LocalOrdinal count = 0; 00315 00316 mat_view A_rest (nrows, ncols, A, lda); 00317 if (A_rest.empty()) 00318 return count; 00319 00320 mat_view A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); 00321 count++; // first factor step 00322 00323 while (! A_rest.empty()) 00324 { 00325 A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); 00326 count++; // next factor step 00327 } 00328 return count; 00329 } 00330 00331 00332 void 00333 apply (const ApplyType& apply_type, 00334 const LocalOrdinal nrows, 00335 const LocalOrdinal ncols_Q, 00336 const Scalar Q[], 00337 const LocalOrdinal ldq, 00338 const FactorOutput& factor_output, 00339 const LocalOrdinal ncols_C, 00340 Scalar C[], 00341 const LocalOrdinal ldc, 00342 const bool contiguous_cache_blocks = false) 00343 { 00344 // Quick exit and error tests 00345 if (ncols_Q == 0 || ncols_C == 0 || nrows == 0) 00346 return; 00347 else if (ldc < nrows) 00348 { 00349 std::ostringstream os; 00350 os << "SequentialTsqr::apply: ldc (= " << ldc << ") < nrows (= " << nrows << ")"; 00351 throw std::invalid_argument (os.str()); 00352 } 00353 else if (ldq < nrows) 00354 { 00355 std::ostringstream os; 00356 os << "SequentialTsqr::apply: ldq (= " << ldq << ") < nrows (= " << nrows << ")"; 00357 throw std::invalid_argument (os.str()); 00358 } 00359 00360 // If contiguous cache blocks are used, then we have to use the 00361 // same convention as we did for factor(). Otherwise, we are 00362 // free to choose the cache block dimensions as we wish in 00363 // apply(), independently of what we did in factor(). 00364 CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols_Q, strategy_); 00365 LAPACK< LocalOrdinal, Scalar > lapack; 00366 Combine< LocalOrdinal, Scalar > combine; 00367 00368 const bool transposed = apply_type.transposed(); 00369 const FactorOutput& tau_arrays = factor_output; // rename for encapsulation 00370 std::vector< Scalar > work (ncols_C); 00371 00372 // We say "*_rest" because it points to the remaining part of 00373 // the matrix left to factor; at the beginning, the "remaining" 00374 // part is the whole matrix, but that will change as the 00375 // algorithm progresses. 00376 // 00377 // Note: if the cache blocks are stored contiguously, ldq 00378 // resp. ldc won't be the correct leading dimension, but it 00379 // won't matter, since we only read the leading dimension of 00380 // return values of split_top_block() / split_bottom_block(), 00381 // which are set correctly (based e.g., on whether or not we are 00382 // using contiguous cache blocks). 00383 const_mat_view Q_rest (nrows, ncols_Q, Q, ldq); 00384 mat_view C_rest (nrows, ncols_C, C, ldc); 00385 00386 #ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG 00387 if (Q_rest.empty()) 00388 throw std::logic_error ("Q_rest initially empty"); 00389 else if (C_rest.empty()) 00390 throw std::logic_error ("C_rest initially empty"); 00391 #endif // TSQR_SEQ_TSQR_EXTRA_DEBUG 00392 00393 // Identify the top ncols_C by ncols_C block of C. C_rest is 00394 // not modified. 00395 mat_view C_top = blocker.top_block (C_rest, contiguous_cache_blocks); 00396 00397 #ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG 00398 if (C_top.empty()) 00399 throw std::logic_error ("C_top initially empty"); 00400 #endif // TSQR_SEQ_TSQR_EXTRA_DEBUG 00401 00402 if (transposed) 00403 { 00404 const_mat_view Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks); 00405 mat_view C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks); 00406 00407 // Apply the topmost block of Q. 00408 FactorOutputIter tau_iter = tau_arrays.begin(); 00409 const std::vector< Scalar >& tau = *tau_iter++; 00410 apply_first_block (combine, apply_type, Q_cur, tau, C_cur, work); 00411 00412 while (! Q_rest.empty()) 00413 { 00414 Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks); 00415 C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks); 00416 #ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG 00417 if (tau_iter == tau_arrays.end()) 00418 throw std::logic_error("Not enough tau arrays!"); 00419 #endif // TSQR_SEQ_TSQR_EXTRA_DEBUG 00420 combine_apply (combine, apply_type, Q_cur, *tau_iter++, C_top, C_cur, work); 00421 } 00422 #ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG 00423 if (tau_iter != tau_arrays.end()) 00424 throw std::logic_error ("Too many tau arrays!"); 00425 #endif // TSQR_SEQ_TSQR_EXTRA_DEBUG 00426 } 00427 else 00428 { 00429 // Start with the last local Q factor and work backwards up the matrix. 00430 FactorOutputReverseIter tau_iter = tau_arrays.rbegin(); 00431 00432 const_mat_view Q_cur = blocker.split_bottom_block (Q_rest, contiguous_cache_blocks); 00433 mat_view C_cur = blocker.split_bottom_block (C_rest, contiguous_cache_blocks); 00434 00435 while (! Q_rest.empty()) 00436 { 00437 #ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG 00438 if (Q_cur.empty()) 00439 throw std::logic_error ("Q_cur empty at last stage of applying Q"); 00440 else if (C_cur.empty()) 00441 throw std::logic_error ("C_cur empty at last stage of applying Q"); 00442 else if (tau_iter == tau_arrays.rend()) 00443 throw std::logic_error ("Not enough tau arrays!"); 00444 #endif // TSQR_SEQ_TSQR_EXTRA_DEBUG 00445 combine_apply (combine, apply_type, Q_cur, *tau_iter++, C_top, C_cur, work); 00446 Q_cur = blocker.split_bottom_block (Q_rest, contiguous_cache_blocks); 00447 C_cur = blocker.split_bottom_block (C_rest, contiguous_cache_blocks); 00448 } 00449 // 00450 // Apply to last (topmost) cache block. 00451 // 00452 #ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG 00453 if (Q_cur.empty()) 00454 throw std::logic_error ("Q_cur empty at last stage of applying Q"); 00455 else if (C_cur.empty()) 00456 throw std::logic_error ("C_cur empty at last stage of applying Q"); 00457 else if (tau_iter == tau_arrays.rend()) 00458 throw std::logic_error ("Not enough tau arrays!"); 00459 #endif // TSQR_SEQ_TSQR_EXTRA_DEBUG 00460 00461 apply_first_block (combine, apply_type, Q_cur, *tau_iter++, C_cur, work); 00462 00463 #ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG 00464 if (tau_iter != tau_arrays.rend()) 00465 throw std::logic_error ("Too many tau arrays!"); 00466 #endif // TSQR_SEQ_TSQR_EXTRA_DEBUG 00467 } 00468 } 00469 00470 void 00471 explicit_Q (const LocalOrdinal nrows, 00472 const LocalOrdinal ncols_Q, 00473 const Scalar Q[], 00474 const LocalOrdinal ldq, 00475 const FactorOutput& factor_output, 00476 const LocalOrdinal ncols_C, 00477 Scalar C[], 00478 const LocalOrdinal ldc, 00479 const bool contiguous_cache_blocks = false) 00480 { 00481 // Identify top ncols_C by ncols_C block of C. C_view is not 00482 // modified. top_block() will set C_top to have the correct 00483 // leading dimension, whether or not cache blocks are stored 00484 // contiguously. 00485 mat_view C_view (nrows, ncols_C, C, ldc); 00486 mat_view C_top = top_block (C_view, contiguous_cache_blocks); 00487 00488 // Fill C with zeros, and then fill the topmost block of C with 00489 // the first ncols_C columns of the identity matrix, so that C 00490 // itself contains the first ncols_C columns of the identity 00491 // matrix. 00492 fill_with_zeros (nrows, ncols_C, C, ldc, contiguous_cache_blocks); 00493 for (LocalOrdinal j = 0; j < ncols_C; ++j) 00494 C_top(j, j) = Scalar(1); 00495 00496 // Apply the Q factor to C, to extract the first ncols_C columns 00497 // of Q in explicit form. 00498 apply (ApplyType::NoTranspose, 00499 nrows, ncols_Q, Q, ldq, factor_output, 00500 ncols_C, C, ldc, contiguous_cache_blocks); 00501 } 00502 00503 00508 void 00509 Q_times_B (const LocalOrdinal nrows, 00510 const LocalOrdinal ncols, 00511 Scalar Q[], 00512 const LocalOrdinal ldq, 00513 const Scalar B[], 00514 const LocalOrdinal ldb, 00515 const bool contiguous_cache_blocks = false) const 00516 { 00517 // We don't do any other error checking here (e.g., matrix 00518 // dimensions), though it would be a good idea to do so. 00519 00520 // Take the easy exit if available. 00521 if (ncols == 0 || nrows == 0) 00522 return; 00523 00524 // Compute Q := Q*B by iterating through cache blocks of Q. 00525 // This iteration works much like iteration through cache blocks 00526 // of A in factor() (which see). Here, though, each cache block 00527 // computation is completely independent of the others; a slight 00528 // restructuring of this code would parallelize nicely using 00529 // OpenMP. 00530 CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_); 00531 BLAS< LocalOrdinal, Scalar > blas; 00532 mat_view Q_rest (nrows, ncols, Q, ldq); 00533 Matrix< LocalOrdinal, Scalar > 00534 Q_cur_copy (LocalOrdinal(0), LocalOrdinal(0)); // will be resized 00535 while (! Q_rest.empty()) 00536 { 00537 mat_view Q_cur = 00538 blocker.split_top_block (Q_rest, contiguous_cache_blocks); 00539 00540 // GEMM doesn't like aliased arguments, so we use a copy. 00541 // We only copy the current cache block, rather than all of 00542 // Q; this saves memory. 00543 Q_cur_copy.reshape (Q_cur.nrows(), ncols); 00544 Q_cur_copy.copy (Q_cur); 00545 // Q_cur := Q_cur_copy * B. 00546 blas.GEMM ("N", "N", Q_cur.nrows(), ncols, ncols, Scalar(1), 00547 Q_cur_copy.get(), Q_cur_copy.lda(), B, ldb, 00548 Scalar(0), Q_cur.get(), Q_cur.lda()); 00549 } 00550 } 00551 00559 LocalOrdinal 00560 reveal_R_rank (const LocalOrdinal ncols, 00561 Scalar R[], 00562 const LocalOrdinal ldr, 00563 Scalar U[], 00564 const LocalOrdinal ldu, 00565 const magnitude_type tol) const 00566 { 00567 if (tol < 0) 00568 { 00569 std::ostringstream os; 00570 os << "reveal_R_rank: negative tolerance tol = " 00571 << tol << " is not allowed."; 00572 throw std::logic_error (os.str()); 00573 } 00574 // We don't do any other error checking here (e.g., matrix 00575 // dimensions), though it would be a good idea to do so. 00576 00577 // Take the easy exit if available. 00578 if (ncols == 0) 00579 return 0; 00580 00581 LAPACK< LocalOrdinal, Scalar > lapack; 00582 MatView< LocalOrdinal, Scalar > R_view (ncols, ncols, R, ldr); 00583 Matrix< LocalOrdinal, Scalar > B (R_view); // B := R (deep copy) 00584 MatView< LocalOrdinal, Scalar > U_view (ncols, ncols, U, ldu); 00585 Matrix< LocalOrdinal, Scalar > VT (ncols, ncols, Scalar(0)); 00586 00587 std::vector< magnitude_type > svd_rwork (5*ncols); 00588 std::vector< magnitude_type > singular_values (ncols); 00589 LocalOrdinal svd_lwork = -1; // -1 for LWORK query; will be changed 00590 int svd_info = 0; 00591 00592 // LAPACK LWORK query for singular value decomposition. WORK 00593 // array is always of ScalarType, even in the complex case. 00594 { 00595 Scalar svd_lwork_scalar = Scalar(0); 00596 lapack.GESVD ("A", "A", ncols, ncols, 00597 B.get(), B.lda(), &singular_values[0], 00598 U_view.get(), U_view.lda(), VT.get(), VT.lda(), 00599 &svd_lwork_scalar, svd_lwork, &svd_rwork[0], &svd_info); 00600 if (svd_info != 0) 00601 { 00602 std::ostringstream os; 00603 os << "reveal_R_rank: GESVD LWORK query returned nonzero INFO = " 00604 << svd_info; 00605 throw std::logic_error (os.str()); 00606 } 00607 svd_lwork = static_cast< LocalOrdinal > (svd_lwork_scalar); 00608 // Check the LWORK cast. LAPACK shouldn't ever return LWORK 00609 // that won't fit in an OrdinalType, but it's not bad to make 00610 // sure. 00611 if (static_cast< Scalar > (svd_lwork) != svd_lwork_scalar) 00612 { 00613 std::ostringstream os; 00614 os << "In SequentialTsqr::reveal_rank: GESVD LWORK query " 00615 "returned LWORK that doesn\'t fit in LocalOrdinal: returned " 00616 "LWORK (as Scalar) is " << svd_lwork_scalar << ", but cast to " 00617 "LocalOrdinal it becomes " << svd_lwork << "."; 00618 throw std::logic_error (os.str()); 00619 } 00620 // Make sure svd_lwork >= 0. 00621 if (std::numeric_limits< LocalOrdinal >::is_signed && svd_lwork < 0) 00622 { 00623 std::ostringstream os; 00624 os << "In SequentialTsqr::reveal_rank: GESVD LWORK query " 00625 "returned negative LWORK = " << svd_lwork; 00626 throw std::logic_error (os.str()); 00627 } 00628 } 00629 // Allocate workspace for SVD. 00630 std::vector< Scalar > svd_work (svd_lwork); 00631 00632 // Compute SVD $B := U \Sigma V^*$. B is overwritten, which is 00633 // why we copied R into B (so that we don't overwrite R if R is 00634 // full rank). 00635 lapack.GESVD ("A", "A", ncols, ncols, 00636 B.get(), B.lda(), &singular_values[0], 00637 U_view.get(), U_view.lda(), VT.get(), VT.lda(), 00638 &svd_work[0], svd_lwork, &svd_rwork[0], &svd_info); 00639 00640 #ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG 00641 { 00642 cerr << "-- GESVD computed singular values:" << endl; 00643 for (int k = 0; k < ncols; ++k) 00644 cerr << singular_values[k] << " "; 00645 cerr << endl; 00646 } 00647 #endif // TSQR_SEQ_TSQR_EXTRA_DEBUG 00648 00649 // GESVD computes singular values in decreasing order and they 00650 // are all nonnegative. We know by now that ncols > 0. "tol" 00651 // is a relative tolerance: relative to the largest singular 00652 // value, which is the 2-norm of the matrix. 00653 const magnitude_type absolute_tolerance = tol * singular_values[0]; 00654 00655 // Determine rank of B, using singular values. 00656 LocalOrdinal rank = ncols; 00657 for (LocalOrdinal k = 1; k < ncols; ++k) 00658 // "<=" in case singular_values[0] == 0. 00659 if (singular_values[k] <= absolute_tolerance) 00660 { 00661 rank = k; 00662 break; 00663 } 00664 00665 if (rank == ncols) 00666 return rank; // Don't modify Q or R, if R is full rank. 00667 00668 #ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG 00669 { 00670 cerr << "Rank of B (i.e., R): " << rank << " < ncols=" << ncols << endl; 00671 cerr << "Original R = " << endl; 00672 print_local_matrix (cerr, ncols, ncols, R, ldr); 00673 } 00674 #endif // TSQR_SEQ_TSQR_EXTRA_DEBUG 00675 00676 // 00677 // R is not full rank. 00678 // 00679 // 1. Compute \f$R := \Sigma V^*\f$. 00680 // 2. Return rank (0 <= rank < ncols). 00681 // 00682 00683 // Compute R := \Sigma VT. \Sigma is diagonal so we apply it 00684 // column by column (normally one would think of this as row by 00685 // row, but this "Hadamard product" formulation iterates more 00686 // efficiently over VT). 00687 // 00688 // After this computation, R may no longer be upper triangular. 00689 // R may be zero if all the singular values are zero, but we 00690 // don't need to check for this case; it's rare in practice, and 00691 // the computations below will be correct regardless. 00692 for (LocalOrdinal j = 0; j < ncols; ++j) 00693 { 00694 const Scalar* const VT_j = &VT(0,j); 00695 Scalar* const R_j = &R_view(0,j); 00696 00697 for (LocalOrdinal i = 0; i < ncols; ++i) 00698 R_j[i] = singular_values[i] * VT_j[i]; 00699 } 00700 00701 #ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG 00702 { 00703 cerr << "Resulting R = " << endl; 00704 print_local_matrix (cerr, ncols, ncols, R, ldr); 00705 } 00706 #endif // TSQR_SEQ_TSQR_EXTRA_DEBUG 00707 00708 return rank; 00709 } 00710 00722 LocalOrdinal 00723 reveal_rank (const LocalOrdinal nrows, 00724 const LocalOrdinal ncols, 00725 Scalar Q[], 00726 const LocalOrdinal ldq, 00727 Scalar R[], 00728 const LocalOrdinal ldr, 00729 const magnitude_type tol, 00730 const bool contiguous_cache_blocks = false) const 00731 { 00732 // Take the easy exit if available. 00733 if (ncols == 0) 00734 return 0; 00735 Matrix< LocalOrdinal, Scalar > U (ncols, ncols, Scalar(0)); 00736 const LocalOrdinal rank = 00737 reveal_R_rank (ncols, R, ldr, U.get(), U.ldu(), tol); 00738 00739 if (rank < ncols) 00740 { 00741 // If R is not full rank: reveal_R_rank() already computed 00742 // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and 00743 // overwrote R with \f$\Sigma V^*\f$. Now, we compute \f$Q 00744 // := Q \cdot U\f$, respecting cache blocks of Q. 00745 Q_times_B (nrows, ncols, Q, ldq, U.get(), U.lda(), 00746 contiguous_cache_blocks); 00747 } 00748 return rank; 00749 } 00750 00752 void 00753 cache_block (const LocalOrdinal nrows, 00754 const LocalOrdinal ncols, 00755 Scalar A_out[], 00756 const Scalar A_in[], 00757 const LocalOrdinal lda_in) const 00758 { 00759 CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_); 00760 blocker.cache_block (nrows, ncols, A_out, A_in, lda_in); 00761 } 00762 00764 void 00765 un_cache_block (const LocalOrdinal nrows, 00766 const LocalOrdinal ncols, 00767 Scalar A_out[], 00768 const LocalOrdinal lda_out, 00769 const Scalar A_in[]) const 00770 { 00771 CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_); 00772 blocker.un_cache_block (nrows, ncols, A_out, lda_out, A_in); 00773 } 00774 00787 void 00788 fill_with_zeros (const LocalOrdinal nrows, 00789 const LocalOrdinal ncols, 00790 Scalar A[], 00791 const LocalOrdinal lda, 00792 const bool contiguous_cache_blocks = false) const 00793 { 00794 CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_); 00795 blocker.fill_with_zeros (nrows, ncols, A, lda, contiguous_cache_blocks); 00796 } 00797 00814 template< class MatrixViewType > 00815 MatrixViewType 00816 top_block (const MatrixViewType& C, 00817 const bool contiguous_cache_blocks = false) const 00818 { 00819 // The CacheBlocker object knows how to construct a view of the 00820 // top cache block of C. This is complicated because cache 00821 // blocks (in C) may or may not be stored contiguously. If they 00822 // are stored contiguously, the CacheBlocker knows the right 00823 // layout, based on the cache blocking strategy. 00824 CacheBlocker< LocalOrdinal, Scalar > blocker (C.nrows(), C.ncols(), strategy_); 00825 00826 // C_top_block is a view of the topmost cache block of C. 00827 // C_top_block should have >= ncols rows, otherwise either cache 00828 // blocking is broken or the input matrix C itself had fewer 00829 // rows than columns. 00830 MatrixViewType C_top_block = blocker.top_block (C, contiguous_cache_blocks); 00831 if (C_top_block.nrows() < C_top_block.ncols()) 00832 throw std::logic_error ("C\'s topmost cache block has fewer rows than " 00833 "columns"); 00834 return C_top_block; 00835 } 00836 00837 private: 00838 CacheBlockingStrategy< LocalOrdinal, Scalar > strategy_; 00839 }; 00840 00841 } // namespace TSQR 00842 00843 #endif // __TSQR_Tsqr_SequentialTsqr_hpp
1.7.4