|
Anasazi Version of the Day
|
00001 // @HEADER 00002 // *********************************************************************** 00003 // 00004 // Anasazi: Block Eigensolvers Package 00005 // Copyright (2010) Sandia Corporation 00006 // 00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive 00008 // license for use of this work by or on behalf of the U.S. Government. 00009 // 00010 // This library is free software; you can redistribute it and/or modify 00011 // it under the terms of the GNU Lesser General Public License as 00012 // published by the Free Software Foundation; either version 2.1 of the 00013 // License, or (at your option) any later version. 00014 // 00015 // This library is distributed in the hope that it will be useful, but 00016 // WITHOUT ANY WARRANTY; without even the implied warranty of 00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00018 // Lesser General Public License for more details. 00019 // 00020 // You should have received a copy of the GNU Lesser General Public 00021 // License along with this library; if not, write to the Free Software 00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 00023 // USA 00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 00025 // 00026 // *********************************************************************** 00027 // @HEADER 00028 00029 #ifndef __TSQR_TBB_TbbParallelTsqr_hpp 00030 #define __TSQR_TBB_TbbParallelTsqr_hpp 00031 00032 #include <tbb/tbb.h> 00033 #include <TbbTsqr_FactorTask.hpp> 00034 #include <TbbTsqr_ApplyTask.hpp> 00035 #include <TbbTsqr_ExplicitQTask.hpp> 00036 #include <TbbTsqr_RevealRankTask.hpp> 00037 #include <TbbTsqr_CacheBlockTask.hpp> 00038 #include <TbbTsqr_UnCacheBlockTask.hpp> 00039 #include <TbbTsqr_FillWithZerosTask.hpp> 00040 00041 #include <Tsqr_ApplyType.hpp> 00042 #include <Tsqr_ScalarTraits.hpp> 00043 00044 #include <algorithm> 00045 #include <limits> 00046 00049 00050 namespace TSQR { 00051 namespace TBB { 00052 00053 template< class LocalOrdinal, class Scalar, class TimerType > 00054 class TbbParallelTsqr { 00055 private: 00056 typedef MatView< LocalOrdinal, Scalar > mat_view; 00057 typedef ConstMatView< LocalOrdinal, Scalar > const_mat_view; 00058 typedef std::pair< mat_view, mat_view > split_t; 00059 typedef std::pair< const_mat_view, const_mat_view > const_split_t; 00060 typedef std::pair< const_mat_view, mat_view > top_blocks_t; 00061 typedef std::vector< top_blocks_t > array_top_blocks_t; 00062 00063 template< class MatrixViewType > 00064 MatrixViewType 00065 top_block_helper (const size_t P_first, 00066 const size_t P_last, 00067 const MatrixViewType& C, 00068 const bool contiguous_cache_blocks = false) const 00069 { 00070 if (P_first > P_last) 00071 throw std::logic_error ("P_first > P_last"); 00072 else if (P_first == P_last) 00073 return seq_.top_block (C, contiguous_cache_blocks); 00074 else 00075 { 00076 typedef std::pair< MatrixViewType, MatrixViewType > split_type; 00077 00078 // Divide [P_first, P_last] into two intervals: [P_first, 00079 // P_mid] and [P_mid+1, P_last]. Recurse on the first 00080 // interval [P_first, P_mid]. 00081 const size_t P_mid = (P_first + P_last) / 2; 00082 split_type C_split = partitioner_.split (C, P_first, P_mid, P_last, 00083 contiguous_cache_blocks); 00084 return top_block_helper (P_first, P_mid, C_split.first, 00085 contiguous_cache_blocks); 00086 } 00087 } 00088 00089 public: 00090 typedef Scalar scalar_type; 00091 typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type; 00092 typedef LocalOrdinal ordinal_type; 00093 00096 static bool QR_produces_R_factor_with_nonnegative_diagonal() { 00097 typedef Combine< LocalOrdinal, Scalar > combine_type; 00098 typedef LAPACK< LocalOrdinal, Scalar > lapack_type; 00099 00100 return combine_type::QR_produces_R_factor_with_nonnegative_diagonal() && 00101 lapack_type::QR_produces_R_factor_with_nonnegative_diagonal(); 00102 } 00103 00106 typedef typename SequentialTsqr< LocalOrdinal, Scalar >::FactorOutput SeqOutput; 00109 typedef std::vector< std::vector< Scalar > > ParOutput; 00113 typedef typename std::pair< std::vector< SeqOutput >, ParOutput > FactorOutput; 00114 00122 TbbParallelTsqr (const size_t numCores = 1, 00123 const size_t cacheBlockSize = 0) : 00124 seq_ (cacheBlockSize), 00125 min_seq_factor_timing_ (std::numeric_limits<double>::max()), 00126 max_seq_factor_timing_ (std::numeric_limits<double>::min()), 00127 min_seq_apply_timing_ (std::numeric_limits<double>::max()), 00128 max_seq_apply_timing_ (std::numeric_limits<double>::min()) 00129 { 00130 if (numCores < 1) 00131 ncores_ = 1; // default is no parallelism 00132 else 00133 ncores_ = numCores; 00134 } 00135 00139 size_t ncores() const { return ncores_; } 00140 00145 size_t cache_block_size() const { return seq_.cache_block_size(); } 00146 00147 00148 double 00149 min_seq_factor_timing () const { return min_seq_factor_timing_; } 00150 double 00151 max_seq_factor_timing () const { return max_seq_factor_timing_; } 00152 double 00153 min_seq_apply_timing () const { return min_seq_apply_timing_; } 00154 double 00155 max_seq_apply_timing () const { return max_seq_apply_timing_; } 00156 00157 FactorOutput 00158 factor (const LocalOrdinal nrows, 00159 const LocalOrdinal ncols, 00160 Scalar A[], 00161 const LocalOrdinal lda, 00162 Scalar R[], 00163 const LocalOrdinal ldr, 00164 const bool contiguous_cache_blocks = false) 00165 { 00166 using tbb::task; 00167 00168 mat_view A_view (nrows, ncols, A, lda); 00169 // A_top will be modified in place by exactly one task, to 00170 // indicate the partition from which we may extract the R 00171 // factor after finishing the factorization. 00172 mat_view A_top; 00173 00174 std::vector< SeqOutput > seq_output (ncores()); 00175 ParOutput par_output (ncores(), std::vector< Scalar >(ncols)); 00176 if (ncores() < 1) 00177 { 00178 if (! A_view.empty()) 00179 throw std::logic_error("Zero subproblems, but A not empty!"); 00180 else // Return empty results 00181 return std::make_pair (seq_output, par_output); 00182 } 00183 00184 double my_seq_timing = double(0); 00185 double min_seq_timing = double(0); 00186 double max_seq_timing = double(0); 00187 try { 00188 typedef FactorTask< LocalOrdinal, Scalar, TimerType > factor_task_t; 00189 00190 // When the root task completes, A_top will be set to the 00191 // topmost partition of A. We can then extract the R factor 00192 // from A_top. 00193 factor_task_t& root_task = *new( task::allocate_root() ) 00194 factor_task_t(0, ncores()-1, A_view, &A_top, seq_output, 00195 par_output, seq_, my_seq_timing, min_seq_timing, 00196 max_seq_timing, contiguous_cache_blocks); 00197 task::spawn_root_and_wait (root_task); 00198 } catch (tbb::captured_exception& ex) { 00199 // TBB can't guarantee on all systems that an exception 00200 // thrown in another thread will have its type correctly 00201 // propagated to this thread. If it can't, then it captures 00202 // the exception as a tbb:captured_exception, and propagates 00203 // it to here. It may be able to propagate the exception, 00204 // though, so be prepared for that. We deal with the latter 00205 // case by allowing the exception to propagate. 00206 std::ostringstream os; 00207 os << "Intel TBB caught an exception, while computing the QR factor" 00208 "ization of a matrix A. Unfortunately, its type information was " 00209 "lost, because the exception was thrown in another thread. Its " 00210 "\"what()\" function returns the following string: " << ex.what(); 00211 throw std::runtime_error (os.str()); 00212 } 00213 00214 // Copy the R factor out of A_top into R. 00215 seq_.extract_R (A_top.nrows(), A_top.ncols(), A_top.get(), 00216 A_top.lda(), R, ldr, contiguous_cache_blocks); 00217 00218 // Save the timings for future reference 00219 if (min_seq_timing < min_seq_factor_timing_) 00220 min_seq_factor_timing_ = min_seq_timing; 00221 if (max_seq_timing > max_seq_factor_timing_) 00222 max_seq_factor_timing_ = max_seq_timing; 00223 00224 return std::make_pair (seq_output, par_output); 00225 } 00226 00227 void 00228 apply (const ApplyType& apply_type, 00229 const LocalOrdinal nrows, 00230 const LocalOrdinal ncols_Q, 00231 const Scalar Q[], 00232 const LocalOrdinal ldq, 00233 const FactorOutput& factor_output, 00234 const LocalOrdinal ncols_C, 00235 Scalar C[], 00236 const LocalOrdinal ldc, 00237 const bool contiguous_cache_blocks = false) 00238 { 00239 using tbb::task; 00240 00241 if (apply_type.transposed()) 00242 throw std::logic_error ("Applying Q^T and Q^H not implemented"); 00243 00244 const_mat_view Q_view (nrows, ncols_Q, Q, ldq); 00245 mat_view C_view (nrows, ncols_C, C, ldc); 00246 if (! apply_type.transposed()) 00247 { 00248 array_top_blocks_t top_blocks (ncores()); 00249 build_partition_array (0, ncores()-1, top_blocks, Q_view, 00250 C_view, contiguous_cache_blocks); 00251 double my_seq_timing = 0.0; 00252 double min_seq_timing = 0.0; 00253 double max_seq_timing = 0.0; 00254 try { 00255 typedef ApplyTask< LocalOrdinal, Scalar, TimerType > apply_task_t; 00256 apply_task_t& root_task = 00257 *new( task::allocate_root() ) 00258 apply_task_t (0, ncores()-1, Q_view, C_view, top_blocks, 00259 factor_output, seq_, my_seq_timing, 00260 min_seq_timing, max_seq_timing, 00261 contiguous_cache_blocks); 00262 task::spawn_root_and_wait (root_task); 00263 } catch (tbb::captured_exception& ex) { 00264 std::ostringstream os; 00265 os << "Intel TBB caught an exception, while applying a Q factor " 00266 "computed previously by factor() to the matrix C. Unfortunate" 00267 "ly, its type information was lost, because the exception was " 00268 "thrown in another thread. Its \"what()\" function returns th" 00269 "e following string: " << ex.what(); 00270 throw std::runtime_error (os.str()); 00271 } 00272 00273 // Save the timings for future reference 00274 if (min_seq_timing < min_seq_apply_timing_) 00275 min_seq_apply_timing_ = min_seq_timing; 00276 if (max_seq_timing > max_seq_apply_timing_) 00277 max_seq_apply_timing_ = max_seq_timing; 00278 } 00279 } 00280 00281 00282 void 00283 explicit_Q (const LocalOrdinal nrows, 00284 const LocalOrdinal ncols_Q_in, 00285 const Scalar Q_in[], 00286 const LocalOrdinal ldq_in, 00287 const FactorOutput& factor_output, 00288 const LocalOrdinal ncols_Q_out, 00289 Scalar Q_out[], 00290 const LocalOrdinal ldq_out, 00291 const bool contiguous_cache_blocks = false) 00292 { 00293 using tbb::task; 00294 00295 mat_view Q_out_view (nrows, ncols_Q_out, Q_out, ldq_out); 00296 try { 00297 typedef ExplicitQTask< LocalOrdinal, Scalar > explicit_Q_task_t; 00298 explicit_Q_task_t& root_task = *new( task::allocate_root() ) 00299 explicit_Q_task_t (0, ncores()-1, Q_out_view, seq_, 00300 contiguous_cache_blocks); 00301 task::spawn_root_and_wait (root_task); 00302 } catch (tbb::captured_exception& ex) { 00303 std::ostringstream os; 00304 os << "Intel TBB caught an exception, while preparing to compute" 00305 " the explicit Q factor from a QR factorization computed previ" 00306 "ously by factor(). Unfortunately, its type information was l" 00307 "ost, because the exception was thrown in another thread. Its" 00308 " \"what()\" function returns the following string: " 00309 << ex.what(); 00310 throw std::runtime_error (os.str()); 00311 } 00312 apply (ApplyType::NoTranspose, 00313 nrows, ncols_Q_in, Q_in, ldq_in, factor_output, 00314 ncols_Q_out, Q_out, ldq_out, 00315 contiguous_cache_blocks); 00316 } 00317 00322 void 00323 Q_times_B (const LocalOrdinal nrows, 00324 const LocalOrdinal ncols, 00325 Scalar Q[], 00326 const LocalOrdinal ldq, 00327 const Scalar B[], 00328 const LocalOrdinal ldb, 00329 const bool contiguous_cache_blocks = false) const 00330 { 00331 // Compute Q := Q*B in parallel. This works much like 00332 // cache_block() (which see), in that each thread's instance 00333 // does not need to communicate with the others. 00334 try { 00335 using tbb::task; 00336 typedef RevealRankTask< LocalOrdinal, Scalar > rrtask_type; 00337 00338 mat_view Q_view (nrows, ncols, Q, ldq); 00339 const_mat_view B_view (ncols, ncols, B, ldb); 00340 00341 rrtask_type& root_task = *new( task::allocate_root() ) 00342 rrtask_type (0, ncores()-1, Q_view, B_view, seq_, 00343 contiguous_cache_blocks); 00344 task::spawn_root_and_wait (root_task); 00345 } catch (tbb::captured_exception& ex) { 00346 std::ostringstream os; 00347 os << "Intel TBB caught an exception, while computing Q := Q*U. " 00348 "Unfortunately, its type information was lost, because the " 00349 "exception was thrown in another thread. Its \"what()\" function " 00350 "returns the following string: " << ex.what(); 00351 throw std::runtime_error (os.str()); 00352 } 00353 } 00354 00355 00363 LocalOrdinal 00364 reveal_R_rank (const LocalOrdinal ncols, 00365 Scalar R[], 00366 const LocalOrdinal ldr, 00367 Scalar U[], 00368 const LocalOrdinal ldu, 00369 const magnitude_type tol) const 00370 { 00371 return seq_.reveal_R_rank (ncols, R, ldr, U, ldu, tol); 00372 } 00373 00385 LocalOrdinal 00386 reveal_rank (const LocalOrdinal nrows, 00387 const LocalOrdinal ncols, 00388 Scalar Q[], 00389 const LocalOrdinal ldq, 00390 Scalar R[], 00391 const LocalOrdinal ldr, 00392 const magnitude_type tol, 00393 const bool contiguous_cache_blocks = false) const 00394 { 00395 // Take the easy exit if available. 00396 if (ncols == 0) 00397 return 0; 00398 00399 Matrix< LocalOrdinal, Scalar > U (ncols, ncols, Scalar(0)); 00400 const LocalOrdinal rank = 00401 reveal_R_rank (ncols, R, ldr, U.get(), U.ldu(), tol); 00402 00403 if (rank < ncols) 00404 { 00405 // If R is not full rank: reveal_R_rank() already computed 00406 // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and 00407 // overwrote R with \f$\Sigma V^*\f$. Now, we compute \f$Q 00408 // := Q \cdot U\f$, respecting cache blocks of Q. 00409 Q_times_B (nrows, ncols, Q, ldq, U.get(), U.lda(), 00410 contiguous_cache_blocks); 00411 } 00412 return rank; 00413 } 00414 00415 void 00416 cache_block (const LocalOrdinal nrows, 00417 const LocalOrdinal ncols, 00418 Scalar A_out[], 00419 const Scalar A_in[], 00420 const LocalOrdinal lda_in) const 00421 { 00422 using tbb::task; 00423 00424 const_mat_view A_in_view (nrows, ncols, A_in, lda_in); 00425 // A_out won't have leading dimension lda_in, but that's OK, 00426 // as long as all the routines are told that A_out is 00427 // cache-blocked. 00428 mat_view A_out_view (nrows, ncols, A_out, lda_in); 00429 try { 00430 typedef CacheBlockTask< LocalOrdinal, Scalar > cache_block_task_t; 00431 cache_block_task_t& root_task = *new( task::allocate_root() ) 00432 cache_block_task_t (0, ncores()-1, A_out_view, A_in_view, seq_); 00433 task::spawn_root_and_wait (root_task); 00434 } catch (tbb::captured_exception& ex) { 00435 std::ostringstream os; 00436 os << "Intel TBB caught an exception, while cache-blocking a mat" 00437 "rix. Unfortunately, its type information was lost, because t" 00438 "he exception was thrown in another thread. Its \"what()\" fu" 00439 "nction returns the following string: " << ex.what(); 00440 throw std::runtime_error (os.str()); 00441 } 00442 } 00443 00444 void 00445 un_cache_block (const LocalOrdinal nrows, 00446 const LocalOrdinal ncols, 00447 Scalar A_out[], 00448 const LocalOrdinal lda_out, 00449 const Scalar A_in[]) const 00450 { 00451 using tbb::task; 00452 00453 // A_in doesn't have leading dimension lda_out, but that's OK, 00454 // as long as all the routines are told that A_in is cache- 00455 // blocked. 00456 const_mat_view A_in_view (nrows, ncols, A_in, lda_out); 00457 mat_view A_out_view (nrows, ncols, A_out, lda_out); 00458 try { 00459 typedef UnCacheBlockTask< LocalOrdinal, Scalar > un_cache_block_task_t; 00460 un_cache_block_task_t& root_task = *new( task::allocate_root() ) 00461 un_cache_block_task_t (0, ncores()-1, A_out_view, A_in_view, seq_); 00462 task::spawn_root_and_wait (root_task); 00463 } catch (tbb::captured_exception& ex) { 00464 std::ostringstream os; 00465 os << "Intel TBB caught an exception, while un-cache-blocking a " 00466 "matrix. Unfortunately, its type information was lost, becaus" 00467 "e the exception was thrown in another thread. Its \"what()\"" 00468 " function returns the following string: " << ex.what(); 00469 throw std::runtime_error (os.str()); 00470 } 00471 } 00472 00473 template< class MatrixViewType > 00474 MatrixViewType 00475 top_block (const MatrixViewType& C, 00476 const bool contiguous_cache_blocks = false) const 00477 { 00478 return top_block_helper (0, ncores()-1, C, contiguous_cache_blocks); 00479 } 00480 00481 void 00482 fill_with_zeros (const LocalOrdinal nrows, 00483 const LocalOrdinal ncols, 00484 Scalar C[], 00485 const LocalOrdinal ldc, 00486 const bool contiguous_cache_blocks = false) const 00487 { 00488 using tbb::task; 00489 mat_view C_view (nrows, ncols, C, ldc); 00490 00491 try { 00492 typedef FillWithZerosTask< LocalOrdinal, Scalar > fill_task_t; 00493 fill_task_t& root_task = *new( task::allocate_root() ) 00494 fill_task_t (0, ncores()-1, C_view, seq_, contiguous_cache_blocks); 00495 task::spawn_root_and_wait (root_task); 00496 } catch (tbb::captured_exception& ex) { 00497 std::ostringstream os; 00498 os << "Intel TBB caught an exception, while un-cache-blocking a " 00499 "matrix. Unfortunately, its type information was lost, becaus" 00500 "e the exception was thrown in another thread. Its \"what()\"" 00501 " function returns the following string: " << ex.what(); 00502 throw std::runtime_error (os.str()); 00503 } 00504 } 00505 00506 private: 00507 size_t ncores_; 00508 TSQR::SequentialTsqr< LocalOrdinal, Scalar > seq_; 00509 TSQR::Combine< LocalOrdinal, Scalar > combine_; 00510 Partitioner< LocalOrdinal, Scalar > partitioner_; 00511 00512 double min_seq_factor_timing_; 00513 double max_seq_factor_timing_; 00514 double min_seq_apply_timing_; 00515 double max_seq_apply_timing_; 00516 00517 void 00518 build_partition_array (const size_t P_first, 00519 const size_t P_last, 00520 array_top_blocks_t& top_blocks, 00521 const_mat_view& Q, 00522 mat_view& C, 00523 const bool contiguous_cache_blocks = false) const 00524 { 00525 if (P_first > P_last) 00526 return; 00527 else if (P_first == P_last) 00528 { 00529 const_mat_view Q_top = seq_.top_block (Q, contiguous_cache_blocks); 00530 mat_view C_top = seq_.top_block (C, contiguous_cache_blocks); 00531 top_blocks[P_first] = 00532 std::make_pair (const_mat_view (Q_top.ncols(), Q_top.ncols(), Q_top.get(), Q_top.lda()), 00533 mat_view (C_top.ncols(), C_top.ncols(), C_top.get(), C_top.lda())); 00534 } 00535 else 00536 { 00537 // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last] 00538 const size_t P_mid = (P_first + P_last) / 2; 00539 const_split_t Q_split = 00540 partitioner_.split (Q, P_first, P_mid, P_last, 00541 contiguous_cache_blocks); 00542 split_t C_split = 00543 partitioner_.split (C, P_first, P_mid, P_last, 00544 contiguous_cache_blocks); 00545 build_partition_array (P_first, P_mid, top_blocks, Q_split.first, 00546 C_split.first, contiguous_cache_blocks); 00547 build_partition_array (P_mid+1, P_last, top_blocks, Q_split.second, 00548 C_split.second, contiguous_cache_blocks); 00549 } 00550 } 00551 00552 00553 }; 00554 00555 } // namespace TBB 00556 } // namespace TSQR 00557 00558 #endif // __TSQR_TBB_TbbParallelTsqr_hpp
1.7.4