|
Anasazi Version of the Day
|
00001 // @HEADER 00002 // *********************************************************************** 00003 // 00004 // Anasazi: Block Eigensolvers Package 00005 // Copyright (2010) Sandia Corporation 00006 // 00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive 00008 // license for use of this work by or on behalf of the U.S. Government. 00009 // 00010 // This library is free software; you can redistribute it and/or modify 00011 // it under the terms of the GNU Lesser General Public License as 00012 // published by the Free Software Foundation; either version 2.1 of the 00013 // License, or (at your option) any later version. 00014 // 00015 // This library is distributed in the hope that it will be useful, but 00016 // WITHOUT ANY WARRANTY; without even the implied warranty of 00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00018 // Lesser General Public License for more details. 00019 // 00020 // You should have received a copy of the GNU Lesser General Public 00021 // License along with this library; if not, write to the Free Software 00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 00023 // USA 00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 00025 // 00026 // *********************************************************************** 00027 // @HEADER 00028 00029 #ifndef __TSQR_Test_TsqrTest_hpp 00030 #define __TSQR_Test_TsqrTest_hpp 00031 00032 #include <Tsqr_Config.hpp> 00033 00034 #include <Tsqr.hpp> 00035 #ifdef HAVE_TSQR_INTEL_TBB 00036 # include <TbbTsqr.hpp> 00037 #endif // HAVE_TSQR_INTEL_TBB 00038 #include <Tsqr_TestSetup.hpp> 00039 #include <Tsqr_GlobalVerify.hpp> 00040 #include <Tsqr_printGlobalMatrix.hpp> 00041 #include <Tsqr_verifyTimerConcept.hpp> 00042 00043 #include <cstring> // size_t 00044 #include <iostream> 00045 #include <stdexcept> 00046 #include <string> 00047 00050 00051 namespace TSQR { 00052 namespace Test { 00053 00054 template< class TsqrType > 00055 class TsqrVerifier { 00056 public: 00057 typedef TsqrType tsqr_type; 00058 typedef typename tsqr_type::scalar_type scalar_type; 00059 typedef typename tsqr_type::ordinal_type ordinal_type; 00060 typedef Matrix< ordinal_type, scalar_type > matrix_type; 00061 typedef typename tsqr_type::FactorOutput factor_output_type; 00062 typedef MessengerBase< scalar_type > messenger_type; 00063 typedef Teuchos::RCP< messenger_type > messenger_ptr; 00064 00065 static void 00066 verify (tsqr_type& tsqr, 00067 const messenger_ptr& scalarComm, 00068 const matrix_type& A_local, 00069 matrix_type& A_copy, 00070 matrix_type& Q_local, 00071 matrix_type& R, 00072 const bool contiguousCacheBlocks, 00073 const bool b_debug = false) 00074 { 00075 using std::cerr; 00076 using std::endl; 00077 00078 const ordinal_type nrows_local = A_local.nrows(); 00079 const ordinal_type ncols = A_local.ncols(); 00080 00081 // If specified, rearrange cache blocks in the copy. 00082 if (contiguousCacheBlocks) 00083 { 00084 tsqr.cache_block (nrows_local, ncols, A_copy.get(), 00085 A_local.get(), A_local.lda()); 00086 if (b_debug) 00087 { 00088 scalarComm->barrier(); 00089 if (scalarComm->rank() == 0) 00090 cerr << "-- Cache-blocked input matrix to factor." << endl; 00091 } 00092 } 00093 else 00094 A_copy.copy (A_local); 00095 00096 const bool testFactorExplicit = true; 00097 if (testFactorExplicit) 00098 { 00099 tsqr.factorExplicit (A_copy.view(), Q_local.view(), R.view(), 00100 contiguousCacheBlocks); 00101 if (b_debug) 00102 { 00103 scalarComm->barrier(); 00104 if (scalarComm->rank() == 0) 00105 cerr << "-- Finished Tsqr::factorExplicit" << endl; 00106 } 00107 } 00108 else 00109 { 00110 // Factor the (copy of the) matrix. 00111 factor_output_type factorOutput = 00112 tsqr.factor (nrows_local, ncols, A_copy.get(), A_copy.lda(), 00113 R.get(), R.lda(), contiguousCacheBlocks); 00114 if (b_debug) 00115 { 00116 scalarComm->barrier(); 00117 if (scalarComm->rank() == 0) 00118 cerr << "-- Finished Tsqr::factor" << endl; 00119 } 00120 00121 // Compute the explicit Q factor in Q_local 00122 tsqr.explicit_Q (nrows_local, 00123 ncols, A_copy.get(), A_copy.lda(), factorOutput, 00124 ncols, Q_local.get(), Q_local.lda(), 00125 contiguousCacheBlocks); 00126 if (b_debug) 00127 { 00128 scalarComm->barrier(); 00129 if (scalarComm->rank() == 0) 00130 cerr << "-- Finished Tsqr::explicit_Q" << endl; 00131 } 00132 } 00133 00134 // "Un"-cache-block the output, if contiguous cache blocks were 00135 // used. This is only necessary because global_verify() doesn't 00136 // currently support contiguous cache blocks. 00137 if (contiguousCacheBlocks) 00138 { 00139 // We can use A_copy as scratch space for un-cache-blocking 00140 // Q_local, since we're done using A_copy for other things. 00141 tsqr.un_cache_block (nrows_local, ncols, A_copy.get(), 00142 A_copy.lda(), Q_local.get()); 00143 // Overwrite Q_local with the un-cache-blocked Q factor. 00144 Q_local.copy (A_copy); 00145 00146 if (b_debug) 00147 { 00148 scalarComm->barrier(); 00149 if (scalarComm->rank() == 0) 00150 cerr << "-- Un-cache-blocked output Q factor" << endl; 00151 } 00152 } 00153 } 00154 }; 00155 00195 template< class Ordinal, class Scalar, class Generator > 00196 void 00197 verifyTsqr (const std::string& which, 00198 Generator& generator, 00199 const Ordinal nrows_global, 00200 const Ordinal ncols, 00201 const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm, 00202 const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm, 00203 const int num_cores = 1, 00204 const size_t cache_block_size = 0, 00205 const bool contiguousCacheBlocks = false, 00206 const bool human_readable = false, 00207 const bool b_debug = false) 00208 { 00209 typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type; 00210 using std::cerr; 00211 using std::cout; 00212 using std::endl; 00213 00214 const bool b_extra_debug = false; 00215 const int nprocs = scalarComm->size(); 00216 const int my_rank = scalarComm->rank(); 00217 if (b_debug) 00218 { 00219 scalarComm->barrier(); 00220 if (my_rank == 0) 00221 cerr << "tsqr_verify:" << endl; 00222 scalarComm->barrier(); 00223 } 00224 const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs); 00225 00226 // Set up storage for the test problem. 00227 Matrix< Ordinal, Scalar > A_local (nrows_local, ncols); 00228 Matrix< Ordinal, Scalar > Q_local (nrows_local, ncols); 00229 if (std::numeric_limits< Scalar >::has_quiet_NaN) 00230 { 00231 A_local.fill (std::numeric_limits< Scalar >::quiet_NaN()); 00232 Q_local.fill (std::numeric_limits< Scalar >::quiet_NaN()); 00233 } 00234 Matrix< Ordinal, Scalar > R (ncols, ncols, Scalar(0)); 00235 00236 // Generate the test problem. 00237 distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get()); 00238 if (b_debug) 00239 { 00240 scalarComm->barrier(); 00241 if (my_rank == 0) 00242 cerr << "-- Generated test problem." << endl; 00243 } 00244 00245 // Make sure that the test problem (the matrix to factor) was 00246 // distributed correctly. 00247 if (b_extra_debug && b_debug) 00248 { 00249 if (my_rank == 0) 00250 cerr << "Test matrix A:" << endl; 00251 scalarComm->barrier (); 00252 printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get()); 00253 scalarComm->barrier (); 00254 } 00255 00256 // Factoring the matrix stored in A_local overwrites it, so we 00257 // make a copy of A_local. Initialize with NaNs to make sure 00258 // that cache blocking works correctly (if applicable). 00259 Matrix< Ordinal, Scalar > A_copy (nrows_local, ncols); 00260 if (std::numeric_limits< Scalar >::has_quiet_NaN) 00261 A_copy.fill (std::numeric_limits< Scalar >::quiet_NaN()); 00262 00263 // actual_cache_block_size: "cache_block_size" is just a 00264 // suggestion. TSQR determines the cache block size itself; 00265 // this remembers it so we can print it out later. 00266 size_t actual_cache_block_size; 00267 00268 if (which == "MpiTbbTSQR") 00269 { 00270 #ifdef HAVE_TSQR_INTEL_TBB 00271 using Teuchos::RCP; 00272 typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar > node_tsqr_type; 00273 typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type; 00274 typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type; 00275 00276 RCP< node_tsqr_type > node_tsqr (new node_tsqr_type (num_cores, cache_block_size)); 00277 RCP< dist_tsqr_type > dist_tsqr (new dist_tsqr_type (scalarComm)); 00278 tsqr_type tsqr (node_tsqr, dist_tsqr); 00279 00280 // Compute the factorization and explicit Q factor. 00281 TsqrVerifier< tsqr_type >::verify (tsqr, scalarComm, A_local, A_copy, 00282 Q_local, R, contiguousCacheBlocks, 00283 b_debug); 00284 // Save the "actual" cache block size 00285 actual_cache_block_size = tsqr.cache_block_size(); 00286 #else 00287 throw std::logic_error("TSQR not built with Intel TBB support"); 00288 #endif // HAVE_TSQR_INTEL_TBB 00289 } 00290 else if (which == "MpiSeqTSQR") 00291 { 00292 using Teuchos::RCP; 00293 typedef SequentialTsqr< Ordinal, Scalar > node_tsqr_type; 00294 typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type; 00295 typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type; 00296 00297 RCP< node_tsqr_type > node_tsqr (new node_tsqr_type (cache_block_size)); 00298 RCP< dist_tsqr_type > dist_tsqr (new dist_tsqr_type (scalarComm)); 00299 tsqr_type tsqr (node_tsqr, dist_tsqr); 00300 00301 // Compute the factorization and explicit Q factor. 00302 TsqrVerifier< tsqr_type >::verify (tsqr, scalarComm, A_local, A_copy, 00303 Q_local, R, contiguousCacheBlocks, 00304 b_debug); 00305 // Save the "actual" cache block size 00306 actual_cache_block_size = tsqr.cache_block_size(); 00307 } 00308 else 00309 throw std::logic_error("Unknown TSQR implementation type \"" + which + "\""); 00310 00311 // Print out the Q and R factors 00312 if (b_extra_debug && b_debug) 00313 { 00314 if (my_rank == 0) 00315 cerr << endl << "Q factor:" << endl; 00316 scalarComm->barrier(); 00317 printGlobalMatrix (cerr, Q_local, scalarComm.get(), ordinalComm.get()); 00318 scalarComm->barrier (); 00319 if (my_rank == 0) 00320 { 00321 cerr << endl << "R factor:" << endl; 00322 print_local_matrix (cerr, ncols, ncols, R.get(), R.lda()); 00323 cerr << endl; 00324 } 00325 scalarComm->barrier (); 00326 } 00327 00328 // Test accuracy of the resulting factorization 00329 std::pair< magnitude_type, magnitude_type > result = 00330 global_verify (nrows_local, ncols, A_local.get(), A_local.lda(), 00331 Q_local.get(), Q_local.lda(), R.get(), R.lda(), 00332 scalarComm.get()); 00333 if (b_debug) 00334 { 00335 scalarComm->barrier(); 00336 if (my_rank == 0) 00337 cerr << "-- Finished global_verify" << endl; 00338 } 00339 00340 // Print the results on Proc 0. 00341 if (my_rank == 0) 00342 { 00343 if (human_readable) 00344 { 00345 std::string human_readable_name; 00346 00347 if (which == "MpiSeqTSQR") 00348 human_readable_name = "MPI parallel / cache-blocked TSQR"; 00349 else if (which == "MpiTbbTSQR") 00350 { 00351 #ifdef HAVE_TSQR_INTEL_TBB 00352 human_readable_name = "MPI parallel / TBB parallel / cache-blocked TSQR"; 00353 #else 00354 throw std::logic_error("TSQR not built with Intel TBB support"); 00355 #endif // HAVE_TSQR_INTEL_TBB 00356 } 00357 else 00358 throw std::logic_error("Unknown TSQR implementation type \"" + which + "\""); 00359 00360 cout << human_readable_name << ":" << endl 00361 << "# rows = " << nrows_global << endl 00362 << "# columns = " << ncols << endl 00363 << "# MPI processes = " << nprocs << endl; 00364 #ifdef HAVE_TSQR_INTEL_TBB 00365 if (which == "MpiTbbTSQR") 00366 cout << "# cores per process = " << num_cores << endl; 00367 #endif // HAVE_TSQR_INTEL_TBB 00368 cout << "cache block # bytes = " << actual_cache_block_size << endl 00369 << "contiguous cache blocks? " << contiguousCacheBlocks << endl 00370 << "Relative residual $\\|A - Q*R\\|_2 / \\|A\\|_2$ = " 00371 << result.first << endl 00372 << "Relative orthogonality $\\|I - Q^T*Q\\|_2$ = " 00373 << result.second << endl 00374 << endl; 00375 } 00376 else 00377 { 00378 cout << which 00379 << "," << nrows_global 00380 << "," << ncols 00381 << "," << nprocs; 00382 #ifdef HAVE_TSQR_INTEL_TBB 00383 if (which == "MpiTbbTSQR") 00384 cout << "," << num_cores << endl; 00385 #endif // HAVE_TSQR_INTEL_TBB 00386 cout << "," << actual_cache_block_size 00387 << "," << contiguousCacheBlocks 00388 << "," << result.first 00389 << "," << result.second 00390 << endl; 00391 } 00392 } 00393 } 00394 00395 00396 template< class TsqrBase, class TimerType > 00397 double 00398 do_tsqr_benchmark (const std::string& which, 00399 TsqrBase& tsqr, 00400 const Teuchos::RCP< MessengerBase< typename TsqrBase::scalar_type > >& messenger, 00401 const Matrix< typename TsqrBase::ordinal_type, typename TsqrBase::scalar_type >& A_local, 00402 Matrix< typename TsqrBase::ordinal_type, typename TsqrBase::scalar_type >& A_copy, 00403 Matrix< typename TsqrBase::ordinal_type, typename TsqrBase::scalar_type >& Q_local, 00404 Matrix< typename TsqrBase::ordinal_type, typename TsqrBase::scalar_type >& R, 00405 const int ntrials, 00406 const bool contiguousCacheBlocks, 00407 const bool human_readable, 00408 const bool b_debug = false) 00409 { 00410 typedef typename TsqrBase::FactorOutput factor_output_type; 00411 typedef typename TsqrBase::ordinal_type ordinal_type; 00412 using std::cerr; 00413 using std::cout; 00414 using std::endl; 00415 00416 const ordinal_type nrows_local = A_local.nrows(); 00417 const ordinal_type ncols = A_local.ncols(); 00418 00419 if (contiguousCacheBlocks) 00420 { 00421 tsqr.cache_block (nrows_local, ncols, A_copy.get(), 00422 A_local.get(), A_local.lda()); 00423 if (b_debug) 00424 { 00425 messenger->barrier(); 00426 if (messenger->rank() == 0) 00427 cerr << "-- Cache-blocked input matrix to factor." << endl; 00428 } 00429 } 00430 else 00431 A_copy.copy (A_local); 00432 00433 if (b_debug) 00434 { 00435 messenger->barrier(); 00436 if (messenger->rank() == 0) 00437 cerr << "-- Starting timing loop" << endl; 00438 } 00439 00440 // Benchmark TSQR for ntrials trials. The answer (the numerical 00441 // results of the factorization) is only valid if ntrials == 1, 00442 // but this is a benchmark and not a verification routine. Call 00443 // tsqr_verify() if you want to determine whether TSQR computes 00444 // the right answer. 00445 // 00446 // Name of timer doesn't matter here; we only need the timing. 00447 TSQR::Test::verifyTimerConcept< TimerType >(); 00448 TimerType timer (which); 00449 00450 00451 const bool testFactorExplicit = true; 00452 double tsqr_timing; 00453 if (testFactorExplicit) 00454 { 00455 timer.start(); 00456 for (int trial_num = 0; trial_num < ntrials; ++trial_num) 00457 tsqr.factorExplicit (A_copy.view(), Q_local.view(), R.view(), 00458 contiguousCacheBlocks); 00459 tsqr_timing = timer.stop(); 00460 } 00461 else 00462 { 00463 timer.start(); 00464 for (int trial_num = 0; trial_num < ntrials; ++trial_num) 00465 { 00466 // Factor the matrix and compute the explicit Q factor. 00467 // Don't worry about the fact that we're overwriting the 00468 // input; this is a benchmark, not a numerical verification 00469 // test. (We have the latter implemented as tsqr_verify() 00470 // in this file.) For the same reason, don't worry about 00471 // un-cache-blocking the output (when cache blocks are 00472 // stored contiguously). 00473 factor_output_type factor_output = 00474 tsqr.factor (nrows_local, ncols, A_copy.get(), A_copy.lda(), 00475 R.get(), R.lda(), contiguousCacheBlocks); 00476 tsqr.explicit_Q (nrows_local, 00477 ncols, A_copy.get(), A_copy.lda(), factor_output, 00478 ncols, Q_local.get(), Q_local.lda(), 00479 contiguousCacheBlocks); 00480 // Timings in debug mode likely won't make sense, because 00481 // Proc 0 is outputting the debug messages to cerr. 00482 // Nevertheless, we don't put any "if(b_debug)" calls in the 00483 // timing loop. 00484 } 00485 // Compute the resulting total time (in seconds) to execute 00486 // ntrials runs of Tsqr::factor() and Tsqr::explicit_Q(). The 00487 // time may differ on different MPI processes. 00488 tsqr_timing = timer.stop(); 00489 } 00490 00491 if (b_debug) 00492 { 00493 messenger->barrier(); 00494 if (messenger->rank() == 0) 00495 cerr << "-- Finished timing loop" << endl; 00496 } 00497 return tsqr_timing; 00498 } 00499 00548 template< class Ordinal, class Scalar, class Generator, class TimerType > 00549 void 00550 benchmarkTsqr (const std::string& which, 00551 Generator& generator, 00552 const int ntrials, 00553 const Ordinal nrows_global, 00554 const Ordinal ncols, 00555 const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm, 00556 const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm, 00557 const Ordinal num_cores, 00558 const size_t cache_block_size, 00559 const bool contiguousCacheBlocks, 00560 const bool human_readable, 00561 const bool b_debug) 00562 { 00563 using std::cerr; 00564 using std::cout; 00565 using std::endl; 00566 00567 TSQR::Test::verifyTimerConcept< TimerType >(); 00568 const bool b_extra_debug = false; 00569 const int nprocs = scalarComm->size(); 00570 const int my_rank = scalarComm->rank(); 00571 if (b_debug) 00572 { 00573 scalarComm->barrier(); 00574 if (my_rank == 0) 00575 cerr << "tsqr_benchmark:" << endl; 00576 scalarComm->barrier(); 00577 } 00578 const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs); 00579 00580 // Set up storage for the test problem. 00581 Matrix< Ordinal, Scalar > A_local (nrows_local, ncols); 00582 Matrix< Ordinal, Scalar > Q_local (nrows_local, ncols); 00583 if (std::numeric_limits< Scalar >::has_quiet_NaN) 00584 { 00585 A_local.fill (std::numeric_limits< Scalar >::quiet_NaN()); 00586 Q_local.fill (std::numeric_limits< Scalar >::quiet_NaN()); 00587 } 00588 Matrix< Ordinal, Scalar > R (ncols, ncols, Scalar(0)); 00589 00590 // Generate the test problem. 00591 distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get()); 00592 if (b_debug) 00593 { 00594 scalarComm->barrier(); 00595 if (my_rank == 0) 00596 cerr << "-- Generated test problem." << endl; 00597 } 00598 00599 // Make sure that the test problem (the matrix to factor) was 00600 // distributed correctly. 00601 if (b_extra_debug && b_debug) 00602 { 00603 if (my_rank == 0) 00604 cerr << "Test matrix A:" << endl; 00605 scalarComm->barrier (); 00606 printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get()); 00607 scalarComm->barrier (); 00608 } 00609 00610 // Factoring the matrix stored in A_local overwrites it, so we 00611 // make a copy of A_local. If specified, rearrange cache blocks 00612 // in the copy. Initialize with NaNs to make sure that cache 00613 // blocking worked correctly. 00614 Matrix< Ordinal, Scalar > A_copy (nrows_local, ncols); 00615 if (std::numeric_limits< Scalar >::has_quiet_NaN) 00616 A_copy.fill (std::numeric_limits< Scalar >::quiet_NaN()); 00617 00618 // actual_cache_block_size: "cache_block_size" is just a 00619 // suggestion. TSQR determines the cache block size itself; 00620 // this remembers it so we can print it out later. 00621 size_t actual_cache_block_size; 00622 // Run time (in seconds, as a double-precision floating-point 00623 // value) for TSQR on this MPI node. 00624 double tsqr_timing; 00625 00626 if (which == "MpiTbbTSQR") 00627 { 00628 #ifdef HAVE_TSQR_INTEL_TBB 00629 using Teuchos::RCP; 00630 typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar > node_tsqr_type; 00631 typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type; 00632 typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type; 00633 00634 RCP< node_tsqr_type > nodeTsqr (new node_tsqr_type (num_cores, cache_block_size)); 00635 RCP< dist_tsqr_type > distTsqr (new dist_tsqr_type (scalarComm)); 00636 tsqr_type tsqr (nodeTsqr, distTsqr); 00637 00638 // Run the benchmark. 00639 tsqr_timing = 00640 do_tsqr_benchmark< tsqr_type, TimerType > (which, tsqr, scalarComm, A_local, 00641 A_copy, Q_local, R, ntrials, 00642 contiguousCacheBlocks, 00643 human_readable, b_debug); 00644 00645 // Save the "actual" cache block size 00646 actual_cache_block_size = tsqr.cache_block_size(); 00647 #else 00648 throw std::logic_error("TSQR not built with Intel TBB support"); 00649 #endif // HAVE_TSQR_INTEL_TBB 00650 } 00651 else if (which == "MpiSeqTSQR") 00652 { 00653 using Teuchos::RCP; 00654 typedef SequentialTsqr< Ordinal, Scalar > node_tsqr_type; 00655 typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type; 00656 typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type; 00657 00658 // Set up TSQR. 00659 RCP< node_tsqr_type > nodeTsqr (new node_tsqr_type (cache_block_size)); 00660 RCP< dist_tsqr_type > distTsqr (new dist_tsqr_type (scalarComm)); 00661 tsqr_type tsqr (nodeTsqr, distTsqr); 00662 00663 // Run the benchmark. 00664 tsqr_timing = 00665 do_tsqr_benchmark< tsqr_type, TimerType > (which, tsqr, scalarComm, A_local, 00666 A_copy, Q_local, R, ntrials, 00667 contiguousCacheBlocks, 00668 human_readable, b_debug); 00669 // Save the "actual" cache block size 00670 actual_cache_block_size = tsqr.cache_block_size(); 00671 } 00672 else 00673 throw std::logic_error("Unknown TSQR implementation type \"" + which + "\""); 00674 00675 // Find the min and max TSQR timing on all processors. 00676 const double min_tsqr_timing = scalarComm->globalMin (tsqr_timing); 00677 const double max_tsqr_timing = scalarComm->globalMax (tsqr_timing); 00678 00679 // Print the results on Proc 0. 00680 if (my_rank == 0) 00681 { 00682 if (human_readable) 00683 { 00684 std::string human_readable_name; 00685 00686 if (which == "MpiSeqTSQR") 00687 human_readable_name = "MPI parallel / cache-blocked TSQR"; 00688 else if (which == "MpiTbbTSQR") 00689 { 00690 #ifdef HAVE_TSQR_INTEL_TBB 00691 human_readable_name = "MPI parallel / TBB parallel / cache-blocked TSQR"; 00692 #else 00693 throw std::logic_error("TSQR not built with Intel TBB support"); 00694 #endif // HAVE_TSQR_INTEL_TBB 00695 } 00696 else 00697 throw std::logic_error("Unknown TSQR implementation type \"" + which + "\""); 00698 00699 cout << human_readable_name << ":" << endl 00700 << "# rows = " << nrows_global << endl 00701 << "# columns = " << ncols << endl 00702 << "# MPI processes = " << nprocs << endl; 00703 00704 #ifdef HAVE_TSQR_INTEL_TBB 00705 if (which == "MpiTbbTSQR") 00706 cout << "# cores per process = " << num_cores << endl; 00707 #endif // HAVE_TSQR_INTEL_TBB 00708 00709 cout << "cache block # bytes = " << actual_cache_block_size << endl 00710 << "contiguous cache blocks? " << contiguousCacheBlocks << endl 00711 << "# trials = " << ntrials << endl 00712 << "Min total time (s) over all MPI processes = " 00713 << min_tsqr_timing << endl 00714 << "Max total time (s) over all MPI processes = " 00715 << max_tsqr_timing << endl 00716 << endl; 00717 } 00718 else 00719 { 00720 cout << which 00721 << "," << nrows_global 00722 << "," << ncols 00723 << "," << nprocs ; 00724 #ifdef HAVE_TSQR_INTEL_TBB 00725 if (which == "MpiTbbTSQR") 00726 cout << "," << num_cores << endl; 00727 #endif // HAVE_TSQR_INTEL_TBB 00728 cout << "," << actual_cache_block_size 00729 << "," << contiguousCacheBlocks 00730 << "," << ntrials 00731 << "," << min_tsqr_timing 00732 << "," << max_tsqr_timing 00733 << endl; 00734 } 00735 } 00736 } 00737 00738 00739 } // namespace Test 00740 } // namespace TSQR 00741 00742 #endif // __TSQR_Test_TsqrTest_hpp
1.7.4