|
Anasazi Version of the Day
|
00001 // @HEADER 00002 // *********************************************************************** 00003 // 00004 // Anasazi: Block Eigensolvers Package 00005 // Copyright (2010) Sandia Corporation 00006 // 00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive 00008 // license for use of this work by or on behalf of the U.S. Government. 00009 // 00010 // This library is free software; you can redistribute it and/or modify 00011 // it under the terms of the GNU Lesser General Public License as 00012 // published by the Free Software Foundation; either version 2.1 of the 00013 // License, or (at your option) any later version. 00014 // 00015 // This library is distributed in the hope that it will be useful, but 00016 // WITHOUT ANY WARRANTY; without even the implied warranty of 00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00018 // Lesser General Public License for more details. 00019 // 00020 // You should have received a copy of the GNU Lesser General Public 00021 // License along with this library; if not, write to the Free Software 00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 00023 // USA 00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 00025 // 00026 // *********************************************************************** 00027 // @HEADER 00028 00029 #ifndef __TSQR_Test_MgsTest_hpp 00030 #define __TSQR_Test_MgsTest_hpp 00031 00032 #include <Tsqr_Config.hpp> 00033 00034 #include <Tsqr_Mgs.hpp> 00035 #ifdef HAVE_TSQR_INTEL_TBB 00036 # include <TbbTsqr_TbbMgs.hpp> 00037 #endif // HAVE_TSQR_INTEL_TBB 00038 #include <Tsqr_TestSetup.hpp> 00039 #include <Tsqr_GlobalVerify.hpp> 00040 #include <Tsqr_printGlobalMatrix.hpp> 00041 #include <Tsqr_verifyTimerConcept.hpp> 00042 00043 #include <Teuchos_RCP.hpp> 00044 00045 #include <algorithm> 00046 #include <sstream> 00047 #include <limits> 00048 #include <iostream> 00049 #include <stdexcept> 00050 #include <utility> 00051 00054 00055 namespace TSQR { 00056 namespace Test { 00057 00058 static std::string 00059 mgs_human_readable_name (const std::string& which) 00060 { 00061 if (which == "MpiSeqMGS") 00062 return std::string ("MPI parallel / sequential MGS"); 00063 else if (which == "MpiTbbMGS") 00064 { 00065 #ifdef HAVE_TSQR_INTEL_TBB 00066 return std::string ("MPI parallel / TBB parallel MGS"); 00067 #else 00068 throw std::logic_error("MGS not built with Intel TBB support"); 00069 #endif // HAVE_TSQR_INTEL_TBB 00070 } 00071 else 00072 throw std::logic_error("Unknown MGS implementation type \"" + which + "\""); 00073 } 00074 00075 template< class MgsType > 00076 class MgsVerifier { 00077 public: 00078 typedef MgsType mgs_type; 00079 typedef typename MgsType::ordinal_type ordinal_type; 00080 typedef typename MgsType::scalar_type scalar_type; 00081 typedef Matrix< ordinal_type, scalar_type > matrix_type; 00082 typedef MessengerBase< scalar_type > messenger_type; 00083 typedef Teuchos::RCP< messenger_type > messenger_ptr; 00084 00085 static void 00086 verify (mgs_type& orthogonalizer, 00087 const messenger_ptr& messenger, 00088 matrix_type& Q_local, 00089 matrix_type& R, 00090 const bool b_debug = false) 00091 { 00092 using std::cerr; 00093 using std::endl; 00094 00095 // Factor the (copy of the) matrix. On output, the explicit Q 00096 // factor (of A_local) is in Q_local and the R factor is in R. 00097 orthogonalizer.mgs (Q_local.nrows(), Q_local.ncols(), 00098 Q_local.get(), Q_local.lda(), 00099 R.get(), R.lda()); 00100 if (b_debug) 00101 { 00102 messenger->barrier(); 00103 if (messenger->rank() == 0) 00104 cerr << "-- Finished MGS::mgs" << endl; 00105 } 00106 } 00107 }; 00108 00109 template< class Ordinal, class Scalar, class Generator > 00110 void 00111 verifyMgs (const std::string& which, 00112 Generator& generator, 00113 const Ordinal nrows_global, 00114 const Ordinal ncols, 00115 const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm, 00116 const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm, 00117 const int num_cores, 00118 const bool human_readable, 00119 const bool b_debug) 00120 { 00121 typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type; 00122 using std::cerr; 00123 using std::cout; 00124 using std::endl; 00125 00126 const bool b_extra_debug = false; 00127 const int nprocs = scalarComm->size(); 00128 const int my_rank = scalarComm->rank(); 00129 if (b_debug) 00130 { 00131 scalarComm->barrier(); 00132 if (my_rank == 0) 00133 cerr << "mgs_verify:" << endl; 00134 scalarComm->barrier(); 00135 } 00136 const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs); 00137 00138 // Set up storage for the test problem 00139 Matrix< Ordinal, Scalar > A_local (nrows_local, ncols); 00140 if (std::numeric_limits< Scalar >::has_quiet_NaN) 00141 A_local.fill (std::numeric_limits< Scalar >::quiet_NaN()); 00142 Matrix< Ordinal, Scalar > R (ncols, ncols, Scalar(0)); 00143 00144 // Generate the test problem. 00145 distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get()); 00146 if (b_debug) 00147 { 00148 scalarComm->barrier(); 00149 if (my_rank == 0) 00150 cerr << "-- Generated test problem." << endl; 00151 } 00152 00153 // Make sure that the test problem (the matrix to factor) was 00154 // distributed correctly. 00155 if (b_extra_debug && b_debug) 00156 { 00157 if (my_rank == 0) 00158 cerr << "Test matrix A:" << endl; 00159 scalarComm->barrier(); 00160 printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get()); 00161 scalarComm->barrier(); 00162 } 00163 00164 // Factoring the matrix stored in A_local overwrites it, so we 00165 // copy A_local into Q_local. MGS orthogonalization does not 00166 // support contiguously stored cache blocks, unlike TSQR, so we 00167 // don't have to consider whether or not to rearrange cache 00168 // blocks here (unlike with TSQR). 00169 Matrix< Ordinal, Scalar > Q_local (A_local); 00170 00171 if (b_debug) 00172 { 00173 scalarComm->barrier(); 00174 if (my_rank == 0) 00175 cerr << "-- Starting verification" << endl; 00176 } 00177 00178 if (which == "MpiTbbMGS") 00179 { 00180 #ifdef HAVE_TSQR_INTEL_TBB 00181 typedef TSQR::TBB::TbbMgs< Ordinal, Scalar > mgs_type; 00182 mgs_type mgser (scalarComm); 00183 MgsVerifier< mgs_type >::verify (mgser, scalarComm, Q_local, R, b_debug); 00184 #else 00185 throw std::logic_error("MGS not built with Intel TBB support"); 00186 #endif // HAVE_TSQR_INTEL_TBB 00187 } 00188 else if (which == "MpiSeqMGS") 00189 { 00190 typedef MGS< Ordinal, Scalar > mgs_type; 00191 mgs_type mgser (scalarComm); 00192 MgsVerifier< mgs_type >::verify (mgser, scalarComm, Q_local, R, b_debug); 00193 } 00194 else 00195 throw std::logic_error ("Invalid MGS implementation type \"" + which + "\""); 00196 00197 // Print out the Q and R factors 00198 if (b_extra_debug && b_debug) 00199 { 00200 if (my_rank == 0) 00201 cerr << endl << "Q factor:" << endl; 00202 scalarComm->barrier (); 00203 printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get()); 00204 scalarComm->barrier (); 00205 if (my_rank == 0) 00206 { 00207 cerr << endl << "R factor:" << endl; 00208 print_local_matrix (cerr, ncols, ncols, R.get(), R.lda()); 00209 cerr << endl; 00210 } 00211 scalarComm->barrier (); 00212 } 00213 00214 // Test accuracy of the resulting factorization 00215 std::pair< magnitude_type, magnitude_type > result = 00216 global_verify (nrows_local, ncols, A_local.get(), A_local.lda(), 00217 Q_local.get(), Q_local.lda(), R.get(), R.lda(), 00218 scalarComm.get()); 00219 if (b_debug) 00220 { 00221 scalarComm->barrier(); 00222 if (my_rank == 0) 00223 cerr << "-- Finished global_verify" << endl; 00224 scalarComm->barrier(); 00225 } 00226 00227 // Print the results on Proc 0. 00228 if (my_rank == 0) 00229 { 00230 if (human_readable) 00231 { 00232 cout << mgs_human_readable_name(which) << endl 00233 << "# rows = " << nrows_global << endl 00234 << "# columns = " << ncols << endl 00235 << "# MPI processes = " << nprocs << endl; 00236 if (which == "MpiTbbTSQR") 00237 cout << "# cores per process = " << num_cores << endl; 00238 cout << "Relative residual $\\|A - Q*R\\|_2 / \\|A\\|_2$ = " 00239 << result.first << endl 00240 << "Relative orthogonality $\\|I - Q^T*Q\\|_2$ = " 00241 << result.second << endl 00242 << endl; 00243 } 00244 else 00245 { 00246 cout << which 00247 << "," << nrows_global 00248 << "," << ncols 00249 << "," << nprocs; 00250 if (which == "MpiTbbTSQR") 00251 cout << "," << num_cores << endl; 00252 cout << "," << result.first 00253 << "," << result.second 00254 << endl; 00255 } 00256 } 00257 } 00258 00259 00260 template< class MgsBase, class TimerType > 00261 static double // returns timing in s 00262 do_mgs_benchmark (MgsBase& orthogonalizer, 00263 Matrix< typename MgsBase::ordinal_type, typename MgsBase::scalar_type >& Q_local, 00264 Matrix< typename MgsBase::ordinal_type, typename MgsBase::scalar_type >& R, 00265 const int num_trials, 00266 const bool human_readable) 00267 { 00268 typedef typename MgsBase::ordinal_type ordinal_type; 00269 using std::cout; 00270 00271 TSQR::Test::verifyTimerConcept< TimerType >(); 00272 00273 const ordinal_type nrows_local = Q_local.nrows(); 00274 const ordinal_type ncols = Q_local.ncols(); 00275 00276 // Benchmark MGS for ntrials trials. The answer (the numerical 00277 // results of the factorization) is only valid if ntrials == 1, 00278 // but this is a benchmark and not a verification routine. Call 00279 // mgs_verify() if you want to determine whether MGS computes 00280 // the right answer. 00281 // 00282 // Name of timer doesn't matter here; we only need the timing. 00283 TimerType timer("MGS"); 00284 timer.start(); 00285 for (int trial_num = 0; trial_num < num_trials; ++trial_num) 00286 { 00287 // Orthogonalize the columns of A using MGS. Don't worry about 00288 // the fact that we're overwriting the input; this is a 00289 // benchmark, not a numerical verification test. (We have the 00290 // latter implemented as mgs_verify() in this file.) 00291 orthogonalizer.mgs (nrows_local, ncols, Q_local.get(), 00292 Q_local.lda(), R.get(), R.lda()); 00293 // Timings in debug mode likely won't make sense, because 00294 // Proc 0 is outputting the debug messages to cerr. 00295 // Nevertheless, we don't put any "if(b_debug)" calls in the 00296 // timing loop. 00297 } 00298 // Compute the resulting total time (in seconds) to execute 00299 // num_trials runs of :mgs(). The time may differ on different 00300 // MPI processes. 00301 const double mgs_timing = timer.stop(); 00302 return mgs_timing; 00303 } 00304 00305 template< class Ordinal, class Scalar, class Generator, class TimerType > 00306 void 00307 benchmarkMgs (const std::string& which, 00308 Generator& generator, 00309 const int ntrials, 00310 const Ordinal nrows_global, 00311 const Ordinal ncols, 00312 const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm, 00313 const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm, 00314 const int num_cores, 00315 const bool human_readable, 00316 const bool b_debug) 00317 { 00318 typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type; 00319 using std::cerr; 00320 using std::cout; 00321 using std::endl; 00322 00323 TSQR::Test::verifyTimerConcept< TimerType >(); 00324 00325 const bool b_extra_debug = false; 00326 const int nprocs = scalarComm->size(); 00327 const int my_rank = scalarComm->rank(); 00328 if (b_debug) 00329 { 00330 scalarComm->barrier(); 00331 if (my_rank == 0) 00332 cerr << "mgs_benchmark:" << endl; 00333 scalarComm->barrier(); 00334 } 00335 const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs); 00336 00337 // Set up storage for the test problem. 00338 Matrix<Ordinal, Scalar> A_local (nrows_local, ncols); 00339 if (std::numeric_limits< Scalar >::has_quiet_NaN) 00340 A_local.fill (std::numeric_limits< Scalar >::quiet_NaN()); 00341 Matrix<Ordinal, Scalar> R (ncols, ncols, Scalar(0)); 00342 00343 // Generate the test problem. 00344 distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get()); 00345 if (b_debug) 00346 { 00347 scalarComm->barrier(); 00348 if (my_rank == 0) 00349 cerr << "-- Generated test problem." << endl; 00350 } 00351 00352 // Make sure that the test problem (the matrix to factor) was 00353 // distributed correctly. 00354 if (b_extra_debug && b_debug) 00355 { 00356 if (my_rank == 0) 00357 cerr << "Test matrix A:" << endl; 00358 scalarComm->barrier (); 00359 printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get()); 00360 scalarComm->barrier (); 00361 } 00362 00363 // Factoring the matrix stored in A_local overwrites it, so we 00364 // make a copy of A_local. MGS orthogonalization does not 00365 // support contiguously stored cache blocks, unlike TSQR, so we 00366 // don't have to consider whether or not to rearrange cache 00367 // blocks here (unlike with TSQR). 00368 Matrix< Ordinal, Scalar > Q_local (A_local); 00369 00370 if (b_debug) 00371 { 00372 scalarComm->barrier(); 00373 if (my_rank == 0) 00374 cerr << "-- Starting timing loop" << endl; 00375 } 00376 00377 // Set up MGS and run the benchmark. 00378 double mgs_timing; // Total run time in seconds of all ntrials trials 00379 if (which == "MpiTbbMGS") 00380 { 00381 #ifdef HAVE_TSQR_INTEL_TBB 00382 typedef TSQR::TBB::TbbMgs< Ordinal, Scalar > mgs_type; 00383 mgs_type mgser (scalarComm); 00384 mgs_timing = do_mgs_benchmark< mgs_type, TimerType > (mgser, Q_local, R, 00385 ntrials, human_readable); 00386 #else 00387 throw std::logic_error("MGS not built with Intel TBB support"); 00388 #endif // HAVE_TSQR_INTEL_TBB 00389 } 00390 else if (which == "MpiSeqMGS") 00391 { 00392 typedef MGS< Ordinal, Scalar > mgs_type; 00393 mgs_type mgser (scalarComm); 00394 mgs_timing = do_mgs_benchmark< mgs_type, TimerType > (mgser, Q_local, R, 00395 ntrials, human_readable); 00396 } 00397 else 00398 throw std::logic_error ("Invalid MGS implementation type \"" + which + "\""); 00399 00400 if (b_debug) 00401 { 00402 scalarComm->barrier(); 00403 if (my_rank == 0) 00404 cerr << "-- Finished timing loop" << endl; 00405 } 00406 00407 // Find the min and max MGS timing on all processors. 00408 const double min_mgs_timing = scalarComm->globalMin (mgs_timing); 00409 const double max_mgs_timing = scalarComm->globalMax (mgs_timing); 00410 00411 if (b_debug) 00412 { 00413 scalarComm->barrier(); 00414 if (my_rank == 0) 00415 cerr << "-- Computed min and max timings" << endl; 00416 } 00417 00418 // Print the results on Proc 0. 00419 if (my_rank == 0) 00420 { 00421 if (human_readable) 00422 { 00423 cout << mgs_human_readable_name(which) << ":" << endl 00424 << "# rows = " << nrows_global << endl 00425 << "# columns = " << ncols << endl 00426 << "# MPI processes = " << nprocs << endl; 00427 if (which == "MpiTbbTSQR") 00428 cout << "# cores per process = " << num_cores << endl; 00429 cout << "# trials = " << ntrials << endl 00430 << "Min total time (s) over all MPI processes = " 00431 << min_mgs_timing << endl 00432 << "Max total time (s) over all MPI processes = " 00433 << max_mgs_timing << endl 00434 << endl; 00435 } 00436 else 00437 { 00438 cout << which 00439 << "," << nrows_global 00440 << "," << ncols 00441 << "," << nprocs; 00442 if (which == "MpiTbbTSQR") 00443 cout << "," << num_cores << endl; 00444 cout << "," << ntrials 00445 << "," << min_mgs_timing 00446 << "," << max_mgs_timing 00447 << endl; 00448 } 00449 } 00450 } 00451 00452 00453 } // namespace Test 00454 } // namespace TSQR 00455 00456 #endif // __TSQR_Test_MgsTest_hpp
1.7.4