|
Anasazi Version of the Day
|
00001 // @HEADER 00002 // *********************************************************************** 00003 // 00004 // Anasazi: Block Eigensolvers Package 00005 // Copyright (2010) Sandia Corporation 00006 // 00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive 00008 // license for use of this work by or on behalf of the U.S. Government. 00009 // 00010 // This library is free software; you can redistribute it and/or modify 00011 // it under the terms of the GNU Lesser General Public License as 00012 // published by the Free Software Foundation; either version 2.1 of the 00013 // License, or (at your option) any later version. 00014 // 00015 // This library is distributed in the hope that it will be useful, but 00016 // WITHOUT ANY WARRANTY; without even the implied warranty of 00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00018 // Lesser General Public License for more details. 00019 // 00020 // You should have received a copy of the GNU Lesser General Public 00021 // License along with this library; if not, write to the Free Software 00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 00023 // USA 00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 00025 // 00026 // *********************************************************************** 00027 // @HEADER 00028 00029 #ifndef __TSQR_Test_TbbTest_hpp 00030 #define __TSQR_Test_TbbTest_hpp 00031 00032 #include <Tsqr_nodeTestProblem.hpp> 00033 #include <Tsqr_verifyTimerConcept.hpp> 00034 #include <Tsqr_Random_NormalGenerator.hpp> 00035 00036 #include <Tsqr_Blas.hpp> 00037 #include <Tsqr_Lapack.hpp> 00038 #include <Tsqr_LocalVerify.hpp> 00039 #include <Tsqr_Matrix.hpp> 00040 #include <Tsqr_Util.hpp> 00041 #include <Tsqr_ScalarTraits.hpp> 00042 #include <TbbTsqr.hpp> 00043 00044 #include <Teuchos_Time.hpp> 00045 00046 #include <algorithm> 00047 #include <cstring> // size_t definition 00048 //#include <iomanip> 00049 #include <iostream> 00050 #include <limits> 00051 #include <stdexcept> 00052 #include <vector> 00053 00054 using std::make_pair; 00055 using std::pair; 00056 using std::vector; 00057 00058 using std::cerr; 00059 using std::cout; 00060 using std::endl; 00061 00064 00065 namespace TSQR { 00066 namespace Test { 00067 00071 template< class Ordinal, class Scalar > 00072 void 00073 verifyTbbTsqr (TSQR::Random::NormalGenerator< Ordinal, Scalar >& generator, 00074 const Ordinal nrows, 00075 const Ordinal ncols, 00076 const int num_cores, 00077 const size_t cache_block_size, 00078 const bool contiguous_cache_blocks, 00079 const bool human_readable, 00080 const bool b_debug = false) 00081 { 00082 typedef Teuchos::Time timer_type; 00083 typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar, timer_type > node_tsqr_type; 00084 typedef typename node_tsqr_type::FactorOutput factor_output_type; 00085 typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type; 00086 using std::cerr; 00087 using std::cout; 00088 using std::endl; 00089 00090 node_tsqr_type actor (num_cores, cache_block_size); 00091 00092 if (b_debug) 00093 { 00094 cerr << "Intel TBB TSQR test problem:" << endl 00095 << "* " << nrows << " x " << ncols << endl 00096 << "* # cores: " << num_cores << endl 00097 << "* Cache block of " << actor.cache_block_size() << " bytes" << endl; 00098 if (contiguous_cache_blocks) 00099 cerr << "* Contiguous cache blocks" << endl; 00100 } 00101 00102 Matrix< Ordinal, Scalar > A (nrows, ncols); 00103 Matrix< Ordinal, Scalar > A_copy (nrows, ncols); 00104 Matrix< Ordinal, Scalar > Q (nrows, ncols); 00105 Matrix< Ordinal, Scalar > R (ncols, ncols); 00106 if (std::numeric_limits< Scalar >::has_quiet_NaN) 00107 { 00108 A.fill (std::numeric_limits< Scalar>::quiet_NaN()); 00109 A_copy.fill (std::numeric_limits< Scalar >::quiet_NaN()); 00110 Q.fill (std::numeric_limits< Scalar >::quiet_NaN()); 00111 R.fill (std::numeric_limits< Scalar >::quiet_NaN()); 00112 } 00113 const Ordinal lda = nrows; 00114 const Ordinal ldq = nrows; 00115 const Ordinal ldr = ncols; 00116 00117 // Create a test problem 00118 nodeTestProblem (generator, nrows, ncols, A.get(), A.lda(), true); 00119 00120 if (b_debug) 00121 cerr << "-- Generated test problem" << endl; 00122 00123 // Copy A into A_copy, since TSQR overwrites the input. If 00124 // specified, rearrange the data in A_copy so that the data in 00125 // each cache block is contiguously stored. 00126 if (! contiguous_cache_blocks) 00127 { 00128 A_copy.copy (A); 00129 if (b_debug) 00130 cerr << "-- Copied test problem from A into A_copy" << endl; 00131 } 00132 else 00133 { 00134 actor.cache_block (nrows, ncols, A_copy.get(), A.get(), A.lda()); 00135 if (b_debug) 00136 cerr << "-- Reorganized test matrix to have contiguous " 00137 "cache blocks" << endl; 00138 00139 // Verify cache blocking, when in debug mode. 00140 if (b_debug) 00141 { 00142 Matrix< Ordinal, Scalar > A2 (nrows, ncols); 00143 if (std::numeric_limits< Scalar >::has_quiet_NaN) 00144 A2.fill (std::numeric_limits< Scalar >::quiet_NaN()); 00145 00146 actor.un_cache_block (nrows, ncols, A2.get(), A2.lda(), A_copy.get()); 00147 if (A == A2) 00148 { 00149 if (b_debug) 00150 cerr << "-- Cache blocking test succeeded!" << endl; 00151 } 00152 else 00153 throw std::logic_error ("Cache blocking failed"); 00154 } 00155 } 00156 00157 // Fill R with zeros, since the factorization may not overwrite 00158 // the strict lower triangle of R. 00159 R.fill (Scalar(0)); 00160 00161 // Factor the matrix and compute the explicit Q factor 00162 factor_output_type factor_output = 00163 actor.factor (nrows, ncols, A_copy.get(), A_copy.lda(), R.get(), 00164 R.lda(), contiguous_cache_blocks); 00165 if (b_debug) 00166 cerr << "-- Finished TbbTsqr::factor" << endl; 00167 actor.explicit_Q (nrows, ncols, A_copy.get(), A_copy.lda(), factor_output, 00168 ncols, Q.get(), Q.lda(), contiguous_cache_blocks); 00169 if (b_debug) 00170 cerr << "-- Finished TbbTsqr::explicit_Q" << endl; 00171 00172 // "Un"-cache-block the output Q (the explicit Q factor), if 00173 // contiguous cache blocks were used. This is only necessary 00174 // because local_verify() doesn't currently support contiguous 00175 // cache blocks. 00176 if (contiguous_cache_blocks) 00177 { 00178 // Use A_copy as temporary storage for un-cache-blocking Q. 00179 actor.un_cache_block (nrows, ncols, A_copy.get(), A_copy.lda(), Q.get()); 00180 Q.copy (A_copy); 00181 if (b_debug) 00182 cerr << "-- Un-cache-blocked output Q factor" << endl; 00183 } 00184 00185 // Print out the R factor 00186 if (b_debug) 00187 { 00188 cerr << endl << "-- R factor:" << endl; 00189 print_local_matrix (cerr, ncols, ncols, R.get(), R.lda()); 00190 cerr << endl; 00191 } 00192 00193 // Validate the factorization 00194 std::pair< magnitude_type, magnitude_type > results = 00195 local_verify (nrows, ncols, A.get(), lda, Q.get(), ldq, R.get(), ldr); 00196 if (b_debug) 00197 cerr << "-- Finished local_verify" << endl; 00198 00199 // Print the results 00200 if (human_readable) 00201 cout << "Parallel (via Intel\'s Threading Building Blocks) / cache-blocked) TSQR:" << endl 00202 << "# rows = " << nrows << endl 00203 << "# columns = " << ncols << endl 00204 << "# cores: " << num_cores << endl 00205 << "cache block # bytes = " << actor.cache_block_size() << endl 00206 << "contiguous cache blocks? " << contiguous_cache_blocks << endl 00207 << "Relative residual $\\|A - Q*R\\|_2 / \\|A\\|_2$ = " 00208 << results.first << endl 00209 << "Relative orthogonality $\\|I - Q^T*Q\\|_2$ = " 00210 << results.second << endl 00211 << endl; 00212 else 00213 cout << "TbbTsqr" 00214 << "," << nrows 00215 << "," << ncols 00216 << "," << num_cores 00217 << "," << actor.cache_block_size() 00218 << "," << contiguous_cache_blocks 00219 << "," << results.first 00220 << "," << results.second 00221 << endl; 00222 } 00223 00231 template< class Ordinal, class Scalar > 00232 void 00233 benchmarkTbbTsqr (const int ntrials, 00234 const Ordinal nrows, 00235 const Ordinal ncols, 00236 const int num_cores, 00237 const size_t cache_block_size, 00238 const bool contiguous_cache_blocks, 00239 const bool human_readable) 00240 { 00241 using TSQR::TBB::TbbTsqr; 00242 using std::cerr; 00243 using std::cout; 00244 using std::endl; 00245 00246 typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type; 00247 typedef Teuchos::Time timer_type; 00248 typedef Ordinal ordinal_type; 00249 typedef Scalar scalar_type; 00250 typedef Matrix< ordinal_type, scalar_type > matrix_type; 00251 typedef TbbTsqr< ordinal_type, scalar_type, timer_type > node_tsqr_type; 00252 00253 // Pseudorandom normal(0,1) generator. Default seed is OK, 00254 // because this is a benchmark, not an accuracy test. 00255 TSQR::Random::NormalGenerator< ordinal_type, scalar_type > generator; 00256 00257 // Set up TSQR implementation. 00258 node_tsqr_type actor (num_cores, cache_block_size); 00259 00260 matrix_type A (nrows, ncols); 00261 matrix_type A_copy (nrows, ncols); 00262 matrix_type Q (nrows, ncols); 00263 matrix_type R (ncols, ncols, scalar_type(0)); 00264 00265 // Fill R with zeros, since the factorization may not overwrite 00266 // the strict lower triangle of R. 00267 R.fill (scalar_type(0)); 00268 00269 // Create a test problem 00270 nodeTestProblem (generator, nrows, ncols, A.get(), A.lda(), false); 00271 00272 // Copy A into A_copy, since TSQR overwrites the input. If 00273 // specified, rearrange the data in A_copy so that the data in 00274 // each cache block is contiguously stored. 00275 if (contiguous_cache_blocks) 00276 actor.cache_block (nrows, ncols, A_copy.get(), A.get(), A.lda()); 00277 else 00278 A_copy.copy (A); 00279 00280 // Benchmark TBB-based TSQR for ntrials trials. 00281 // 00282 // Name of timer doesn't matter here; we only need the timing. 00283 timer_type timer("TbbTsqr"); 00284 timer.start(); 00285 for (int trial_num = 0; trial_num < ntrials; ++trial_num) 00286 { 00287 // Factor the matrix in-place in A_copy, and extract the 00288 // resulting R factor into R. 00289 typedef typename node_tsqr_type::FactorOutput factor_output_type; 00290 factor_output_type factor_output = 00291 actor.factor (nrows, ncols, A_copy.get(), A_copy.lda(), 00292 R.get(), R.lda(), contiguous_cache_blocks); 00293 // Compute the explicit Q factor (which was stored 00294 // implicitly in A_copy and factor_output) and store in Q. 00295 // We don't need to un-cache-block the output, because we 00296 // aren't verifying it here. 00297 actor.explicit_Q (nrows, ncols, A_copy.get(), A_copy.lda(), 00298 factor_output, ncols, Q.get(), Q.lda(), 00299 contiguous_cache_blocks); 00300 } 00301 const double tbb_tsqr_timing = timer.stop(); 00302 00303 // Print the results 00304 if (human_readable) 00305 { 00306 cout << "(Intel TBB / cache-blocked) TSQR cumulative timings:" << endl 00307 << "# rows = " << nrows << endl 00308 << "# columns = " << ncols << endl 00309 << "# cores: " << num_cores << endl 00310 << "cache block # bytes = " << actor.cache_block_size() << endl 00311 << "contiguous cache blocks? " << contiguous_cache_blocks << endl 00312 << "# trials = " << ntrials << endl 00313 << "Total time (s) = " << tbb_tsqr_timing << endl 00314 << "Total time (s) in factor() (min over all tasks): " 00315 << (ntrials * actor.min_seq_factor_timing()) << endl 00316 << "Total time (s) in factor() (max over all tasks): " 00317 << (ntrials * actor.max_seq_factor_timing()) << endl 00318 << "Total time (s) in apply() (min over all tasks): " 00319 << (ntrials * actor.min_seq_apply_timing()) << endl 00320 << "Total time (s) in apply() (max over all tasks): " 00321 << (ntrials * actor.max_seq_apply_timing()) << endl 00322 << endl << endl; 00323 cout << "(Intel TBB / cache-blocked) TSQR per-invocation timings:" << endl; 00324 00325 std::vector< TimeStats > stats; 00326 actor.getStats (stats); 00327 std::vector< std::string> labels; 00328 actor.getStatsLabels (labels); 00329 00330 const std::string labelLabel ("label"); 00331 for (std::vector< std::string >::size_type k = 0; k < labels.size(); ++k) 00332 { 00333 const bool printHeaders = (k == 0); 00334 if (stats[k].count() > 0) 00335 stats[k].print (cout, human_readable, labels[k], labelLabel, printHeaders); 00336 } 00337 } 00338 else 00339 { 00340 // We don't include {min,max}_seq_apply_timing() here, because 00341 // those times don't benefit from the accuracy of benchmarking 00342 // for ntrials > 1. Thus, it's misleading to include them 00343 // with tbb_tsqr_timing, the total time over ntrials trials. 00344 cout << "TbbTsqr" 00345 << "," << nrows 00346 << "," << ncols 00347 << "," << num_cores 00348 << "," << actor.cache_block_size() 00349 << "," << contiguous_cache_blocks 00350 << "," << ntrials 00351 << "," << tbb_tsqr_timing 00352 << endl; 00353 } 00354 } 00355 } // namespace Test 00356 } // namespace TSQR 00357 00358 #endif // __TSQR_Test_TbbTest_hpp
1.7.4