|
Anasazi Version of the Day
|
00001 // @HEADER 00002 // *********************************************************************** 00003 // 00004 // Anasazi: Block Eigensolvers Package 00005 // Copyright (2010) Sandia Corporation 00006 // 00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive 00008 // license for use of this work by or on behalf of the U.S. Government. 00009 // 00010 // This library is free software; you can redistribute it and/or modify 00011 // it under the terms of the GNU Lesser General Public License as 00012 // published by the Free Software Foundation; either version 2.1 of the 00013 // License, or (at your option) any later version. 00014 // 00015 // This library is distributed in the hope that it will be useful, but 00016 // WITHOUT ANY WARRANTY; without even the implied warranty of 00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00018 // Lesser General Public License for more details. 00019 // 00020 // You should have received a copy of the GNU Lesser General Public 00021 // License along with this library; if not, write to the Free Software 00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 00023 // USA 00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 00025 // 00026 // *********************************************************************** 00027 // @HEADER 00028 00029 #ifndef __TSQR_Test_SeqTest_hpp 00030 #define __TSQR_Test_SeqTest_hpp 00031 00032 #include <Tsqr_Config.hpp> 00033 #include <Tsqr_Random_NormalGenerator.hpp> 00034 #include <Tsqr_nodeTestProblem.hpp> 00035 #include <Tsqr_verifyTimerConcept.hpp> 00036 00037 #include <Tsqr_Blas.hpp> 00038 #include <Tsqr_Lapack.hpp> 00039 #include <Tsqr_LocalVerify.hpp> 00040 #include <Tsqr_Matrix.hpp> 00041 #include <Tsqr_ScalarTraits.hpp> 00042 #include <Tsqr_SequentialTsqr.hpp> 00043 #include <Tsqr_Util.hpp> 00044 00045 #include <algorithm> 00046 #include <cstring> // size_t definition 00047 #include <iomanip> 00048 #include <iostream> 00049 #include <limits> 00050 #include <sstream> 00051 #include <stdexcept> 00052 #include <vector> 00053 00056 00057 namespace TSQR { 00058 namespace Test { 00059 00060 template< class Ordinal, class Scalar > 00061 static Ordinal 00062 lworkQueryLapackQr (LAPACK< Ordinal, Scalar >& lapack, 00063 const Ordinal nrows, 00064 const Ordinal ncols, 00065 const Ordinal lda) 00066 { 00067 typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type; 00068 using std::ostringstream; 00069 using std::endl; 00070 00071 Scalar d_lwork_geqrf = Scalar (0); 00072 int INFO = 0; 00073 lapack.GEQRF (nrows, ncols, NULL, lda, NULL, &d_lwork_geqrf, -1, &INFO); 00074 if (INFO != 0) 00075 { 00076 ostringstream os; 00077 os << "LAPACK _GEQRF workspace size query failed: INFO = " << INFO; 00078 // It's a logic error and not a runtime error, because the 00079 // LWORK query should only fail if the input parameters have 00080 // invalid (e.g., out of range) values. 00081 throw std::logic_error (os.str()); 00082 } 00083 00084 Scalar d_lwork_orgqr = Scalar (0); 00085 // A workspace query appropriate for computing the explicit Q 00086 // factor (nrows x ncols) in place, from the QR factorization of 00087 // an nrows x ncols matrix with leading dimension lda. 00088 lapack.ORGQR (nrows, ncols, ncols, NULL, lda, NULL, &d_lwork_orgqr, -1, &INFO); 00089 if (INFO != 0) 00090 { 00091 ostringstream os; 00092 os << "LAPACK _ORGQR workspace size query failed: INFO = " << INFO; 00093 // It's a logic error and not a runtime error, because the 00094 // LWORK query should only fail if the input parameters have 00095 // invalid (e.g., out of range) values. 00096 throw std::logic_error (os.str()); 00097 } 00098 00099 // LAPACK workspace queries do return their results as a 00100 // double-precision floating-point value, but LAPACK promises 00101 // that that value will fit in an int. Thus, we don't need to 00102 // check for valid casts to int below. I include the checks 00103 // just to be "bulletproof" and also to show how to do the 00104 // checks for later reference. 00105 const magnitude_type lwork_geqrf_test = 00106 static_cast< magnitude_type > (static_cast< Ordinal > (ScalarTraits< Scalar >::abs (d_lwork_geqrf))); 00107 if (lwork_geqrf_test != ScalarTraits< Scalar >::abs (d_lwork_geqrf)) 00108 { 00109 ostringstream os; 00110 os << "LAPACK _GEQRF workspace query returned a result, " 00111 << d_lwork_geqrf << ", bigger than the max Ordinal value, " 00112 << std::numeric_limits< Ordinal >::max(); 00113 throw std::range_error (os.str()); 00114 } 00115 const Scalar lwork_orgqr_test = 00116 static_cast< magnitude_type > (static_cast< Ordinal > (ScalarTraits< Scalar >::abs ((d_lwork_orgqr)))); 00117 if (lwork_orgqr_test != ScalarTraits< Scalar >::abs (d_lwork_orgqr)) 00118 { 00119 ostringstream os; 00120 os << "LAPACK _ORGQR workspace query returned a result, " 00121 << d_lwork_orgqr << ", bigger than the max Ordinal value, " 00122 << std::numeric_limits< Ordinal >::max(); 00123 throw std::range_error (os.str()); 00124 } 00125 return std::max (static_cast< Ordinal > (ScalarTraits< Scalar >::abs (d_lwork_geqrf)), 00126 static_cast< Ordinal > (ScalarTraits< Scalar >::abs (d_lwork_orgqr))); 00127 } 00128 00129 00133 void 00134 verifySeqTsqr (std::ostream& out, 00135 const int nrows, 00136 const int ncols, 00137 const size_t cache_block_size, 00138 const bool test_complex_arithmetic, 00139 const bool save_matrices, 00140 const bool contiguous_cache_blocks, 00141 const bool human_readable, 00142 const bool b_debug = false); 00143 00144 00147 void 00148 verifyLapack (const int nrows, 00149 const int ncols, 00150 const bool test_complex_arithmetic, 00151 const bool human_readable, 00152 const bool b_debug = false); 00153 00154 00160 template< class Ordinal, class Scalar, class TimerType > 00161 class SeqTsqrBenchmarker { 00162 public: 00163 typedef Ordinal ordinal_type; 00164 typedef Scalar scalar_type; 00165 00170 SeqTsqrBenchmarker (const std::string& scalarTypeName, 00171 std::ostream& out = std::cout, 00172 const bool humanReadable = false) : 00173 scalarTypeName_ (scalarTypeName), 00174 out_ (out), 00175 humanReadable_ (humanReadable) 00176 { 00177 TSQR::Test::verifyTimerConcept< TimerType >(); 00178 } 00179 00180 void 00181 benchmark (const int numTrials, 00182 const Ordinal numRows, 00183 const Ordinal numCols, 00184 const size_t cacheBlockSize, 00185 const bool contiguousCacheBlocks) 00186 { 00187 SequentialTsqr< Ordinal, Scalar > actor (cacheBlockSize); 00188 00189 Matrix< Ordinal, Scalar > A (numRows, numCols); 00190 Matrix< Ordinal, Scalar > A_copy (numRows, numCols); 00191 Matrix< Ordinal, Scalar > Q (numRows, numCols); 00192 Matrix< Ordinal, Scalar > R (numCols, numCols); 00193 const Ordinal lda = numRows; 00194 const Ordinal ldq = numRows; 00195 00196 // Create a test problem 00197 nodeTestProblem (gen_, numRows, numCols, A.get(), lda, false); 00198 00199 // Copy A into A_copy, since TSQR overwrites the input 00200 A_copy.copy (A); 00201 00202 // Benchmark sequential TSQR for numTrials trials. 00203 // 00204 // Name of timer doesn't matter here; we only need the timing. 00205 TimerType timer("SeqTSQR"); 00206 timer.start(); 00207 for (int trialNum = 0; trialNum < numTrials; ++trialNum) 00208 { 00209 // Factor the matrix and extract the resulting R factor 00210 typedef typename SequentialTsqr< Ordinal, Scalar >::FactorOutput 00211 factor_output_type; 00212 factor_output_type factorOutput = 00213 actor.factor (numRows, numCols, A_copy.get(), lda, 00214 R.get(), R.lda(), contiguousCacheBlocks); 00215 // Compute the explicit Q factor. Unlike with LAPACK QR, 00216 // this doesn't happen in place: the implicit Q factor is 00217 // stored in A_copy, and the explicit Q factor is written to 00218 // Q. 00219 actor.explicit_Q (numRows, numCols, A_copy.get(), lda, factorOutput, 00220 numCols, Q.get(), ldq, contiguousCacheBlocks); 00221 } 00222 const double seqTsqrTiming = timer.stop(); 00223 reportResults (numTrials, numRows, numCols, actor.cache_block_size(), 00224 contiguousCacheBlocks, seqTsqrTiming); 00225 } 00226 00227 00228 private: 00231 TSQR::Random::NormalGenerator< ordinal_type, scalar_type > gen_; 00232 00235 std::string scalarTypeName_; 00236 00239 std::ostream& out_; 00240 00243 bool humanReadable_; 00244 00247 void 00248 reportResults (const int numTrials, 00249 const Ordinal numRows, 00250 const Ordinal numCols, 00251 const size_t actualCacheBlockSize, 00252 const bool contiguousCacheBlocks, 00253 const double seqTsqrTiming) 00254 { 00255 using std::endl; 00256 if (humanReadable_) 00257 out_ << "Sequential (cache-blocked) TSQR:" << endl 00258 << "Scalar type = " << scalarTypeName_ << endl 00259 << "# rows = " << numRows << endl 00260 << "# columns = " << numCols << endl 00261 << "cache block # bytes = " << actualCacheBlockSize << endl 00262 << "contiguous cache blocks? " << contiguousCacheBlocks << endl 00263 << "# trials = " << numTrials << endl 00264 << "Total time (s) = " << seqTsqrTiming << endl 00265 << endl; 00266 else 00267 out_ << "SeqTSQR" 00268 << "," << scalarTypeName_ 00269 << "," << numRows 00270 << "," << numCols 00271 << "," << actualCacheBlockSize 00272 << "," << contiguousCacheBlocks 00273 << "," << numTrials << "," 00274 << seqTsqrTiming << endl; 00275 } 00276 }; 00277 00278 00287 template< class TimerType > 00288 void 00289 benchmarkSeqTsqr (std::ostream& out, 00290 const int numRows, 00291 const int numCols, 00292 const int numTrials, 00293 const size_t cacheBlockSize, 00294 const bool contiguousCacheBlocks, 00295 const bool testComplex, 00296 const bool humanReadable) 00297 { 00298 typedef TimerType timer_type; 00299 const bool testReal = true; 00300 using std::string; 00301 00302 if (testReal) 00303 { 00304 { // Scalar=float 00305 typedef SeqTsqrBenchmarker< int, float, timer_type > benchmark_type; 00306 string scalarTypeName ("float"); 00307 benchmark_type widget (scalarTypeName, out, humanReadable); 00308 widget.benchmark (numTrials, numRows, numCols, cacheBlockSize, 00309 contiguousCacheBlocks); 00310 } 00311 { // Scalar=double 00312 typedef SeqTsqrBenchmarker< int, double, timer_type > benchmark_type; 00313 string scalarTypeName ("double"); 00314 benchmark_type widget (scalarTypeName, out, humanReadable); 00315 widget.benchmark (numTrials, numRows, numCols, cacheBlockSize, 00316 contiguousCacheBlocks); 00317 } 00318 } 00319 00320 if (testComplex) 00321 { 00322 #ifdef HAVE_TSQR_COMPLEX 00323 using std::complex; 00324 { // Scalar=complex<float> 00325 typedef SeqTsqrBenchmarker< int, complex<float>, timer_type > benchmark_type; 00326 string scalarTypeName ("complex<float>"); 00327 benchmark_type widget (scalarTypeName, out, humanReadable); 00328 widget.benchmark (numTrials, numRows, numCols, cacheBlockSize, 00329 contiguousCacheBlocks); 00330 } 00331 { // Scalar=complex<double> 00332 typedef SeqTsqrBenchmarker< int, complex<double>, timer_type > benchmark_type; 00333 string scalarTypeName ("complex<double>"); 00334 benchmark_type widget (scalarTypeName, out, humanReadable); 00335 widget.benchmark (numTrials, numRows, numCols, cacheBlockSize, 00336 contiguousCacheBlocks); 00337 } 00338 #else // Don't HAVE_TSQR_COMPLEX 00339 throw std::logic_error("TSQR not built with complex arithmetic support"); 00340 #endif // HAVE_TSQR_COMPLEX 00341 } 00342 } 00343 00344 00352 template< class Ordinal, class Scalar, class Generator, class TimerType > 00353 void 00354 benchmarkLapack (Generator& generator, 00355 const int ntrials, 00356 const Ordinal nrows, 00357 const Ordinal ncols, 00358 const bool human_readable) 00359 { 00360 using std::ostringstream; 00361 using std::cerr; 00362 using std::cout; 00363 using std::endl; 00364 00365 TSQR::Test::verifyTimerConcept< TimerType >(); 00366 00367 LAPACK< Ordinal, Scalar > lapack; 00368 Matrix< Ordinal, Scalar > A (nrows, ncols); 00369 Matrix< Ordinal, Scalar > Q (nrows, ncols); 00370 Matrix< Ordinal, Scalar > R (ncols, ncols); 00371 const Ordinal lda = nrows; 00372 const Ordinal ldq = nrows; 00373 const Ordinal ldr = ncols; 00374 00375 // Create a test problem 00376 nodeTestProblem (generator, nrows, ncols, A.get(), lda, false); 00377 00378 // Copy A into Q, since LAPACK QR overwrites the input. We only 00379 // need Q because LAPACK's computation of the explicit Q factor 00380 // occurs in place. This doesn't work with TSQR. To give 00381 // LAPACK QR the fullest possible advantage over TSQR, we don't 00382 // allocate an A_copy here (as we would when benchmarking TSQR). 00383 Q.copy (A); 00384 00385 // Determine the required workspace for the factorization 00386 const Ordinal lwork = lworkQueryLapackQr (lapack, nrows, ncols, lda); 00387 std::vector< Scalar > work (lwork); 00388 std::vector< Scalar > tau (ncols); 00389 00390 // Benchmark LAPACK's QR factorization for ntrials trials 00391 TimerType timer("LapackQR"); 00392 timer.start(); 00393 for (int trial_num = 0; trial_num < ntrials; ++trial_num) 00394 { 00395 // Compute the QR factorization 00396 int info = 0; // INFO is always an int 00397 lapack.GEQRF (nrows, ncols, Q.get(), ldq, &tau[0], &work[0], lwork, &info); 00398 if (info != 0) 00399 { 00400 ostringstream os; 00401 os << "LAPACK QR factorization (_GEQRF) failed: INFO = " << info; 00402 throw std::runtime_error (os.str()); 00403 } 00404 00405 // Extract the upper triangular factor R from Q (where it 00406 // was computed in place by GEQRF), since ORGQR will 00407 // overwrite all of Q with the explicit Q factor. 00408 copy_upper_triangle (nrows, ncols, R.get(), ldr, Q.get(), ldq); 00409 00410 // Compute the explicit Q factor 00411 lapack.ORGQR (nrows, ncols, ncols, Q.get(), ldq, 00412 &tau[0], &work[0], lwork, &info); 00413 if (info != 0) 00414 { 00415 ostringstream os; 00416 os << "LAPACK explicit Q computation (_ORGQR) failed: INFO = " << info; 00417 throw std::runtime_error (os.str()); 00418 } 00419 } 00420 const double lapack_timing = timer.stop(); 00421 00422 // Print the results 00423 if (human_readable) 00424 cout << "LAPACK\'s QR factorization (DGEQRF + DORGQR):" << endl 00425 << "nrows = " << nrows << endl 00426 << "ncols = " << ncols << endl 00427 << "ntrials = " << ntrials << endl 00428 << "Total time (s) = " << lapack_timing << endl << endl; 00429 else 00430 // "0" refers to the cache block size, which is not applicable 00431 // in this case. 00432 cout << "LAPACK" 00433 << "," << nrows 00434 << "," << ncols 00435 << "," << 0 00436 << "," << ntrials 00437 << "," << lapack_timing 00438 << endl; 00439 } 00440 00441 } // namespace Test 00442 } // namespace TSQR 00443 00444 #endif // __TSQR_Test_SeqTest_hpp
1.7.4