|
Anasazi Version of the Day
|
00001 // @HEADER 00002 // *********************************************************************** 00003 // 00004 // Anasazi: Block Eigensolvers Package 00005 // Copyright (2010) Sandia Corporation 00006 // 00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive 00008 // license for use of this work by or on behalf of the U.S. Government. 00009 // 00010 // This library is free software; you can redistribute it and/or modify 00011 // it under the terms of the GNU Lesser General Public License as 00012 // published by the Free Software Foundation; either version 2.1 of the 00013 // License, or (at your option) any later version. 00014 // 00015 // This library is distributed in the hope that it will be useful, but 00016 // WITHOUT ANY WARRANTY; without even the implied warranty of 00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00018 // Lesser General Public License for more details. 00019 // 00020 // You should have received a copy of the GNU Lesser General Public 00021 // License along with this library; if not, write to the Free Software 00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 00023 // USA 00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 00025 // 00026 // *********************************************************************** 00027 // @HEADER 00028 00029 #ifndef __TSQR_CacheBlockingStrategy_hpp 00030 #define __TSQR_CacheBlockingStrategy_hpp 00031 00032 #include <algorithm> 00033 #include <limits> 00034 #include <sstream> 00035 #include <stdexcept> 00036 #include <utility> // std::pair 00037 00040 00041 namespace TSQR { 00042 00047 template< class LocalOrdinal, class Scalar > 00048 class CacheBlockingStrategy { 00049 public: 00054 CacheBlockingStrategy (const size_t cacheBlockSize = 0) : 00055 cache_block_size_ (default_cache_block_size (cacheBlockSize)) 00056 {} 00057 00061 CacheBlockingStrategy (const CacheBlockingStrategy& rhs) : 00062 cache_block_size_ (rhs.cache_block_size()) 00063 {} 00064 00068 CacheBlockingStrategy& operator= (const CacheBlockingStrategy& rhs) { 00069 cache_block_size_ = rhs.cache_block_size(); 00070 return *this; 00071 } 00072 00076 size_t cache_block_size () const { return cache_block_size_; } 00077 00082 bool operator== (const CacheBlockingStrategy& rhs) const { 00083 return (cache_block_size() == rhs.cache_block_size()); 00084 } 00085 00090 bool operator!= (const CacheBlockingStrategy& rhs) const { 00091 return (cache_block_size() != rhs.cache_block_size()); 00092 } 00093 00098 std::pair< LocalOrdinal, LocalOrdinal > 00099 cache_block (const LocalOrdinal index, 00100 const LocalOrdinal nrows, 00101 const LocalOrdinal ncols, 00102 const LocalOrdinal nrows_cache_block) const 00103 { 00104 const LocalOrdinal nblocks = nrows / nrows_cache_block; 00105 const LocalOrdinal remainder = nrows - nblocks * nrows_cache_block; 00106 LocalOrdinal my_row_start, my_nrows; 00107 00108 my_row_start = index * nrows_cache_block; 00109 if (index < nblocks - 1) 00110 my_nrows = nrows_cache_block; 00111 else if (index == nblocks - 1) 00112 { 00113 if (remainder > 0 && remainder < ncols) 00114 my_nrows = nrows_cache_block + remainder; 00115 else 00116 my_nrows = nrows_cache_block; 00117 } 00118 else if (index == nblocks) 00119 { 00120 if (remainder >= ncols) 00121 my_nrows = remainder; 00122 else 00123 my_nrows = 0; 00124 } 00125 else 00126 my_nrows = 0; 00127 00128 return std::make_pair (my_row_start, my_nrows); 00129 } 00130 00135 LocalOrdinal 00136 num_cache_blocks (const LocalOrdinal nrows, 00137 const LocalOrdinal ncols, 00138 const LocalOrdinal nrows_cache_block) const 00139 { 00140 const LocalOrdinal quotient = nrows / nrows_cache_block; 00141 const LocalOrdinal remainder = nrows - nrows_cache_block * quotient; 00142 const LocalOrdinal nblocks = (0 < remainder && remainder < ncols) ? (quotient+1) : quotient; 00143 00144 return nblocks; 00145 } 00146 00153 LocalOrdinal 00154 top_block_split_nrows (const LocalOrdinal nrows, 00155 const LocalOrdinal ncols, 00156 const LocalOrdinal nrows_cache_block) const 00157 { 00158 // We want to partition the nrows by ncols matrix A into [A_top; 00159 // A_bot], where A_top has nrows_cache_block rows. However, we 00160 // don't want A_bot to have less than ncols rows. If it would, 00161 // then we partition A so that A_top has nrows rows and A_bot is 00162 // empty. 00163 if (nrows < nrows_cache_block + ncols) 00164 return nrows; 00165 else 00166 // Don't ask for a bigger cache block than there are rows in 00167 // the matrix left to process. 00168 return std::min (nrows_cache_block, nrows); 00169 } 00170 00171 00178 LocalOrdinal 00179 bottom_block_split_nrows (const LocalOrdinal nrows, 00180 const LocalOrdinal ncols, 00181 const LocalOrdinal nrows_cache_block) const 00182 { 00183 // We want to split off the bottom block using the same 00184 // splitting as if we had split off as many top blocks of 00185 // nrows_cache_block rows as permissible. The last block may 00186 // have fewer than nrows_cache_block rows, but it may not have 00187 // fewer than ncols rows (since we don't want any cache block to 00188 // have fewer rows than columns). 00189 00190 if (nrows < nrows_cache_block + ncols) 00191 { 00192 // The nrows by ncols matrix A is just big enough for one 00193 // cache block. Partition A == [A_top; A_bot] with A_top 00194 // empty and A_bot == A. 00195 return nrows; 00196 } 00197 else 00198 { 00199 const LocalOrdinal quotient = nrows / nrows_cache_block; 00200 const LocalOrdinal remainder = nrows - quotient * nrows_cache_block; 00201 00202 LocalOrdinal nrows_bottom; 00203 if (remainder == 0 || remainder < ncols) 00204 nrows_bottom = nrows_cache_block + remainder; 00205 else if (remainder >= ncols) 00206 nrows_bottom = remainder; 00207 else 00208 throw std::logic_error("Should never get here!"); 00209 return nrows_bottom; 00210 } 00211 } 00212 00213 00221 size_t 00222 default_cache_block_size (const size_t suggested_cache_size) const 00223 { 00224 if (suggested_cache_size == 0) 00225 { 00226 // 64 KB: reasonable default upper bound for L2 cache 00227 // capacity. 00228 const size_t default_cache_size = 65536; 00229 const size_t default_nwords = default_cache_size / sizeof(Scalar); 00230 00231 // We want to be able to work on two blocks of size at least 00232 // 4x4 in cache. Otherwise TSQR is more or less pointless. 00233 // If we can't do that, bail out; it's likely that Scalar is 00234 // the wrong data type for TSQR. 00235 if (default_nwords < 32) 00236 { 00237 std::ostringstream os; 00238 os << "Error: sizeof(Scalar) == " << sizeof(Scalar) 00239 << ", which is too large for us to pick a reasonable " 00240 << "default cache size."; 00241 throw std::range_error (os.str()); 00242 } 00243 else 00244 return default_cache_size; 00245 } 00246 else 00247 { 00248 const size_t nwords = suggested_cache_size / sizeof(Scalar); 00249 00250 // We want to be able to work on two blocks of size at least 00251 // 4x4 in cache. Otherwise TSQR is more or less pointless. 00252 // If we can't do that, bail out; it's likely that Scalar is 00253 // the wrong data type for TSQR. 00254 if (nwords < 32) 00255 { 00256 std::ostringstream os; 00257 os << "Error: suggested cache size " << suggested_cache_size 00258 << " bytes is too small for sizeof(Scalar) == " 00259 << sizeof(Scalar) << " bytes"; 00260 throw std::invalid_argument (os.str()); 00261 } 00262 else 00263 return suggested_cache_size; 00264 } 00265 } 00266 00267 00278 LocalOrdinal 00279 cache_block_num_rows (const LocalOrdinal ncols) const 00280 { 00281 // Suppose the cache can hold W words (of size sizeof(Scalar) 00282 // bytes each). We have to use the same number of rows per 00283 // cache block for both the factorization and applying the Q 00284 // factor. 00285 // 00286 // The factorization requires a working set of 00287 // ncols*(nrows_cache_block + ncols) + 2*ncols words: 00288 // 00289 // 1. One ncols by ncols R factor (not packed) 00290 // 2. One nrows_cache_block by ncols cache block 00291 // 3. tau array of length ncols 00292 // 4. workspace array of length ncols 00293 // 00294 // That means nrows_cache_block should be <= W/ncols - ncols - 2. 00295 // 00296 // Applying the Q factor to a matrix C with the same number of 00297 // columns as Q requires a working set of 00298 // 2*nrows_cache_block*ncols + ncols*ncols + 2*ncols 00299 // 00300 // 1. Cache block of Q: nrows_cache_block by ncols 00301 // 2. C_top block: ncols by ncols 00302 // 3. C_cur block: nrows_cache_block by ncols 00303 // 4. tau array of length ncols 00304 // 5. workspace array of length ncols 00305 // 00306 // That means nrows_cache_block should be <= (W/(2*N) - N/2 - 00307 // 1). Obviously this is smaller than for the factorization, so 00308 // we use this formula to pick nrows_cache_block. 00309 00310 const size_t cache_block_nwords = cache_block_size() / sizeof(Scalar); 00311 00312 // Compute everything in size_t first, and cast to LocalOrdinal 00313 // at the end. This may avoid overflow if the cache is very 00314 // large and/or LocalOrdinal is very small (say a short int). 00315 // 00316 // Also, since size_t is unsigned, make sure that the 00317 // subtractions don't make it negative. If it does, then either 00318 // ncols is too big or the cache is too small. 00319 00320 const size_t term1 = cache_block_nwords / (2*ncols); 00321 const size_t term2 = ncols / 2 + 1; 00322 if (term1 < term2) 00323 { 00324 std::ostringstream os; 00325 os << "Error: While deciding on the number of rows in a cache " 00326 "block for sequential TSQR, the specified number of columns " 00327 << ncols << " in the matrix to factor is too big for the " 00328 "specified cache size, which can hold " << cache_block_nwords 00329 << " words of size sizeof(Scalar) == " << sizeof(Scalar) 00330 << " each."; 00331 throw std::invalid_argument (os.str()); 00332 } 00333 else 00334 { 00335 // The compiler can easily prove that term1 - term2 >= 0, 00336 // since we've gotten to this point. Of course that's 00337 // assuming that C++ compilers are smart... 00338 const size_t nrows_cache_block = term1 - term2; 00339 00340 // Make sure that nrows_cache_block fits in a LocalOrdinal type. 00341 if (static_cast<size_t>(static_cast< LocalOrdinal >(nrows_cache_block)) != nrows_cache_block) 00342 { 00343 std::ostringstream os; 00344 os << "Error: While deciding on the number of rows in a cache " 00345 "block for sequential TSQR, the decided-upon number of rows " 00346 << nrows_cache_block << " does not fit in a LocalOrdinal " 00347 "type, whose max value is " 00348 << std::numeric_limits<LocalOrdinal>::max() << "."; 00349 throw std::range_error (os.str()); 00350 } 00351 else 00352 return static_cast< LocalOrdinal > (nrows_cache_block); 00353 } 00354 } 00355 00356 private: 00358 size_t cache_block_size_; 00359 }; 00360 } // namespace TSQR 00361 00362 #endif // __TSQR_CacheBlockingStrategy_hpp
1.7.4