|
Anasazi Version of the Day
|
00001 // @HEADER 00002 // *********************************************************************** 00003 // 00004 // Anasazi: Block Eigensolvers Package 00005 // Copyright (2010) Sandia Corporation 00006 // 00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive 00008 // license for use of this work by or on behalf of the U.S. Government. 00009 // 00010 // This library is free software; you can redistribute it and/or modify 00011 // it under the terms of the GNU Lesser General Public License as 00012 // published by the Free Software Foundation; either version 2.1 of the 00013 // License, or (at your option) any later version. 00014 // 00015 // This library is distributed in the hope that it will be useful, but 00016 // WITHOUT ANY WARRANTY; without even the implied warranty of 00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00018 // Lesser General Public License for more details. 00019 // 00020 // You should have received a copy of the GNU Lesser General Public 00021 // License along with this library; if not, write to the Free Software 00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 00023 // USA 00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 00025 // 00026 // *********************************************************************** 00027 // @HEADER 00028 00029 #ifndef __TSQR_TBB_FactorTask_hpp 00030 #define __TSQR_TBB_FactorTask_hpp 00031 00032 #include <tbb/task.h> 00033 #include <TbbTsqr_Partitioner.hpp> 00034 #include <Tsqr_SequentialTsqr.hpp> 00035 #include <algorithm> 00036 00039 00040 namespace TSQR { 00041 namespace TBB { 00042 00043 template< class LocalOrdinal, class Scalar, class TimerType > 00044 class FactorTask : public tbb::task { 00045 public: 00046 typedef MatView< LocalOrdinal, Scalar > mat_view; 00047 typedef ConstMatView< LocalOrdinal, Scalar > const_mat_view; 00048 typedef std::pair< mat_view, mat_view > split_t; 00049 typedef std::pair< const_mat_view, const_mat_view > const_split_t; 00050 00052 typedef typename SequentialTsqr< LocalOrdinal, Scalar >::FactorOutput SeqOutput; 00055 typedef std::vector< std::vector< Scalar > > ParOutput; 00059 typedef typename std::pair< std::vector< SeqOutput >, ParOutput > FactorOutput; 00060 00061 FactorTask (const size_t P_first__, 00062 const size_t P_last__, 00063 mat_view A, 00064 mat_view* const A_top_ptr, 00065 std::vector< SeqOutput >& seq_outputs, 00066 ParOutput& par_output, 00067 const SequentialTsqr< LocalOrdinal, Scalar > seq, 00068 double& my_seq_timing, 00069 double& min_seq_timing, 00070 double& max_seq_timing, 00071 const bool contiguous_cache_blocks) : 00072 P_first_ (P_first__), 00073 P_last_ (P_last__), 00074 A_ (A), 00075 A_top_ptr_ (A_top_ptr), 00076 seq_outputs_ (seq_outputs), 00077 par_output_ (par_output), 00078 seq_ (seq), 00079 contiguous_cache_blocks_ (contiguous_cache_blocks), 00080 my_seq_timing_ (my_seq_timing), 00081 min_seq_timing_ (min_seq_timing), 00082 max_seq_timing_ (max_seq_timing) 00083 {} 00084 00085 tbb::task* execute () { 00086 if (P_first_ > P_last_ || A_.empty()) 00087 return NULL; 00088 else if (P_first_ == P_last_) 00089 { 00090 execute_base_case (); 00091 return NULL; 00092 } 00093 else 00094 { 00095 // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last] 00096 const size_t P_mid = (P_first_ + P_last_) / 2; 00097 split_t A_split = 00098 partitioner_.split (A_, P_first_, P_mid, P_last_, 00099 contiguous_cache_blocks_); 00100 00101 double top_timing; 00102 double top_min_timing = 0.0; 00103 double top_max_timing = 0.0; 00104 double bot_timing; 00105 double bot_min_timing = 0.0; 00106 double bot_max_timing = 0.0; 00107 00108 FactorTask& topTask = *new( allocate_child() ) 00109 FactorTask (P_first_, P_mid, A_split.first, A_top_ptr_, 00110 seq_outputs_, par_output_, seq_, 00111 top_timing, top_min_timing, top_max_timing, 00112 contiguous_cache_blocks_); 00113 // After the task finishes, A_bot will be set to the topmost 00114 // partition of A_split.second. This will let us combine 00115 // the two subproblems (using factor_pair()) after their 00116 // tasks complete. 00117 mat_view A_bot; 00118 FactorTask& botTask = *new( allocate_child() ) 00119 FactorTask (P_mid+1, P_last_, A_split.second, &A_bot, 00120 seq_outputs_, par_output_, seq_, 00121 bot_timing, bot_min_timing, bot_max_timing, 00122 contiguous_cache_blocks_); 00123 set_ref_count (3); // 3 children (2 + 1 for the wait) 00124 spawn (topTask); 00125 spawn_and_wait_for_all (botTask); 00126 00127 // Combine the two results 00128 factor_pair (P_first_, P_mid+1, *A_top_ptr_, A_bot); 00129 00130 top_min_timing = (top_min_timing == 0.0) ? top_timing : top_min_timing; 00131 top_max_timing = (top_max_timing == 0.0) ? top_timing : top_max_timing; 00132 00133 bot_min_timing = (bot_min_timing == 0.0) ? bot_timing : bot_min_timing; 00134 bot_max_timing = (bot_max_timing == 0.0) ? bot_timing : bot_max_timing; 00135 00136 min_seq_timing_ = std::min (top_min_timing, bot_min_timing); 00137 max_seq_timing_ = std::min (top_max_timing, bot_max_timing); 00138 00139 return NULL; 00140 } 00141 } 00142 00143 private: 00144 const size_t P_first_, P_last_; 00145 mat_view A_; 00146 mat_view* const A_top_ptr_; 00147 std::vector< SeqOutput >& seq_outputs_; 00148 ParOutput& par_output_; 00149 SequentialTsqr< LocalOrdinal, Scalar > seq_; 00150 TSQR::Combine< LocalOrdinal, Scalar > combine_; 00151 Partitioner< LocalOrdinal, Scalar > partitioner_; 00152 const bool contiguous_cache_blocks_; 00153 double& my_seq_timing_; 00154 double& min_seq_timing_; 00155 double& max_seq_timing_; 00156 00157 void 00158 factor_pair (const size_t P_top, 00159 const size_t P_bot, 00160 mat_view& A_top, // different than A_top_ 00161 mat_view& A_bot) 00162 { 00163 if (P_top == P_bot) 00164 throw std::logic_error("factor_pair: should never get here!"); 00165 00166 // We only read and write the upper ncols x ncols triangle of 00167 // each block. 00168 const LocalOrdinal ncols = A_top.ncols(); 00169 if (A_bot.ncols() != ncols) 00170 throw std::logic_error("A_bot.ncols() != A_top.ncols()"); 00171 00172 std::vector< Scalar >& tau = par_output_[P_bot]; 00173 std::vector< Scalar > work (ncols); 00174 combine_.factor_pair (ncols, A_top.get(), A_top.lda(), 00175 A_bot.get(), A_bot.lda(), &tau[0], &work[0]); 00176 } 00177 00178 void 00179 execute_base_case () 00180 { 00181 TimerType timer(""); 00182 timer.start(); 00183 seq_outputs_[P_first_] = 00184 seq_.factor (A_.nrows(), A_.ncols(), A_.get(), 00185 A_.lda(), contiguous_cache_blocks_); 00186 // Assign the topmost cache block of the current partition to 00187 // *A_top_ptr_. Every base case invocation does this, so that 00188 // we can combine subproblems. The root task also does this, 00189 // but for a different reason: so that we can extract the R 00190 // factor, once we're done with the factorization. 00191 *A_top_ptr_ = seq_.top_block (A_, contiguous_cache_blocks_); 00192 my_seq_timing_ = timer.stop(); 00193 } 00194 }; 00195 } // namespace TBB 00196 } // namespace TSQR 00197 00198 #endif // __TSQR_TBB_FactorTask_hpp
1.7.4