|
Teuchos Package Browser (Single Doxygen Collection) Version of the Day
|
// @HEADER
// ***********************************************************************
//
//                    Teuchos: Common Tools Package
//                 Copyright (2004) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// This library is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as
// published by the Free Software Foundation; either version 2.1 of the
// License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
// USA
// Questions? Contact Michael A.
Heroux (maherou@sandia.gov) 00025 // 00026 // *********************************************************************** 00027 // @HEADER 00028 00029 #include "Teuchos_GlobalMPISession.hpp" 00030 #include "Teuchos_MPIComm.hpp" 00031 #include "Teuchos_ErrorPolling.hpp" 00032 #include "Teuchos_Version.hpp" 00033 00034 using namespace Teuchos; 00035 using std::string; 00036 00037 /* \example Test of polling for exceptions on other processors */ 00038 00039 int main( int argc, char* argv[] ) 00040 { 00041 /* return value */ 00042 int state=0; 00043 00044 Teuchos::GlobalMPISession mpiSession(&argc, &argv); 00045 00046 std::cout << Teuchos::Teuchos_Version() << std::endl << std::endl; 00047 00048 try 00049 { 00050 00051 MPIComm comm = MPIComm::world(); 00052 00053 00054 /*----- Demonstrate detection of an off-processor error -------- */ 00055 00056 try 00057 { 00058 /* Try some code that will fail on one of the processors */ 00059 try 00060 { 00061 /* Generate an std::exception on proc 1 */ 00062 TEST_FOR_EXCEPTION(comm.getRank()==1, std::runtime_error, 00063 "std::exception [expected] detected on proc=" 00064 << comm.getRank()); 00065 /* On all other procs, do some calculation */ 00066 double x=0; 00067 for (int i=0; i<100; i++) x += i; 00068 00069 } 00070 catch(std::exception& ex1) 00071 { 00072 /* If we catch an std::exception, report the failure to the other 00073 * processors. This call to reportFailure() must be 00074 * paired with a call to pollForFailures() in the 00075 * branch that did not detect an std::exception. 00076 */ 00077 ErrorPolling::reportFailure(comm); 00078 TEUCHOS_TRACE(ex1); 00079 } 00080 00081 /* 00082 * Here we poll for the state of other processors. If all processors 00083 * report OK, pollForFailures() will return zero and an 00084 * std::exception will not be thrown. If another 00085 * processor has called reportFailure(), then pollForFailures() 00086 * will return a nonzero number and an std::exception will be thrown. 
00087 */ 00088 TEST_FOR_EXCEPTION(ErrorPolling::pollForFailures(comm), 00089 std::runtime_error, 00090 "off-processor error [expected] detected " 00091 "on proc=" << comm.getRank()); 00092 00093 00094 00095 /* Do a collective operation. In the present example, 00096 * this code should never be reached 00097 * because all processors should have detected either a local 00098 * std::exception or a remote std::exception. */ 00099 std::cerr << "this is bad! Processor=" << comm.getRank() 00100 << "should not have reached this point" << std::endl; 00101 00102 /* report the bad news to the testharness 00103 * using the return value... */ 00104 state = 1; 00105 00106 /* Throw an std::exception. This is not a drill!!! */ 00107 TEST_FOR_EXCEPTION(state, std::runtime_error, 00108 "std::exception [UNEXPECTED!!!] detected in test " 00109 "of polling on processor=" << comm.getRank()); 00110 00111 /* This collective operation would fail if executed here, because 00112 * one of the processors has thrown an std::exception and never 00113 * reached this point. Good thing we've polled for errors! */ 00114 int x=comm.getRank(); 00115 int sum; 00116 comm.allReduce( (void*) &x, (void*) &sum, 1, MPIComm::INT, 00117 MPIComm::SUM); 00118 std::cerr << "sum=" << sum << std::endl; 00119 } 00120 catch(std::exception& ex) 00121 { 00122 std::cerr << ex.what() << std::endl; 00123 } 00124 00125 std::cerr << "p=" << MPIComm::world().getRank() 00126 << ": std::exception polling successful" << std::endl; 00127 00128 00129 /*-- Demonstrate safe pass-through when no off-proc error happens --- */ 00130 00131 try 00132 { 00133 /* Try some code that will not fail on any processors */ 00134 try 00135 { 00136 /* On all procs, do some foolproof calculation */ 00137 double x=0; 00138 for (int i=0; i<100; i++) x += i; 00139 00140 } 00141 catch(std::exception& ex1) 00142 { 00143 /* If we catch an std::exception, report the failure to the other 00144 * processors. 
This call to reportFailure() must be 00145 * paired with a call to pollForFailures() in the 00146 * branch that did not detect an std::exception. 00147 */ 00148 ErrorPolling::reportFailure(comm); 00149 TEUCHOS_TRACE(ex1); 00150 } 00151 00152 /* 00153 * Here we poll for the state of other processors. If all processors 00154 * report OK, pollForFailures() will return zero and an 00155 * std::exception will not be thrown. If another 00156 * processor has called reportFailure(), then pollForFailures() 00157 * will return a nonzero number and an std::exception will be thrown. 00158 */ 00159 TEST_FOR_EXCEPTION(ErrorPolling::pollForFailures(comm), 00160 std::runtime_error, 00161 "off-processor error [UNEXPECTED!!!] detected " 00162 "on proc=" << comm.getRank()); 00163 00164 00165 00166 /* 00167 * Do a collective operation. In the present example, 00168 * this code will be reached on all processors because 00169 * no std::exception has been thrown by any processor. 00170 */ 00171 std::cerr << "Processor=" << comm.getRank() 00172 << "ready to do collective operation" << std::endl; 00173 00174 /* 00175 * This collective operation is safe because we have polled 00176 * all processors and known that everyone is still up and running. 00177 */ 00178 int x=comm.getRank(); 00179 int sum; 00180 comm.allReduce( (void*) &x, (void*) &sum, 1, MPIComm::INT, 00181 MPIComm::SUM); 00182 if (comm.getRank()==0) std::cerr << "sum=" << sum << std::endl; 00183 } 00184 catch(std::exception& ex) 00185 { 00186 std::cerr << "std::exception [UNEXPECTED!!!] detected" << std::endl; 00187 std::cerr << ex.what() << std::endl; 00188 state = 1; 00189 } 00190 } 00191 catch(std::exception& e) 00192 { 00193 std::cerr << e.what() << std::endl; 00194 state = 1; 00195 } 00196 00197 return state; 00198 00199 }
1.7.4