Tpetra_Distributor.hpp
// @HEADER
// ***********************************************************************
//
//          Tpetra: Templated Linear Algebra Services Package
//                 Copyright (2008) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
// @HEADER

#ifndef TPETRA_DISTRIBUTOR_HPP
#define TPETRA_DISTRIBUTOR_HPP

#include "Tpetra_Util.hpp"
#include <Teuchos_as.hpp>
#include <Teuchos_Describable.hpp>
#include <Teuchos_ParameterListAcceptorDefaultBase.hpp>
#include <Teuchos_VerboseObject.hpp>

// If TPETRA_DISTRIBUTOR_TIMERS is defined, Distributor will time
// doPosts (both versions) and doWaits, and register those timers with
// Teuchos::TimeMonitor so that summarize() or report() will show
// results.

// #ifndef TPETRA_DISTRIBUTOR_TIMERS
// #  define TPETRA_DISTRIBUTOR_TIMERS 1
// #endif // TPETRA_DISTRIBUTOR_TIMERS

#ifdef TPETRA_DISTRIBUTOR_TIMERS
#  undef TPETRA_DISTRIBUTOR_TIMERS
#endif // TPETRA_DISTRIBUTOR_TIMERS

#if TPETRA_USE_KOKKOS_DISTOBJECT || defined(TPETRA_HAVE_KOKKOS_REFACTOR)
#include "KokkosCompat_View.hpp"
#include "Kokkos_View.hpp"
#include "Kokkos_TeuchosCommAdapters.hpp"
#endif


namespace Tpetra {

  namespace Details {
    enum EDistributorSendType {
      DISTRIBUTOR_ISEND, // Use MPI_Isend (Teuchos::isend)
      DISTRIBUTOR_RSEND, // Use MPI_Rsend (Teuchos::readySend)
      DISTRIBUTOR_SEND,  // Use MPI_Send (Teuchos::send)
      DISTRIBUTOR_SSEND  // Use MPI_Ssend (Teuchos::ssend)
    };

    std::string
    DistributorSendTypeEnumToString (EDistributorSendType sendType);

    enum EDistributorHowInitialized {
      DISTRIBUTOR_NOT_INITIALIZED, // Not initialized yet
      DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS, // By createFromSends
      DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS, // By createFromRecvs
      DISTRIBUTOR_INITIALIZED_BY_REVERSE, // By createReverseDistributor
      DISTRIBUTOR_INITIALIZED_BY_COPY // By copy constructor
    };

    std::string
    DistributorHowInitializedEnumToString (EDistributorHowInitialized how);

  } // namespace Details

  Array<std::string> distributorSendTypes ();

  class Distributor :
    public Teuchos::Describable,
    public Teuchos::ParameterListAcceptorDefaultBase,
    public Teuchos::VerboseObject<Distributor> {
  public:


    explicit Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm);

    Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
                 const Teuchos::RCP<Teuchos::FancyOStream>& out);

    Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
                 const Teuchos::RCP<Teuchos::ParameterList>& plist);

    Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
                 const Teuchos::RCP<Teuchos::FancyOStream>& out,
                 const Teuchos::RCP<Teuchos::ParameterList>& plist);

    Distributor (const Distributor &distributor);

    virtual ~Distributor ();

    void swap (Distributor& rhs);




    void setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& plist);

    Teuchos::RCP<const Teuchos::ParameterList> getValidParameters () const;




    size_t createFromSends (const ArrayView<const int>& exportNodeIDs);
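
    // A minimal sketch (an illustration, not from the original header)
    // of createFromSends(): each process sends one entry to the next
    // process, with wraparound.  The return value is the number of
    // entries this process will receive.
    //
    //   Tpetra::Distributor distributor (comm);
    //   const int myRank = comm->getRank ();
    //   const int numProcs = comm->getSize ();
    //   Teuchos::Array<int> exportNodeIDs (1, (myRank + 1) % numProcs);
    //   const size_t numImports = distributor.createFromSends (exportNodeIDs ());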

    template <class Ordinal>
    void
    createFromRecvs (const ArrayView<const Ordinal>& remoteIDs,
                     const ArrayView<const int>& remoteNodeIDs,
                     Array<Ordinal>& exportIDs,
                     Array<int>& exportNodeIDs);
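
    // A minimal sketch of createFromRecvs(), mirroring the
    // createFromSends() sketch above but on a freshly constructed
    // Distributor: specify what to receive and from whom, and let the
    // Distributor compute the matching export lists by communication.
    // Ordinal = long is an arbitrary choice for illustration.
    //
    //   // Receive one entry, owned by the previous process (wraparound).
    //   Teuchos::Array<long> remoteIDs (1, static_cast<long> (myRank));
    //   Teuchos::Array<int> remoteNodeIDs (1, (myRank + numProcs - 1) % numProcs);
    //   Teuchos::Array<long> exportIDs;
    //   Teuchos::Array<int> exportNodeIDs;
    //   distributor.createFromRecvs<long> (remoteIDs ().getConst (),
    //                                      remoteNodeIDs ().getConst (),
    //                                      exportIDs, exportNodeIDs);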




    size_t getNumReceives() const;

    size_t getNumSends() const;

    bool hasSelfMessage() const;

    size_t getMaxSendLength() const;

    size_t getTotalReceiveLength() const;

    ArrayView<const int> getImagesFrom() const;

    ArrayView<const int> getImagesTo() const;

    ArrayView<const size_t> getLengthsFrom() const;

    ArrayView<const size_t> getLengthsTo() const;

    Details::EDistributorHowInitialized howInitialized () const {
      return howInitialized_;
    }




    RCP<Distributor> getReverse() const;




    template <class Packet>
    void
    doPostsAndWaits (const ArrayView<const Packet> &exports,
                     size_t numPackets,
                     const ArrayView<Packet> &imports);
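
    // A minimal sketch of a complete blocking exchange, continuing the
    // createFromSends() sketch above: ship one double per export entry
    // (numPackets = 1) in a single call.
    //
    //   Teuchos::Array<double> exports (1, static_cast<double> (myRank));
    //   Teuchos::Array<double> imports (numImports);
    //   distributor.doPostsAndWaits<double> (exports ().getConst (), 1, imports ());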

    template <class Packet>
    void
    doPostsAndWaits (const ArrayView<const Packet> &exports,
                     const ArrayView<size_t> &numExportPacketsPerLID,
                     const ArrayView<Packet> &imports,
                     const ArrayView<size_t> &numImportPacketsPerLID);

    template <class Packet>
    void
    doPosts (const ArrayRCP<const Packet> &exports,
             size_t numPackets,
             const ArrayRCP<Packet> &imports);

    template <class Packet>
    void
    doPosts (const ArrayRCP<const Packet> &exports,
             const ArrayView<size_t> &numExportPacketsPerLID,
             const ArrayRCP<Packet> &imports,
             const ArrayView<size_t> &numImportPacketsPerLID);

    void doWaits ();
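
    // A minimal sketch of the nonblocking pattern that doPosts() and
    // doWaits() enable: post the messages, overlap unrelated local
    // work, then wait.  Here exportsArcp and importsArcp are assumed
    // to be persisting Teuchos::ArrayRCP<double> buffers, sized as in
    // the blocking sketch above; both must stay alive (and imports
    // must not be read) until doWaits() returns.
    //
    //   distributor.doPosts<double> (exportsArcp, 1, importsArcp);
    //   // ... unrelated local computation here ...
    //   distributor.doWaits ();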

    template <class Packet>
    void
    doReversePostsAndWaits (const ArrayView<const Packet> &exports,
                            size_t numPackets,
                            const ArrayView<Packet> &imports);

    template <class Packet>
    void
    doReversePostsAndWaits (const ArrayView<const Packet> &exports,
                            const ArrayView<size_t> &numExportPacketsPerLID,
                            const ArrayView<Packet> &imports,
                            const ArrayView<size_t> &numImportPacketsPerLID);

    template <class Packet>
    void
    doReversePosts (const ArrayRCP<const Packet> &exports,
                    size_t numPackets,
                    const ArrayRCP<Packet> &imports);

    template <class Packet>
    void
    doReversePosts (const ArrayRCP<const Packet> &exports,
                    const ArrayView<size_t> &numExportPacketsPerLID,
                    const ArrayRCP<Packet> &imports,
                    const ArrayView<size_t> &numImportPacketsPerLID);

    void doReverseWaits ();
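
    // A minimal sketch of reverse mode, again continuing the sketches
    // above: data flow back along the plan, so the roles of exports
    // and imports swap (this process now sends numImports entries and
    // receives the one entry it originally exported).
    //
    //   Teuchos::Array<double> revExports (numImports, 1.0);
    //   Teuchos::Array<double> revImports (1);
    //   distributor.doReversePostsAndWaits<double> (revExports ().getConst (), 1,
    //                                               revImports ());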

#if TPETRA_USE_KOKKOS_DISTOBJECT || defined(TPETRA_HAVE_KOKKOS_REFACTOR)

    template <class Packet, class Layout, class Device, class Mem>
    void
    doPostsAndWaits (
      const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
      size_t numPackets,
      const Kokkos::View<Packet*, Layout, Device, Mem> &imports);

    template <class Packet, class Layout, class Device, class Mem>
    void
    doPostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
                     const ArrayView<size_t> &numExportPacketsPerLID,
                     const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
                     const ArrayView<size_t> &numImportPacketsPerLID);

    template <class Packet, class Layout, class Device, class Mem>
    void
    doPosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
             size_t numPackets,
             const Kokkos::View<Packet*, Layout, Device, Mem> &imports);

    template <class Packet, class Layout, class Device, class Mem>
    void
    doPosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
             const ArrayView<size_t> &numExportPacketsPerLID,
             const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
             const ArrayView<size_t> &numImportPacketsPerLID);

    template <class Packet, class Layout, class Device, class Mem>
    void
    doReversePostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
                            size_t numPackets,
                            const Kokkos::View<Packet*, Layout, Device, Mem> &imports);

    template <class Packet, class Layout, class Device, class Mem>
    void
    doReversePostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
                            const ArrayView<size_t> &numExportPacketsPerLID,
                            const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
                            const ArrayView<size_t> &numImportPacketsPerLID);

    template <class Packet, class Layout, class Device, class Mem>
    void
    doReversePosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
                    size_t numPackets,
                    const Kokkos::View<Packet*, Layout, Device, Mem> &imports);

    template <class Packet, class Layout, class Device, class Mem>
    void
    doReversePosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
                    const ArrayView<size_t> &numExportPacketsPerLID,
                    const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
                    const ArrayView<size_t> &numImportPacketsPerLID);

#endif




    std::string description() const;

    void describe (Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const;
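
    // A minimal sketch of getting verbose output from describe():
    // wrap std::cout in a Teuchos::FancyOStream and ask for maximum
    // verbosity.
    //
    //   distributor.describe (*Teuchos::getFancyOStream (Teuchos::rcpFromRef (std::cout)),
    //                         Teuchos::VERB_EXTREME);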


  private:
    RCP<const Comm<int> > comm_;

    Teuchos::RCP<Teuchos::FancyOStream> out_;

    Details::EDistributorHowInitialized howInitialized_;



    Details::EDistributorSendType sendType_;

    bool barrierBetween_;

    bool debug_;


    bool enable_cuda_rdma_;


    size_t numExports_;

    bool selfMessage_;

    size_t numSends_;

    Array<int> imagesTo_;

    Array<size_t> startsTo_;

    Array<size_t> lengthsTo_;

    size_t maxSendLength_;

    Array<size_t> indicesTo_;

    size_t numReceives_;

    size_t totalReceiveLength_;

    Array<size_t> lengthsFrom_;

    Array<int> imagesFrom_;

    Array<size_t> startsFrom_;

    Array<size_t> indicesFrom_;

    Array<RCP<Teuchos::CommRequest<int> > > requests_;

    mutable RCP<Distributor> reverseDistributor_;

#ifdef TPETRA_DISTRIBUTOR_TIMERS
    Teuchos::RCP<Teuchos::Time> timer_doPosts3_;
    Teuchos::RCP<Teuchos::Time> timer_doPosts4_;
    Teuchos::RCP<Teuchos::Time> timer_doWaits_;
    Teuchos::RCP<Teuchos::Time> timer_doPosts3_recvs_;
    Teuchos::RCP<Teuchos::Time> timer_doPosts4_recvs_;
    Teuchos::RCP<Teuchos::Time> timer_doPosts3_barrier_;
    Teuchos::RCP<Teuchos::Time> timer_doPosts4_barrier_;
    Teuchos::RCP<Teuchos::Time> timer_doPosts3_sends_;
    Teuchos::RCP<Teuchos::Time> timer_doPosts4_sends_;

    void makeTimers ();
#endif // TPETRA_DISTRIBUTOR_TIMERS

    bool useDistinctTags_;

    int getTag (const int pathTag) const;

    void
    init (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
          const Teuchos::RCP<Teuchos::ParameterList>& plist);

    void computeReceives ();

    template <class Ordinal>
    void computeSends (const ArrayView<const Ordinal> &importIDs,
                       const ArrayView<const int> &importNodeIDs,
                       Array<Ordinal> &exportIDs,
                       Array<int> &exportNodeIDs);

    void createReverseDistributor() const;

  }; // class Distributor


  template <class Packet>
  void Distributor::
  doPostsAndWaits (const ArrayView<const Packet>& exports,
                   size_t numPackets,
                   const ArrayView<Packet>& imports)
  {
    using Teuchos::as;
    using Teuchos::arcp;
    using Teuchos::ArrayRCP;

    TEUCHOS_TEST_FOR_EXCEPTION(requests_.size() != 0, std::runtime_error,
      Teuchos::typeName(*this) << "::doPostsAndWaits(): There are "
      << requests_.size() << " outstanding nonblocking messages pending.  It is "
      "incorrect to call doPostsAndWaits with posts outstanding.");

    // doPosts() accepts the exports and imports arrays as ArrayRCPs,
    // requiring that the memory persist (as is necessary for
    // nonblocking receives).  However, it need only persist until
    // doWaits() completes, so it is safe for us to use a
    // nonpersisting reference in this case.  The use of a
    // nonpersisting reference is purely a performance optimization.

    typedef typename ArrayRCP<const Packet>::size_type size_type;
    //const Packet* exportsPtr = exports.getRawPtr();
    //ArrayRCP<const Packet> exportsArcp (exportsPtr, as<size_type> (0), exports.size(), false);
    ArrayRCP<const Packet> exportsArcp (exports.getRawPtr(), as<size_type> (0),
                                        exports.size(), false);

    // For some reason, neither of the options below (that use arcp)
    // compile for Packet=std::complex<double> with GCC 4.5.1.  The
    // issue only arises with the exports array.  This is why we
    // construct a separate nonowning ArrayRCP.

    // doPosts (arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false),
    //              numPackets,
    //              arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false));
    // doPosts (arcp<const Packet> (exportsPtr, 0, exports.size(), false),
    //              numPackets,
    //              arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false));
    doPosts (exportsArcp,
             numPackets,
             arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false));
    doWaits();
  }

  template <class Packet>
  void Distributor::
  doPostsAndWaits (const ArrayView<const Packet>& exports,
                   const ArrayView<size_t> &numExportPacketsPerLID,
                   const ArrayView<Packet> &imports,
                   const ArrayView<size_t> &numImportPacketsPerLID)
  {
    using Teuchos::arcp;
    using Teuchos::ArrayRCP;
    using Teuchos::as;

    TEUCHOS_TEST_FOR_EXCEPTION(requests_.size() != 0, std::runtime_error,
      Teuchos::typeName(*this) << "::doPostsAndWaits(): There are "
      << requests_.size() << " outstanding nonblocking messages pending.  It is "
      "incorrect to call doPostsAndWaits with posts outstanding.");

    // doPosts() accepts the exports and imports arrays as ArrayRCPs,
    // requiring that the memory persist (as is necessary for
    // nonblocking receives).  However, it need only persist until
    // doWaits() completes, so it is safe for us to use a
    // nonpersisting reference in this case.

    // mfh 04 Apr 2012: For some reason, calling arcp<const Packet>
    // for Packet=std::complex<T> (e.g., T=float) fails to compile
    // with some versions of GCC.  The issue only arises with the
    // exports array.  This is why we construct a separate nonowning
    // ArrayRCP.
    typedef typename ArrayRCP<const Packet>::size_type size_type;
    ArrayRCP<const Packet> exportsArcp (exports.getRawPtr(), as<size_type> (0),
                                        exports.size(), false);
    // mfh 04 Apr 2012: This is the offending code.  This statement
    // would normally be in place of "exportsArcp" in the
    // doPosts() call below.
    //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false),
    doPosts (exportsArcp,
             numExportPacketsPerLID,
             arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false),
             numImportPacketsPerLID);
    doWaits();
  }


  template <class Packet>
  void Distributor::
  doPosts (const ArrayRCP<const Packet>& exports,
           size_t numPackets,
           const ArrayRCP<Packet>& imports)
  {
    using Teuchos::Array;
    using Teuchos::as;
    using Teuchos::FancyOStream;
    using Teuchos::includesVerbLevel;
    using Teuchos::ireceive;
    using Teuchos::isend;
    using Teuchos::OSTab;
    using Teuchos::readySend;
    using Teuchos::send;
    using Teuchos::ssend;
    using Teuchos::TypeNameTraits;
    using Teuchos::typeName;
    using std::endl;
    typedef Array<size_t>::size_type size_type;

    Teuchos::OSTab tab (out_);

#ifdef TPETRA_DISTRIBUTOR_TIMERS
    Teuchos::TimeMonitor timeMon (*timer_doPosts3_);
#endif // TPETRA_DISTRIBUTOR_TIMERS

    // Run-time configurable parameters that come from the input
    // ParameterList set by setParameterList().
    const Details::EDistributorSendType sendType = sendType_;
    const bool doBarrier = barrierBetween_;

// #ifdef HAVE_TEUCHOS_DEBUG
//     // Prepare for verbose output, if applicable.
//     Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
//     (void) verbLevel; // Silence "unused variable" compiler warning.
//     RCP<FancyOStream> out = this->getOStream ();
//     // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
//     //   includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
//     const bool doPrint = out.get () && (comm_->getRank () == 0);

//     if (doPrint) {
//       // Only need one process to print out parameters.
//       *out << "Distributor::doPosts (3 args)" << endl;
//     }
//     // Add one tab level.  We declare this outside the doPrint scopes
//     // so that the tab persists until the end of this method.
//     OSTab tab = this->getOSTab ();
//     if (doPrint) {
//       *out << "Parameters:" << endl;
//       {
//         OSTab tab2 (out);
//         *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
//              << endl << "barrierBetween: " << doBarrier << endl;
//       }
//     }
// #endif // HAVE_TEUCHOS_DEBUG

    TEUCHOS_TEST_FOR_EXCEPTION(
      sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
      std::logic_error, "Ready send implementation requires a barrier between "
      "posting receives and posting ready sends.  This should have been checked "
      "before.  Please report this bug to the Tpetra developers.");

    const int myImageID = comm_->getRank();
    size_t selfReceiveOffset = 0;

    // Each message has the same number of packets.
    const size_t totalNumImportPackets = totalReceiveLength_ * numPackets;
    TEUCHOS_TEST_FOR_EXCEPTION(
      static_cast<size_t> (imports.size ()) != totalNumImportPackets,
      std::invalid_argument, "Tpetra::Distributor::doPosts<" <<
      TypeNameTraits<Packet>::name () << ">(3 args): imports must be "
      "large enough to store the imported data.  imports.size() = "
      << imports.size() << ", but total number of import packets = "
      << totalNumImportPackets << ".");

    // MPI tag for nonblocking receives and blocking sends in this
    // method.  Some processes might take the "fast" path
    // (indicesTo_.empty()) and others might take the "slow" path for
    // the same doPosts() call, so the path tag must be the same for
    // both.
    const int pathTag = 0;
    const int tag = this->getTag (pathTag);

    if (debug_) {
      TEUCHOS_TEST_FOR_EXCEPTION(
        requests_.size () != 0, std::logic_error, "doPosts(3): Process "
        << myImageID << ": requests_.size() = " << requests_.size ()
        << " != 0.");
      std::ostringstream os;
      os << myImageID << ": doPosts(3,"
         << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
      *out_ << os.str ();
    }

    // Distributor uses requests_.size() as the number of outstanding
    // nonblocking message requests, so we resize to zero to maintain
    // this invariant.
    //
    // numReceives_ does _not_ include the self message, if there is
    // one.  Here, we do actually send a message to ourselves, so we
    // include any self message in the "actual" number of receives to
    // post.
    //
    // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
    // doesn't (re)allocate its array of requests.  That happens in
    // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
    // demand), or Resize_().
    const size_type actualNumReceives = as<size_type> (numReceives_) +
      as<size_type> (selfMessage_ ? 1 : 0);
    requests_.resize (0);

    // Post the nonblocking receives.  It's common MPI wisdom to post
    // receives before sends.  In MPI terms, this means favoring
    // adding to the "posted queue" (of receive requests) over adding
    // to the "unexpected queue" (of arrived messages not yet matched
    // with a receive).
    {
#ifdef TPETRA_DISTRIBUTOR_TIMERS
      Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3_recvs_);
#endif // TPETRA_DISTRIBUTOR_TIMERS

      size_t curBufferOffset = 0;
      for (size_type i = 0; i < actualNumReceives; ++i) {
        if (imagesFrom_[i] != myImageID) {
          // If my process is receiving these packet(s) from another
          // process (not a self-receive):
          //
          // 1. Set up the persisting view (recvBuf) of the imports
          //    array, given the offset and size (total number of
          //    packets from process imagesFrom_[i]).
          // 2. Start the Irecv and save the resulting request.
          ArrayRCP<Packet> recvBuf =
            imports.persistingView (curBufferOffset, lengthsFrom_[i]*numPackets);
          requests_.push_back (ireceive<int, Packet> (recvBuf, imagesFrom_[i],
                                                      tag, *comm_));
          if (debug_) {
            std::ostringstream os;
            os << myImageID << ": doPosts(3,"
               << (indicesTo_.empty () ? "fast" : "slow") << "): "
               << "Posted irecv from Proc " << imagesFrom_[i] << " with "
              "specified tag " << tag << endl;
            *out_ << os.str ();
          }
        }
        else { // Receiving from myself
          selfReceiveOffset = curBufferOffset; // Remember the self-recv offset
        }
        curBufferOffset += lengthsFrom_[i]*numPackets;
      }
    }

    if (doBarrier) {
#ifdef TPETRA_DISTRIBUTOR_TIMERS
      Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3_barrier_);
#endif // TPETRA_DISTRIBUTOR_TIMERS
      // If we are using ready sends (MPI_Rsend) below, we need to do
      // a barrier before we post the ready sends.  This is because a
      // ready send requires that its matching receive has already
      // been posted before the send has been posted.  The only way to
      // guarantee that in this case is to use a barrier.
      comm_->barrier ();
    }

#ifdef TPETRA_DISTRIBUTOR_TIMERS
    Teuchos::TimeMonitor timeMonSends (*timer_doPosts3_sends_);
#endif // TPETRA_DISTRIBUTOR_TIMERS

    // setup scan through imagesTo_ list starting with higher numbered images
    // (should help balance message traffic)
    //
    // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
    // It doesn't depend on the input at all.
    size_t numBlocks = numSends_ + selfMessage_;
    size_t imageIndex = 0;
    while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) {
      ++imageIndex;
    }
    if (imageIndex == numBlocks) {
      imageIndex = 0;
    }

    size_t selfNum = 0;
    size_t selfIndex = 0;

    if (indicesTo_.empty()) {
      if (debug_) {
        std::ostringstream os;
        os << myImageID << ": doPosts(3,fast): posting sends" << endl;
        *out_ << os.str ();
      }

      // Data are already blocked (laid out) by process, so we don't
      // need a separate send buffer (besides the exports array).
      for (size_t i = 0; i < numBlocks; ++i) {
        size_t p = i + imageIndex;
        if (p > (numBlocks - 1)) {
          p -= numBlocks;
        }

        if (imagesTo_[p] != myImageID) {
          ArrayView<const Packet> tmpSend =
            exports.view (startsTo_[p]*numPackets, lengthsTo_[p]*numPackets);

          if (sendType == Details::DISTRIBUTOR_SEND) {
            send<int, Packet> (tmpSend.getRawPtr (),
                               as<int> (tmpSend.size ()),
                               imagesTo_[p], tag, *comm_);
          }
          else if (sendType == Details::DISTRIBUTOR_ISEND) {
            ArrayRCP<const Packet> tmpSendBuf =
              exports.persistingView (startsTo_[p] * numPackets,
                                      lengthsTo_[p] * numPackets);
            requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p],
                                                     tag, *comm_));
          }
          else if (sendType == Details::DISTRIBUTOR_RSEND) {
            readySend<int, Packet> (tmpSend.getRawPtr (),
                                    as<int> (tmpSend.size ()),
                                    imagesTo_[p], tag, *comm_);
          }
          else if (sendType == Details::DISTRIBUTOR_SSEND) {
            ssend<int, Packet> (tmpSend.getRawPtr (),
                                as<int> (tmpSend.size ()),
                                imagesTo_[p], tag, *comm_);
          } else {
            TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Tpetra::"
              "Distributor (3 args): Invalid send type.  We should never get "
              "here.  Please report this bug to the Tpetra developers.");
          }

          if (debug_) {
            std::ostringstream os;
            os << myImageID << ": doPosts(3,fast): "
               << "Posted send to Proc " << imagesTo_[p]
               << " w/ specified tag " << tag << endl;
            *out_ << os.str ();
          }
        }
        else { // "Sending" the message to myself
          selfNum = p;
        }
      }

      if (selfMessage_) {
        // This is how we "send a message to ourself": we copy from
        // the export buffer to the import buffer.  That saves
        // Teuchos::Comm implementations other than MpiComm (in
        // particular, SerialComm) the trouble of implementing self
        // messages correctly.  (To do this right, SerialComm would
        // need internal buffer space for messages, keyed on the
        // message's tag.)
        std::copy (exports.begin()+startsTo_[selfNum]*numPackets,
                   exports.begin()+startsTo_[selfNum]*numPackets+lengthsTo_[selfNum]*numPackets,
                   imports.begin()+selfReceiveOffset);
      }
      if (debug_) {
        std::ostringstream os;
        os << myImageID << ": doPosts(3,fast) done" << endl;
        *out_ << os.str ();
      }
    }
    else { // data are not blocked by image, use send buffer
      if (debug_) {
        std::ostringstream os;
        os << myImageID << ": doPosts(3,slow): posting sends" << endl;
        *out_ << os.str ();
      }

      // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking
      // sends), because the buffer is only long enough for one send.
      ArrayRCP<Packet> sendArray (maxSendLength_ * numPackets); // send buffer

      TEUCHOS_TEST_FOR_EXCEPTION(
        sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
        "Tpetra::Distributor::doPosts<" << TypeNameTraits<Packet>::name()
        << "> (3-argument version):" << endl
        << "The \"send buffer\" code path doesn't currently work with nonblocking sends.");

      for (size_t i = 0; i < numBlocks; ++i) {
        size_t p = i + imageIndex;
        if (p > (numBlocks - 1)) {
          p -= numBlocks;
        }

        if (imagesTo_[p] != myImageID) {
          typename ArrayView<const Packet>::iterator srcBegin, srcEnd;
          size_t sendArrayOffset = 0;
          size_t j = startsTo_[p];
          for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
            srcBegin = exports.begin() + indicesTo_[j]*numPackets;
            srcEnd   = srcBegin + numPackets;
            std::copy (srcBegin, srcEnd, sendArray.begin()+sendArrayOffset);
            sendArrayOffset += numPackets;
          }
          ArrayView<const Packet> tmpSend =
            sendArray.view (0, lengthsTo_[p]*numPackets);

          if (sendType == Details::DISTRIBUTOR_SEND) {
            send<int, Packet> (tmpSend.getRawPtr (),
                               as<int> (tmpSend.size ()),
                               imagesTo_[p], tag, *comm_);
          }
          else if (sendType == Details::DISTRIBUTOR_ISEND) {
            ArrayRCP<const Packet> tmpSendBuf =
              sendArray.persistingView (0, lengthsTo_[p] * numPackets);
            requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p],
                                                     tag, *comm_));
          }
          else if (sendType == Details::DISTRIBUTOR_RSEND) {
            readySend<int, Packet> (tmpSend.getRawPtr (),
                                    as<int> (tmpSend.size ()),
                                    imagesTo_[p], tag, *comm_);
          }
          else if (sendType == Details::DISTRIBUTOR_SSEND) {
            ssend<int, Packet> (tmpSend.getRawPtr (),
                                as<int> (tmpSend.size ()),
                                imagesTo_[p], tag, *comm_);
          }
          else {
            TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Tpetra::"
              "Distributor (3 args): Invalid send type.  We should never get "
              "here.  Please report this bug to the Tpetra developers.");
          }

          if (debug_) {
            std::ostringstream os;
            os << myImageID << ": doPosts(3,slow): "
               << "Posted send to Proc " << imagesTo_[p]
               << " w/ specified tag " << tag << endl;
            *out_ << os.str ();
          }
        }
        else { // "Sending" the message to myself
          selfNum = p;
          selfIndex = startsTo_[p];
        }
      }

      if (selfMessage_) {
        for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
          std::copy (exports.begin()+indicesTo_[selfIndex]*numPackets,
                     exports.begin()+indicesTo_[selfIndex]*numPackets + numPackets,
                     imports.begin() + selfReceiveOffset);
          ++selfIndex;
          selfReceiveOffset += numPackets;
        }
      }
      if (debug_) {
        std::ostringstream os;
        os << myImageID << ": doPosts(3,slow) done" << endl;
        *out_ << os.str ();
      }
    }
  }

  template <class Packet>
  void Distributor::
  doPosts (const ArrayRCP<const Packet>& exports,
           const ArrayView<size_t>& numExportPacketsPerLID,
           const ArrayRCP<Packet>& imports,
           const ArrayView<size_t>& numImportPacketsPerLID)
  {
    using Teuchos::Array;
    using Teuchos::as;
    using Teuchos::ireceive;
    using Teuchos::isend;
    using Teuchos::readySend;
    using Teuchos::send;
    using Teuchos::ssend;
    using Teuchos::TypeNameTraits;
#ifdef HAVE_TEUCHOS_DEBUG
    using Teuchos::OSTab;
#endif // HAVE_TEUCHOS_DEBUG
    using std::endl;
    typedef Array<size_t>::size_type size_type;

    Teuchos::OSTab tab (out_);

#ifdef TPETRA_DISTRIBUTOR_TIMERS
    Teuchos::TimeMonitor timeMon (*timer_doPosts4_);
#endif // TPETRA_DISTRIBUTOR_TIMERS

    // Run-time configurable parameters that come from the input
    // ParameterList set by setParameterList().
    const Details::EDistributorSendType sendType = sendType_;
    const bool doBarrier = barrierBetween_;

// #ifdef HAVE_TEUCHOS_DEBUG
//     // Prepare for verbose output, if applicable.
//     Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
//     RCP<Teuchos::FancyOStream> out = this->getOStream ();
//     const bool doPrint = out.get () && (comm_->getRank () == 0) &&
//       includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);

//     if (doPrint) {
//       // Only need one process to print out parameters.
//       *out << "Distributor::doPosts (4 args)" << endl;
//     }
//     // Add one tab level.  We declare this outside the doPrint scopes
//     // so that the tab persists until the end of this method.
//     Teuchos::OSTab tab = this->getOSTab ();
//     if (doPrint) {
//       *out << "Parameters:" << endl;
//       {
//         OSTab tab2 (out);
//         *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
//              << endl << "barrierBetween: " << doBarrier << endl;
//       }
//     }
// #endif // HAVE_TEUCHOS_DEBUG

    TEUCHOS_TEST_FOR_EXCEPTION(sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
      std::logic_error, "Ready send implementation requires a barrier between "
      "posting receives and posting ready sends.  This should have been checked "
      "before.  Please report this bug to the Tpetra developers.");

    const int myImageID = comm_->getRank();
    size_t selfReceiveOffset = 0;

#ifdef HAVE_TEUCHOS_DEBUG
    // Different messages may have different numbers of packets.
    size_t totalNumImportPackets = 0;
    for (int ii = 0; ii < numImportPacketsPerLID.size(); ++ii) {
      totalNumImportPackets += numImportPacketsPerLID[ii];
    }
    TEUCHOS_TEST_FOR_EXCEPTION(as<size_t> (imports.size ()) != totalNumImportPackets,
      std::runtime_error, Teuchos::typeName (*this) << "::doPosts(): The "
      "imports array argument must be large enough to store the imported "
      "data.  imports.size() = " << imports.size() << ", but the total number "
      "of packets is " << totalNumImportPackets << ".");
#endif // HAVE_TEUCHOS_DEBUG

    // MPI tag for nonblocking receives and blocking sends in this
    // method.  Some processes might take the "fast" path
    // (indicesTo_.empty()) and others might take the "slow" path for
    // the same doPosts() call, so the path tag must be the same for
    // both.
    const int pathTag = 1;
    const int tag = this->getTag (pathTag);

    if (debug_) {
      TEUCHOS_TEST_FOR_EXCEPTION(
        requests_.size () != 0, std::logic_error, "doPosts(4): Process "
        << myImageID << ": requests_.size() = " << requests_.size ()
        << " != 0.");
      std::ostringstream os;
      os << myImageID << ": doPosts(4,"
         << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
      *out_ << os.str ();
    }

    // Distributor uses requests_.size() as the number of outstanding
    // nonblocking message requests, so we resize to zero to maintain
    // this invariant.
    //
    // numReceives_ does _not_ include the self message, if there is
    // one.  Here, we do actually send a message to ourselves, so we
    // include any self message in the "actual" number of receives to
    // post.
    //
    // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
    // doesn't (re)allocate its array of requests.  That happens in
    // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
    // demand), or Resize_().
    const size_type actualNumReceives = as<size_type> (numReceives_) +
      as<size_type> (selfMessage_ ? 1 : 0);
    requests_.resize (0);

    // Post the nonblocking receives.  It's common MPI wisdom to post
    // receives before sends.  In MPI terms, this means favoring
    // adding to the "posted queue" (of receive requests) over adding
    // to the "unexpected queue" (of arrived messages not yet matched
    // with a receive).
    {
#ifdef TPETRA_DISTRIBUTOR_TIMERS
      Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4_recvs_);
#endif // TPETRA_DISTRIBUTOR_TIMERS

      size_t curBufferOffset = 0;
      size_t curLIDoffset = 0;
      for (size_type i = 0; i < actualNumReceives; ++i) {
        size_t totalPacketsFrom_i = 0;
        for (size_t j = 0; j < lengthsFrom_[i]; ++j) {
          totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
        }
        curLIDoffset += lengthsFrom_[i];
        if (imagesFrom_[i] != myImageID && totalPacketsFrom_i) {
          // If my process is receiving these packet(s) from another
          // process (not a self-receive), and if there is at least
          // one packet to receive:
          //
          // 1. Set up the persisting view (recvBuf) into the imports
          //    array, given the offset and size (total number of
          //    packets from process imagesFrom_[i]).
          // 2. Start the Irecv and save the resulting request.
          ArrayRCP<Packet> recvBuf =
            imports.persistingView (curBufferOffset, totalPacketsFrom_i);
          requests_.push_back (ireceive<int, Packet> (recvBuf, imagesFrom_[i],
                                                      tag, *comm_));
        }
        else { // Receiving these packet(s) from myself
          selfReceiveOffset = curBufferOffset; // Remember the offset
        }
        curBufferOffset += totalPacketsFrom_i;
      }
    }

    if (doBarrier) {
#ifdef TPETRA_DISTRIBUTOR_TIMERS
      Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4_barrier_);
#endif // TPETRA_DISTRIBUTOR_TIMERS
      // If we are using ready sends (MPI_Rsend) below, we need to do
      // a barrier before we post the ready sends.  This is because a
      // ready send requires that its matching receive has already
      // been posted before the send has been posted.  The only way to
      // guarantee that in this case is to use a barrier.
      comm_->barrier ();
    }

#ifdef TPETRA_DISTRIBUTOR_TIMERS
    Teuchos::TimeMonitor timeMonSends (*timer_doPosts4_sends_);
#endif // TPETRA_DISTRIBUTOR_TIMERS

    // setup arrays containing starting-offsets into exports for each send,
    // and num-packets-to-send for each send.
    Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0);
    size_t maxNumPackets = 0;
    size_t curPKToffset = 0;
    for (size_t pp=0; pp<numSends_; ++pp) {
      sendPacketOffsets[pp] = curPKToffset;
      size_t numPackets = 0;
      for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) {
        numPackets += numExportPacketsPerLID[j];
      }
      if (numPackets > maxNumPackets) maxNumPackets = numPackets;
      packetsPerSend[pp] = numPackets;
      curPKToffset += numPackets;
    }

    // setup scan through imagesTo_ list starting with higher numbered images
    // (should help balance message traffic)
    size_t numBlocks = numSends_ + selfMessage_;
    size_t imageIndex = 0;
    while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) {
      ++imageIndex;
    }
    if (imageIndex == numBlocks) {
      imageIndex = 0;
    }

    size_t selfNum = 0;
    size_t selfIndex = 0;

    if (indicesTo_.empty()) {
      if (debug_) {
        std::ostringstream os;
        os << myImageID << ": doPosts(4,fast): posting sends" << endl;
        *out_ << os.str ();
      }

      // Data are already blocked (laid out) by process, so we don't
      // need a separate send buffer (besides the exports array).
      for (size_t i = 0; i < numBlocks; ++i) {
        size_t p = i + imageIndex;
        if (p > (numBlocks - 1)) {
          p -= numBlocks;
        }

        if (imagesTo_[p] != myImageID && packetsPerSend[p] > 0) {
          ArrayView<const Packet> tmpSend =
            exports.view (sendPacketOffsets[p], packetsPerSend[p]);

          if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first
            send<int, Packet> (tmpSend.getRawPtr (),
                               as<int> (tmpSend.size ()),
                               imagesTo_[p], tag, *comm_);
          }
          else if (sendType == Details::DISTRIBUTOR_RSEND) {
            readySend<int, Packet> (tmpSend.getRawPtr (),
                                    as<int> (tmpSend.size ()),
                                    imagesTo_[p], tag, *comm_);
          }
          else if (sendType == Details::DISTRIBUTOR_ISEND) {
            ArrayRCP<const Packet> tmpSendBuf =
              exports.persistingView (sendPacketOffsets[p], packetsPerSend[p]);
            requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p],
                                                     tag, *comm_));
          }
          else if (sendType == Details::DISTRIBUTOR_SSEND) {
            ssend<int, Packet> (tmpSend.getRawPtr (),
                                as<int> (tmpSend.size ()),
                                imagesTo_[p], tag, *comm_);
          }
          else {
            TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Tpetra::"
              "Distributor (4 args): Invalid send type.  We should never get "
              "here.  Please report this bug to the Tpetra developers.");
          }
        }
        else { // "Sending" the message to myself
          selfNum = p;
        }
      }

      if (selfMessage_) {
        std::copy (exports.begin()+sendPacketOffsets[selfNum],
                   exports.begin()+sendPacketOffsets[selfNum]+packetsPerSend[selfNum],
                   imports.begin()+selfReceiveOffset);
      }
      if (debug_) {
        std::ostringstream os;
        os << myImageID << ": doPosts(4,fast) done" << endl;
        *out_ << os.str ();
      }
    }
    else { // data are not blocked by image, use send buffer
      if (debug_) {
        std::ostringstream os;
        os << myImageID << ": doPosts(4,slow): posting sends" << endl;
        *out_ << os.str ();
      }

      // FIXME (mfh 05 Mar 2013) This may be broken for Isend.
      ArrayRCP<Packet> sendArray (maxNumPackets); // send buffer

      TEUCHOS_TEST_FOR_EXCEPTION(
        sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
        "Tpetra::Distributor::doPosts<" << TypeNameTraits<Packet>::name()
        << "> (4-argument version):" << endl
        << "The \"send buffer\" code path may not necessarily work with nonblocking sends.");

      Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0);
      size_t ioffset = 0;
      for (int j=0; j<numExportPacketsPerLID.size(); ++j) {
        indicesOffsets[j] = ioffset;
        ioffset += numExportPacketsPerLID[j];
      }

      for (size_t i = 0; i < numBlocks; ++i) {
        size_t p = i + imageIndex;
        if (p > (numBlocks - 1)) {
          p -= numBlocks;
        }

        if (imagesTo_[p] != myImageID) {
          typename ArrayView<const Packet>::iterator srcBegin, srcEnd;
          size_t sendArrayOffset = 0;
          size_t j = startsTo_[p];
          size_t numPacketsTo_p = 0;
          for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
            srcBegin = exports.begin() + indicesOffsets[j];
            srcEnd   = srcBegin + numExportPacketsPerLID[j];
            numPacketsTo_p += numExportPacketsPerLID[j];
            std::copy (srcBegin, srcEnd, sendArray.begin()+sendArrayOffset);
            sendArrayOffset += numExportPacketsPerLID[j];
          }
          if (numPacketsTo_p > 0) {
            ArrayView<const Packet> tmpSend =
              sendArray.view (0, numPacketsTo_p);

            if (sendType == Details::DISTRIBUTOR_RSEND) {
              readySend<int, Packet> (tmpSend.getRawPtr (),
                                      as<int> (tmpSend.size ()),
                                      imagesTo_[p], tag, *comm_);
            }
            else if (sendType == Details::DISTRIBUTOR_ISEND) {
              ArrayRCP<const Packet> tmpSendBuf =
                sendArray.persistingView (0, numPacketsTo_p);
              requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p],
                                                       tag, *comm_));
            }
            else if (sendType == Details::DISTRIBUTOR_SSEND) {
              ssend<int, Packet> (tmpSend.getRawPtr (),
                                  as<int> (tmpSend.size ()),
                                  imagesTo_[p], tag, *comm_);
            }
            else { // DISTRIBUTOR_SEND is the only remaining case
              send<int, Packet> (tmpSend.getRawPtr (),
                                 as<int> (tmpSend.size ()),
                                 imagesTo_[p], tag, *comm_);
            }
          }
        }
        else { // "Sending" the message to myself
          selfNum = p;
          selfIndex = startsTo_[p];
        }
      }

      if (selfMessage_) {
        for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
          std::copy (exports.begin()+indicesOffsets[selfIndex],
                     exports.begin()+indicesOffsets[selfIndex]+numExportPacketsPerLID[selfIndex],
                     imports.begin() + selfReceiveOffset);
          selfReceiveOffset += numExportPacketsPerLID[selfIndex];
          ++selfIndex;
        }
      }
      if (debug_) {
        std::ostringstream os;
        os << myImageID << ": doPosts(4,slow) done" << endl;
        *out_ << os.str ();
      }
    }
  }

  template <class Packet>
  void Distributor::
  doReversePostsAndWaits (const ArrayView<const Packet>& exports,
                          size_t numPackets,
                          const ArrayView<Packet>& imports)
  {
    using Teuchos::as;

    // doReversePosts() takes exports and imports as ArrayRCPs,
    // requiring that the memory persist.  However, it need only
    // persist until the doReverseWaits() call below completes, so it
    // is safe for us to use nonpersisting references in this case.

    // mfh 04 Apr 2012: For some reason, calling arcp<const Packet>
    // for Packet=std::complex<T> (e.g., T=float) fails to compile
    // with some versions of GCC.  The issue only arises with the
    // exports array.  This is why we construct a separate nonowning
    // ArrayRCP.
    typedef typename ArrayRCP<const Packet>::size_type size_type;
    ArrayRCP<const Packet> exportsArcp (exports.getRawPtr(), as<size_type> (0),
                                        exports.size(), false);
    // mfh 04 Apr 2012: This is the offending code.  This statement
    // would normally be in place of "exportsArcp" in the
    // doReversePosts() call below.
    //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false)
    doReversePosts (exportsArcp,
                    numPackets,
                    arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false));
    doReverseWaits ();
  }

  template <class Packet>
  void Distributor::
  doReversePostsAndWaits (const ArrayView<const Packet>& exports,
                          const ArrayView<size_t> &numExportPacketsPerLID,
                          const ArrayView<Packet> &imports,
                          const ArrayView<size_t> &numImportPacketsPerLID)
  {
    using Teuchos::as;
    using Teuchos::arcp;
    using Teuchos::ArrayRCP;

    TEUCHOS_TEST_FOR_EXCEPTION(requests_.size() != 0, std::runtime_error,
      Teuchos::typeName(*this) << "::doReversePostsAndWaits(): There are "
      << requests_.size() << " outstanding nonblocking messages pending.  It is "
      "incorrect to call doReversePostsAndWaits with posts outstanding.");

    // doReversePosts() accepts the exports and imports arrays as
    // ArrayRCPs, requiring that the memory persist (as is necessary
    // for nonblocking receives).  However, it need only persist until
    // doReverseWaits() completes, so it is safe for us to use a
    // nonpersisting reference in this case.  The use of a
    // nonpersisting reference is purely a performance optimization.

    // mfh 02 Apr 2012: For some reason, calling arcp<const Packet>
    // for Packet=std::complex<double> fails to compile with some
    // versions of GCC.  The issue only arises with the exports array.
    // This is why we construct a separate nonowning ArrayRCP.
    typedef typename ArrayRCP<const Packet>::size_type size_type;
    ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (), as<size_type> (0),
                                        exports.size (), false);
    doReversePosts (exportsArcp,
                    numExportPacketsPerLID,
                    arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false),
                    numImportPacketsPerLID);
    doReverseWaits ();
  }

  template <class Packet>
  void Distributor::
  doReversePosts (const ArrayRCP<const Packet>& exports,
                  size_t numPackets,
                  const ArrayRCP<Packet>& imports)
  {
    // FIXME (mfh 29 Mar 2012) WHY?
    TEUCHOS_TEST_FOR_EXCEPTION(! indicesTo_.empty (), std::runtime_error,
      Teuchos::typeName (*this) << "::doReversePosts(): Can only do reverse "
      "communication when original data are blocked by process.");
    if (reverseDistributor_.is_null ()) {
      createReverseDistributor ();
    }
    reverseDistributor_->doPosts (exports, numPackets, imports);
  }

  template <class Packet>
  void Distributor::
  doReversePosts (const ArrayRCP<const Packet>& exports,
                  const ArrayView<size_t>& numExportPacketsPerLID,
                  const ArrayRCP<Packet>& imports,
                  const ArrayView<size_t>& numImportPacketsPerLID)
  {
    // FIXME (mfh 29 Mar 2012) WHY?
    TEUCHOS_TEST_FOR_EXCEPTION(! indicesTo_.empty (), std::runtime_error,
      Teuchos::typeName (*this) << "::doReversePosts(): Can only do reverse "
      "communication when original data are blocked by process.");
    if (reverseDistributor_.is_null ()) {
      createReverseDistributor ();
    }
    reverseDistributor_->doPosts (exports, numExportPacketsPerLID,
                                  imports, numImportPacketsPerLID);
  }

#if TPETRA_USE_KOKKOS_DISTOBJECT || defined(TPETRA_HAVE_KOKKOS_REFACTOR)

  template <class Packet, class Layout, class Device, class Mem>
  void Distributor::
  doPostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
                   size_t numPackets,
                   const Kokkos::View<Packet*, Layout, Device, Mem> &imports)
  {
01921     // If the MPI library doesn't support RDMA for communication
01922     // directly to/from GPU memory, stage the exports on the host, do
01923     // the communication there, then copy the imports back to the GPU.
01924     //
01925     // We need to do this here instead of doPosts() because the copy back
01926     // to the GPU must occur after the waits.
01927     typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view;
01928     typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view;
01929     if (!enable_cuda_rdma_ && !exports_view::is_hostspace) {
01930       typename exports_view::HostMirror host_exports =
01931         Kokkos::create_mirror_view(exports);
01932       typename imports_view::HostMirror host_imports =
01933         Kokkos::create_mirror_view(imports);
01934       Kokkos::deep_copy (host_exports, exports);
01935       doPostsAndWaits(Kokkos::Compat::create_const_view(host_exports),
01936                       numPackets,
01937                       host_imports);
01938       Kokkos::deep_copy (imports, host_imports);
01939       return;
01940     }
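    // The fallback above is the standard Kokkos host-mirror staging
    // idiom; a minimal sketch, for a hypothetical device View d:
    //
    //   typename view_type::HostMirror h =
    //     Kokkos::create_mirror_view (d); // host View; allocates only if d is not host-accessible
    //   Kokkos::deep_copy (h, d);         // device -> host
    //   // ... communicate using h on the host ...
    //   Kokkos::deep_copy (d, h);         // host -> device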
01941 
01942     TEUCHOS_TEST_FOR_EXCEPTION(requests_.size() != 0, std::runtime_error,
01943       Teuchos::typeName(*this) << "::doPostsAndWaits(): There are "
01944       << requests_.size() << " outstanding nonblocking messages.  It is "
01945       "incorrect to call doPostsAndWaits with posts outstanding.");
01946 
01947     doPosts (exports, numPackets, imports);
01948     doWaits();
01949   }
01950 
01951   template <class Packet, class Layout, class Device, class Mem>
01952   void Distributor::
01953   doPostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
01954                    const ArrayView<size_t> &numExportPacketsPerLID,
01955                    const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
01956                    const ArrayView<size_t> &numImportPacketsPerLID)
01957   {
01958     // If the MPI library doesn't support RDMA for communication
01959     // directly to/from GPU memory, stage the exports on the host, do
01960     // the communication there, then copy the imports back to the GPU.
01961     //
01962     // We need to do this here instead of doPosts() because the copy back
01963     // to the GPU must occur after the waits.
01964     typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view;
01965     typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view;
01966     if (!enable_cuda_rdma_ && !exports_view::is_hostspace) {
01967       typename exports_view::HostMirror host_exports =
01968         Kokkos::create_mirror_view(exports);
01969       typename imports_view::HostMirror host_imports =
01970         Kokkos::create_mirror_view(imports);
01971       Kokkos::deep_copy (host_exports, exports);
01972       doPostsAndWaits(Kokkos::Compat::create_const_view(host_exports),
01973                       numExportPacketsPerLID,
01974                       host_imports,
01975                       numImportPacketsPerLID);
01976       Kokkos::deep_copy (imports, host_imports);
01977       return;
01978     }
01979 
01980     TEUCHOS_TEST_FOR_EXCEPTION(requests_.size() != 0, std::runtime_error,
01981       Teuchos::typeName(*this) << "::doPostsAndWaits(): There are "
01982       << requests_.size() << " outstanding nonblocking messages.  It is "
01983       "incorrect to call doPostsAndWaits with posts outstanding.");
01984 
01985     doPosts (exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
01986     doWaits();
01987   }
01988 
01989 
01990   template <class Packet, class Layout, class Device, class Mem>
01991   void Distributor::
01992   doPosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
01993            size_t numPackets,
01994            const Kokkos::View<Packet*, Layout, Device, Mem> &imports)
01995   {
01996     using Teuchos::Array;
01997     using Teuchos::as;
01998     using Teuchos::FancyOStream;
01999     using Teuchos::includesVerbLevel;
02000     using Teuchos::ireceive;
02001     using Teuchos::isend;
02002     using Teuchos::OSTab;
02003     using Teuchos::readySend;
02004     using Teuchos::send;
02005     using Teuchos::ssend;
02006     using Teuchos::TypeNameTraits;
02007     using Teuchos::typeName;
02008     using std::endl;
02009     using Kokkos::Compat::create_const_view;
02010     using Kokkos::Compat::create_view;
02011     using Kokkos::Compat::subview_offset;
02012     using Kokkos::Compat::deep_copy_offset;
02013     typedef Array<size_t>::size_type size_type;
02014     typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view_type;
02015     typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view_type;
02016 
02017     Teuchos::OSTab tab (out_);
02018 
02019 #ifdef TPETRA_DISTRIBUTOR_TIMERS
02020     Teuchos::TimeMonitor timeMon (*timer_doPosts3_);
02021 #endif // TPETRA_DISTRIBUTOR_TIMERS
02022 
02023     // Run-time configurable parameters that come from the input
02024     // ParameterList set by setParameterList().
02025     const Details::EDistributorSendType sendType = sendType_;
02026     const bool doBarrier = barrierBetween_;
02027 
02028 // #ifdef HAVE_TEUCHOS_DEBUG
02029 //     // Prepare for verbose output, if applicable.
02030 //     Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
02031 //     (void) verbLevel; // Silence "unused variable" compiler warning.
02032 //     RCP<FancyOStream> out = this->getOStream ();
02033 //     // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
02034 //     //   includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
02035 //     const bool doPrint = out.get () && (comm_->getRank () == 0);
02036 
02037 //     if (doPrint) {
02038 //       // Only need one process to print out parameters.
02039 //       *out << "Distributor::doPosts (3 args)" << endl;
02040 //     }
02041 //     // Add one tab level.  We declare this outside the doPrint scopes
02042 //     // so that the tab persists until the end of this method.
02043 //     OSTab tab = this->getOSTab ();
02044 //     if (doPrint) {
02045 //       *out << "Parameters:" << endl;
02046 //       {
02047 //         OSTab tab2 (out);
02048 //         *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
02049 //              << endl << "barrierBetween: " << doBarrier << endl;
02050 //       }
02051 //     }
02052 // #endif // HAVE_TEUCHOS_DEBUG
02053 
02054     TEUCHOS_TEST_FOR_EXCEPTION(
02055       sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
02056       std::logic_error, "Ready send implementation requires a barrier between "
02057       "posting receives and posting ready sends.  This should have been checked "
02058       "before.  Please report this bug to the Tpetra developers.");
02059 
02060     const int myImageID = comm_->getRank();
02061     size_t selfReceiveOffset = 0;
02062 
02063 #ifdef HAVE_TEUCHOS_DEBUG
02064     // Each message has the same number of packets.
02065     const size_t totalNumImportPackets = totalReceiveLength_ * numPackets;
02066     TEUCHOS_TEST_FOR_EXCEPTION(as<size_t> (imports.size ()) != totalNumImportPackets,
02067       std::runtime_error, typeName (*this) << "::doPosts(): imports must be "
02068       "large enough to store the imported data.  imports.size() = "
02069       << imports.size() << ", but total number of import packets = "
02070       << totalNumImportPackets << ".");
02071 #endif // HAVE_TEUCHOS_DEBUG
02072 
02073     // MPI tag for nonblocking receives and blocking sends in this
02074     // method.  Some processes might take the "fast" path
02075     // (indicesTo_.empty()) and others might take the "slow" path for
02076     // the same doPosts() call, so the path tag must be the same for
02077     // both.
02078     const int pathTag = 0;
02079     const int tag = this->getTag (pathTag);
02080 
02081     if (debug_) {
02082       TEUCHOS_TEST_FOR_EXCEPTION(
02083         requests_.size () != 0, std::logic_error, "doPosts(3): Process "
02084         << myImageID << ": requests_.size() = " << requests_.size ()
02085         << " != 0.");
02086       std::ostringstream os;
02087       os << myImageID << ": doPosts(3,"
02088          << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
02089       *out_ << os.str ();
02090     }
02091 
02092     // Distributor uses requests_.size() as the number of outstanding
02093     // nonblocking message requests, so we resize to zero to maintain
02094     // this invariant.
02095     //
02096     // numReceives_ does _not_ include the self message, if there is
02097     // one.  Here, we do actually send a message to ourselves, so we
02098     // include any self message in the "actual" number of receives to
02099     // post.
02100     //
02101     // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
02102     // doesn't (re)allocate its array of requests.  That happens in
02103     // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
02104     // demand), or Resize_().
02105     const size_type actualNumReceives = as<size_type> (numReceives_) +
02106       as<size_type> (selfMessage_ ? 1 : 0);
02107     requests_.resize (0);
02108 
02109     // Post the nonblocking receives.  It's common MPI wisdom to post
02110     // receives before sends.  In MPI terms, this means favoring
02111     // adding to the "posted queue" (of receive requests) over adding
02112     // to the "unexpected queue" (of arrived messages not yet matched
02113     // with a receive).
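    // In bare Teuchos terms, the pattern is (a sketch with hypothetical
    // buffers recvBuf (an ArrayRCP) and sendBuf (an Array), and ranks
    // srcRank and dstRank):
    //
    //   Teuchos::Array<Teuchos::RCP<Teuchos::CommRequest<int> > > reqs;
    //   reqs.push_back (Teuchos::ireceive<int, double> (recvBuf, srcRank, tag, *comm_));
    //   Teuchos::send<int, double> (sendBuf.getRawPtr (),
    //                               Teuchos::as<int> (sendBuf.size ()),
    //                               dstRank, tag, *comm_);
    //   Teuchos::waitAll (*comm_, reqs ());
    //
    // Every ireceive is posted before any send, so incoming messages
    // match posted receives rather than landing in the unexpected queue.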
02114     {
02115 #ifdef TPETRA_DISTRIBUTOR_TIMERS
02116       Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3_recvs_);
02117 #endif // TPETRA_DISTRIBUTOR_TIMERS
02118 
02119       size_t curBufferOffset = 0;
02120       for (size_type i = 0; i < actualNumReceives; ++i) {
02121         if (imagesFrom_[i] != myImageID) {
02122           // If my process is receiving these packet(s) from another
02123           // process (not a self-receive):
02124           //
02125           // 1. Set up the persisting view (recvBuf) of the imports
02126           //    array, given the offset and size (total number of
02127           //    packets from process imagesFrom_[i]).
02128           // 2. Start the Irecv and save the resulting request.
02129           imports_view_type recvBuf =
02130             subview_offset (imports, curBufferOffset, lengthsFrom_[i]*numPackets);
02131           requests_.push_back (ireceive<int> (recvBuf, imagesFrom_[i],
02132                                               tag, *comm_));
02133           if (debug_) {
02134             std::ostringstream os;
02135             os << myImageID << ": doPosts(3,"
02136                << (indicesTo_.empty () ? "fast" : "slow") << "): "
02137                << "Posted irecv from Proc " << imagesFrom_[i] << " with "
02138               "specified tag " << tag << endl;
02139             *out_ << os.str ();
02140           }
02141         }
02142         else { // Receiving from myself
02143           selfReceiveOffset = curBufferOffset; // Remember the self-recv offset
02144         }
02145         curBufferOffset += lengthsFrom_[i]*numPackets;
02146       }
02147     }
02148 
02149     if (doBarrier) {
02150 #ifdef TPETRA_DISTRIBUTOR_TIMERS
02151       Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3_barrier_);
02152 #endif // TPETRA_DISTRIBUTOR_TIMERS
02153       // If we are using ready sends (MPI_Rsend) below, we need to do
02154       // a barrier before we post the ready sends.  This is because a
02155       // ready send requires that its matching receive has already
02156       // been posted before the send has been posted.  The only way to
02157       // guarantee that in this case is to use a barrier.
02158       comm_->barrier ();
02159     }
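    // The ready-send contract, in raw MPI terms (a sketch):
    //
    //   MPI_Irecv (...);    // every process posts its receives first
    //   MPI_Barrier (comm); // all matching receives are now posted
    //   MPI_Rsend (...);    // only now are ready sends legal
    //
    // An MPI_Rsend whose matching receive has not yet been posted is
    // erroneous, which is why skipping the barrier is only safe for
    // the other send types.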
02160 
02161 #ifdef TPETRA_DISTRIBUTOR_TIMERS
02162     Teuchos::TimeMonitor timeMonSends (*timer_doPosts3_sends_);
02163 #endif // TPETRA_DISTRIBUTOR_TIMERS
02164 
02165     // Set up a scan through the imagesTo_ list, starting with the
02166     // higher-numbered images (this should help balance message traffic).
02167     //
02168     // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
02169     // It doesn't depend on the input at all.
02170     size_t numBlocks = numSends_ + selfMessage_;
02171     size_t imageIndex = 0;
02172     while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) {
02173       ++imageIndex;
02174     }
02175     if (imageIndex == numBlocks) {
02176       imageIndex = 0;
02177     }
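    // This makes the loop below a circular traversal: it visits block
    // p = (i + imageIndex) % numBlocks for i = 0, ..., numBlocks-1.
    // For example, with numBlocks = 4 and myImageID = 2 (so imageIndex
    // points at the first block for an image >= 2), the blocks are
    // visited in the order 2, 3, 0, 1, so not every process starts by
    // sending to the lowest-ranked target.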
02178 
02179     size_t selfNum = 0;
02180     size_t selfIndex = 0;
02181 
02182     if (indicesTo_.empty()) {
02183       if (debug_) {
02184         std::ostringstream os;
02185         os << myImageID << ": doPosts(3,fast): posting sends" << endl;
02186         *out_ << os.str ();
02187       }
02188 
02189       // Data are already blocked (laid out) by process, so we don't
02190       // need a separate send buffer (besides the exports array).
02191       for (size_t i = 0; i < numBlocks; ++i) {
02192         size_t p = i + imageIndex;
02193         if (p > (numBlocks - 1)) {
02194           p -= numBlocks;
02195         }
02196 
02197         if (imagesTo_[p] != myImageID) {
02198           exports_view_type tmpSend = subview_offset(
02199             exports, startsTo_[p]*numPackets, lengthsTo_[p]*numPackets);
02200 
02201           if (sendType == Details::DISTRIBUTOR_SEND) {
02202             send<int> (tmpSend,
02203                        as<int> (tmpSend.size ()),
02204                        imagesTo_[p], tag, *comm_);
02205           }
02206           else if (sendType == Details::DISTRIBUTOR_ISEND) {
02207             exports_view_type tmpSendBuf =
02208               subview_offset (exports, startsTo_[p] * numPackets,
02209                               lengthsTo_[p] * numPackets);
02210             requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p],
02211                                              tag, *comm_));
02212           }
02213           else if (sendType == Details::DISTRIBUTOR_RSEND) {
02214             readySend<int> (tmpSend,
02215                             as<int> (tmpSend.size ()),
02216                             imagesTo_[p], tag, *comm_);
02217           }
02218           else if (sendType == Details::DISTRIBUTOR_SSEND) {
02219             ssend<int> (tmpSend,
02220                         as<int> (tmpSend.size ()),
02221                         imagesTo_[p], tag, *comm_);
02222           } else {
02223             TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Tpetra::"
02224               "Distributor (3 args): Invalid send type.  We should never get "
02225               "here.  Please report this bug to the Tpetra developers.");
02226           }
02227 
02228           if (debug_) {
02229             std::ostringstream os;
02230             os << myImageID << ": doPosts(3,fast): "
02231                << "Posted send to Proc " << imagesTo_[i]
02232                << " w/ specified tag " << tag << endl;
02233             *out_ << os.str ();
02234           }
02235         }
02236         else { // "Sending" the message to myself
02237           selfNum = p;
02238         }
02239       }
02240 
02241       if (selfMessage_) {
02242         // This is how we "send a message to ourself": we copy from
02243         // the export buffer to the import buffer.  That saves
02244         // Teuchos::Comm implementations other than MpiComm (in
02245         // particular, SerialComm) the trouble of implementing self
02246         // messages correctly.  (To do this right, SerialComm would
02247         // need internal buffer space for messages, keyed on the
02248         // message's tag.)
02249         deep_copy_offset(imports, exports, selfReceiveOffset,
02250                          startsTo_[selfNum]*numPackets,
02251                          lengthsTo_[selfNum]*numPackets);
02252       }
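      // deep_copy_offset above has, in effect, these semantics (a
      // sketch of what it does, not of its implementation):
      //
      //   for (size_t k = 0; k < lengthsTo_[selfNum]*numPackets; ++k) {
      //     imports[selfReceiveOffset + k] =
      //       exports[startsTo_[selfNum]*numPackets + k];
      //   }
      //
      // that is, a contiguous copy from this process' own export block
      // into the import slot reserved for the self-receive.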
02253       if (debug_) {
02254         std::ostringstream os;
02255         os << myImageID << ": doPosts(3,fast) done" << endl;
02256         *out_ << os.str ();
02257       }
02258     }
02259     else { // data are not blocked by image, use send buffer
02260       if (debug_) {
02261         std::ostringstream os;
02262         os << myImageID << ": doPosts(3,slow): posting sends" << endl;
02263         *out_ << os.str ();
02264       }
02265 
02266       // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking
02267       // sends), because the buffer is only long enough for one send.
02268       Kokkos::View<Packet*,Layout,Device,Mem> sendArray =
02269         create_view<Packet,Layout,Device,Mem>("sendArray", maxSendLength_ * numPackets); // send buffer
02270 
02271       TEUCHOS_TEST_FOR_EXCEPTION(
02272         sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
02273         "Tpetra::Distributor::doPosts<" << TypeNameTraits<Packet>::name()
02274         << "> (3-argument version):" << endl
02275         << "The \"send buffer\" code path doesn't currently work with nonblocking sends.");
02276 
02277       for (size_t i = 0; i < numBlocks; ++i) {
02278         size_t p = i + imageIndex;
02279         if (p > (numBlocks - 1)) {
02280           p -= numBlocks;
02281         }
02282 
02283         if (imagesTo_[p] != myImageID) {
02284           size_t sendArrayOffset = 0;
02285           size_t j = startsTo_[p];
02286           for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
02287             deep_copy_offset(sendArray, exports, sendArrayOffset,
02288                              indicesTo_[j]*numPackets, numPackets);
02289             sendArrayOffset += numPackets;
02290           }
02291           Kokkos::View<Packet*, Layout, Device, Mem> tmpSend =
02292             subview_offset(sendArray, size_t(0), lengthsTo_[p]*numPackets);
02293 
02294           if (sendType == Details::DISTRIBUTOR_SEND) {
02295             send<int> (tmpSend,
02296                        as<int> (tmpSend.size ()),
02297                        imagesTo_[p], tag, *comm_);
02298           }
02299           else if (sendType == Details::DISTRIBUTOR_ISEND) {
02300             exports_view_type tmpSendBuf =
02301               subview_offset (sendArray, size_t(0), lengthsTo_[p] * numPackets);
02302             requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p],
02303                                              tag, *comm_));
02304           }
02305           else if (sendType == Details::DISTRIBUTOR_RSEND) {
02306             readySend<int> (tmpSend,
02307                             as<int> (tmpSend.size ()),
02308                             imagesTo_[p], tag, *comm_);
02309           }
02310           else if (sendType == Details::DISTRIBUTOR_SSEND) {
02311             ssend<int> (tmpSend,
02312                         as<int> (tmpSend.size ()),
02313                         imagesTo_[p], tag, *comm_);
02314           }
02315           else {
02316             TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Tpetra::"
02317               "Distributor (3 args): Invalid send type.  We should never get "
02318               "here.  Please report this bug to the Tpetra developers.");
02319           }
02320 
02321           if (debug_) {
02322             std::ostringstream os;
02323             os << myImageID << ": doPosts(3,slow): "
02324                << "Posted send to Proc " << imagesTo_[i]
02325                << " w/ specified tag " << tag << endl;
02326             *out_ << os.str ();
02327           }
02328         }
02329         else { // "Sending" the message to myself
02330           selfNum = p;
02331           selfIndex = startsTo_[p];
02332         }
02333       }
02334 
02335       if (selfMessage_) {
02336         for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
02337           deep_copy_offset(imports, exports, selfReceiveOffset,
02338                            indicesTo_[selfIndex]*numPackets, numPackets);
02339           ++selfIndex;
02340           selfReceiveOffset += numPackets;
02341         }
02342       }
02343       if (debug_) {
02344         std::ostringstream os;
02345         os << myImageID << ": doPosts(3,slow) done" << endl;
02346         *out_ << os.str ();
02347       }
02348     }
02349   }
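
  // Buffer layout for the 3-argument (constant numPackets) case, as a
  // worked example: with numPackets = 2 and lengthsTo_ = {3, 1} for
  // two remote targets, exports holds 3*2 = 6 packets for the first
  // target followed by 1*2 = 2 packets for the second, and
  // startsTo_[p]*numPackets is the packet offset of block p.  The
  // imports array is laid out the same way, using lengthsFrom_.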
02350 
02351   template <class Packet, class Layout, class Device, class Mem>
02352   void Distributor::
02353   doPosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
02354            const ArrayView<size_t> &numExportPacketsPerLID,
02355            const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
02356            const ArrayView<size_t> &numImportPacketsPerLID)
02357   {
02358     using Teuchos::Array;
02359     using Teuchos::as;
02360     using Teuchos::ireceive;
02361     using Teuchos::isend;
02362     using Teuchos::readySend;
02363     using Teuchos::send;
02364     using Teuchos::ssend;
02365     using Teuchos::TypeNameTraits;
02366 #ifdef HAVE_TEUCHOS_DEBUG
02367     using Teuchos::OSTab;
02368 #endif // HAVE_TEUCHOS_DEBUG
02369     using std::endl;
02370     using Kokkos::Compat::create_const_view;
02371     using Kokkos::Compat::create_view;
02372     using Kokkos::Compat::subview_offset;
02373     using Kokkos::Compat::deep_copy_offset;
02374     typedef Array<size_t>::size_type size_type;
02375     typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view_type;
02376     typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view_type;
02377 
02378     Teuchos::OSTab tab (out_);
02379 
02380 #ifdef TPETRA_DISTRIBUTOR_TIMERS
02381     Teuchos::TimeMonitor timeMon (*timer_doPosts4_);
02382 #endif // TPETRA_DISTRIBUTOR_TIMERS
02383 
02384     // Run-time configurable parameters that come from the input
02385     // ParameterList set by setParameterList().
02386     const Details::EDistributorSendType sendType = sendType_;
02387     const bool doBarrier = barrierBetween_;
02388 
02389 // #ifdef HAVE_TEUCHOS_DEBUG
02390 //     // Prepare for verbose output, if applicable.
02391 //     Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
02392 //     RCP<Teuchos::FancyOStream> out = this->getOStream ();
02393 //     const bool doPrint = out.get () && (comm_->getRank () == 0) &&
02394 //       includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
02395 
02396 //     if (doPrint) {
02397 //       // Only need one process to print out parameters.
02398 //       *out << "Distributor::doPosts (4 args)" << endl;
02399 //     }
02400 //     // Add one tab level.  We declare this outside the doPrint scopes
02401 //     // so that the tab persists until the end of this method.
02402 //     Teuchos::OSTab tab = this->getOSTab ();
02403 //     if (doPrint) {
02404 //       *out << "Parameters:" << endl;
02405 //       {
02406 //         OSTab tab2 (out);
02407 //         *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
02408 //              << endl << "barrierBetween: " << doBarrier << endl;
02409 //       }
02410 //     }
02411 // #endif // HAVE_TEUCHOS_DEBUG
02412 
02413     TEUCHOS_TEST_FOR_EXCEPTION(sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
02414       std::logic_error, "Ready send implementation requires a barrier between "
02415       "posting receives and posting ready sends.  This should have been checked "
02416       "before.  Please report this bug to the Tpetra developers.");
02417 
02418     const int myImageID = comm_->getRank();
02419     size_t selfReceiveOffset = 0;
02420 
02421 #ifdef HAVE_TEUCHOS_DEBUG
02422     // Different messages may have different numbers of packets.
02423     size_t totalNumImportPackets = 0;
02424     for (int ii = 0; ii < numImportPacketsPerLID.size(); ++ii) {
02425       totalNumImportPackets += numImportPacketsPerLID[ii];
02426     }
02427     TEUCHOS_TEST_FOR_EXCEPTION(as<size_t> (imports.size ()) != totalNumImportPackets,
02428       std::runtime_error, Teuchos::typeName (*this) << "::doPosts(): The "
02429       "imports array argument must be large enough to store the imported "
02430       "data.  imports.size() = " << imports.size() << ", but the total number "
02431       "of packets is " << totalNumImportPackets << ".");
02432 #endif // HAVE_TEUCHOS_DEBUG
02433 
02434     // MPI tag for nonblocking receives and blocking sends in this
02435     // method.  Some processes might take the "fast" path
02436     // (indicesTo_.empty()) and others might take the "slow" path for
02437     // the same doPosts() call, so the path tag must be the same for
02438     // both.
02439     const int pathTag = 1;
02440     const int tag = this->getTag (pathTag);
02441 
02442     if (debug_) {
02443       TEUCHOS_TEST_FOR_EXCEPTION(
02444         requests_.size () != 0, std::logic_error, "doPosts(4): Process "
02445         << myImageID << ": requests_.size() = " << requests_.size ()
02446         << " != 0.");
02447       std::ostringstream os;
02448       os << myImageID << ": doPosts(4,"
02449          << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
02450       *out_ << os.str ();
02451     }
02452 
02453     // Distributor uses requests_.size() as the number of outstanding
02454     // nonblocking message requests, so we resize to zero to maintain
02455     // this invariant.
02456     //
02457     // numReceives_ does _not_ include the self message, if there is
02458     // one.  Here, we do actually send a message to ourselves, so we
02459     // include any self message in the "actual" number of receives to
02460     // post.
02461     //
02462     // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
02463     // doesn't (re)allocate its array of requests.  That happens in
02464     // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
02465     // demand), or Resize_().
02466     const size_type actualNumReceives = as<size_type> (numReceives_) +
02467       as<size_type> (selfMessage_ ? 1 : 0);
02468     requests_.resize (0);
02469 
02470     // Post the nonblocking receives.  It's common MPI wisdom to post
02471     // receives before sends.  In MPI terms, this means favoring
02472     // adding to the "posted queue" (of receive requests) over adding
02473     // to the "unexpected queue" (of arrived messages not yet matched
02474     // with a receive).
02475     {
02476 #ifdef TPETRA_DISTRIBUTOR_TIMERS
02477       Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4_recvs_);
02478 #endif // TPETRA_DISTRIBUTOR_TIMERS
02479 
02480       size_t curBufferOffset = 0;
02481       size_t curLIDoffset = 0;
02482       for (size_type i = 0; i < actualNumReceives; ++i) {
02483         size_t totalPacketsFrom_i = 0;
02484         for (size_t j = 0; j < lengthsFrom_[i]; ++j) {
02485           totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
02486         }
02487         curLIDoffset += lengthsFrom_[i];
02488         if (imagesFrom_[i] != myImageID && totalPacketsFrom_i) {
02489           // If my process is receiving these packet(s) from another
02490           // process (not a self-receive), and if there is at least
02491           // one packet to receive:
02492           //
02493           // 1. Set up the persisting view (recvBuf) into the imports
02494           //    array, given the offset and size (total number of
02495           //    packets from process imagesFrom_[i]).
02496           // 2. Start the Irecv and save the resulting request.
02497           imports_view_type recvBuf =
02498             subview_offset (imports, curBufferOffset, totalPacketsFrom_i);
02499           requests_.push_back (ireceive<int> (recvBuf, imagesFrom_[i],
02500                                               tag, *comm_));
02501         }
02502         else { // Receiving these packet(s) from myself
02503           selfReceiveOffset = curBufferOffset; // Remember the offset
02504         }
02505         curBufferOffset += totalPacketsFrom_i;
02506       }
02507     }
02508 
02509     if (doBarrier) {
02510 #ifdef TPETRA_DISTRIBUTOR_TIMERS
02511       Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4_barrier_);
02512 #endif // TPETRA_DISTRIBUTOR_TIMERS
02513       // If we are using ready sends (MPI_Rsend) below, we need to do
02514       // a barrier before we post the ready sends.  This is because a
02515       // ready send requires that its matching receive has already
02516       // been posted before the send has been posted.  The only way to
02517       // guarantee that in this case is to use a barrier.
02518       comm_->barrier ();
02519     }
02520 
02521 #ifdef TPETRA_DISTRIBUTOR_TIMERS
02522     Teuchos::TimeMonitor timeMonSends (*timer_doPosts4_sends_);
02523 #endif // TPETRA_DISTRIBUTOR_TIMERS
02524 
02525     // Set up arrays containing the starting offset into exports and the
02526     // number of packets to send, for each send.
02527     Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0);
02528     size_t maxNumPackets = 0;
02529     size_t curPKToffset = 0;
02530     for (size_t pp=0; pp<numSends_; ++pp) {
02531       sendPacketOffsets[pp] = curPKToffset;
02532       size_t numPackets = 0;
02533       for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) {
02534         numPackets += numExportPacketsPerLID[j];
02535       }
02536       if (numPackets > maxNumPackets) maxNumPackets = numPackets;
02537       packetsPerSend[pp] = numPackets;
02538       curPKToffset += numPackets;
02539     }
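    // Worked example: suppose numSends_ = 2, startsTo_ = {0, 3},
    // lengthsTo_ = {3, 2}, and numExportPacketsPerLID = {2, 0, 1, 4, 4}.
    // Then packetsPerSend = {2+0+1, 4+4} = {3, 8},
    // sendPacketOffsets = {0, 3}, and maxNumPackets = 8.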
02540 
02541     // Set up a scan through the imagesTo_ list, starting with the
02542     // higher-numbered images (this should help balance message traffic).
02543     size_t numBlocks = numSends_+ selfMessage_;
02544     size_t imageIndex = 0;
02545     while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) {
02546       ++imageIndex;
02547     }
02548     if (imageIndex == numBlocks) {
02549       imageIndex = 0;
02550     }
02551 
02552     size_t selfNum = 0;
02553     size_t selfIndex = 0;
02554 
02555     if (indicesTo_.empty()) {
02556       if (debug_) {
02557         std::ostringstream os;
02558         os << myImageID << ": doPosts(4,fast): posting sends" << endl;
02559         *out_ << os.str ();
02560       }
02561 
02562       // Data are already blocked (laid out) by process, so we don't
02563       // need a separate send buffer (besides the exports array).
02564       for (size_t i = 0; i < numBlocks; ++i) {
02565         size_t p = i + imageIndex;
02566         if (p > (numBlocks - 1)) {
02567           p -= numBlocks;
02568         }
02569 
02570         if (imagesTo_[p] != myImageID && packetsPerSend[p] > 0) {
02571           exports_view_type tmpSend =
02572             subview_offset(exports, sendPacketOffsets[p], packetsPerSend[p]);
02573 
02574           if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first
02575             send<int> (tmpSend,
02576                        as<int> (tmpSend.size ()),
02577                        imagesTo_[p], tag, *comm_);
02578           }
02579           else if (sendType == Details::DISTRIBUTOR_RSEND) {
02580             readySend<int> (tmpSend,
02581                             as<int> (tmpSend.size ()),
02582                             imagesTo_[p], tag, *comm_);
02583           }
02584           else if (sendType == Details::DISTRIBUTOR_ISEND) {
02585             exports_view_type tmpSendBuf =
02586               subview_offset (exports, sendPacketOffsets[p], packetsPerSend[p]);
02587             requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p],
02588                                              tag, *comm_));
02589           }
02590           else if (sendType == Details::DISTRIBUTOR_SSEND) {
02591             ssend<int> (tmpSend,
02592                         as<int> (tmpSend.size ()),
02593                         imagesTo_[p], tag, *comm_);
02594           }
02595           else {
02596             TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Tpetra::"
02597               "Distributor (4 args): Invalid send type.  We should never get "
02598               "here.  Please report this bug to the Tpetra developers.");
02599           }
02600         }
02601         else { // "Sending" the message to myself
02602           selfNum = p;
02603         }
02604       }
02605 
02606       if (selfMessage_) {
02607         deep_copy_offset(imports, exports, selfReceiveOffset,
02608                          sendPacketOffsets[selfNum], packetsPerSend[selfNum]);
02609       }
02610       if (debug_) {
02611         std::ostringstream os;
02612         os << myImageID << ": doPosts(4,fast) done" << endl;
02613         *out_ << os.str ();
02614       }
02615     }
02616     else { // data are not blocked by image, use send buffer
02617       if (debug_) {
02618         std::ostringstream os;
02619         os << myImageID << ": doPosts(4,slow): posting sends" << endl;
02620         *out_ << os.str ();
02621       }
02622 
02623       // FIXME (mfh 05 Mar 2013) This may be broken for Isend.
02624       Kokkos::View<Packet*,Layout,Device,Mem> sendArray =
02625         create_view<Packet,Layout,Device,Mem>("sendArray", maxNumPackets); // send buffer
02626 
02627       TEUCHOS_TEST_FOR_EXCEPTION(
02628         sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
02629         "Tpetra::Distributor::doPosts<" << TypeNameTraits<Packet>::name()
02630         << "> (3-argument version):" << endl
02631         << "The \"send buffer\" code path may not necessarily work with nonblocking sends.");
02632 
02633       Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0);
02634       size_t ioffset = 0;
02635       for (int j=0; j<numExportPacketsPerLID.size(); ++j) {
02636         indicesOffsets[j] = ioffset;
02637         ioffset += numExportPacketsPerLID[j];
02638       }
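      // indicesOffsets is the exclusive prefix sum of
      // numExportPacketsPerLID.  For example, {2, 0, 1, 4} yields
      // indicesOffsets = {0, 2, 2, 3} and ioffset = 7; LID j's packets
      // start at offset indicesOffsets[j] in exports.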
02639 
02640       for (size_t i = 0; i < numBlocks; ++i) {
02641         size_t p = i + imageIndex;
02642         if (p > (numBlocks - 1)) {
02643           p -= numBlocks;
02644         }
02645 
02646         if (imagesTo_[p] != myImageID) {
02647           size_t sendArrayOffset = 0;
02648           size_t j = startsTo_[p];
02649           size_t numPacketsTo_p = 0;
02650           for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
02651             deep_copy_offset(sendArray, exports, sendArrayOffset,
02652                              indicesOffsets[j], numExportPacketsPerLID[j]);
02653             sendArrayOffset += numExportPacketsPerLID[j];
02654             numPacketsTo_p += numExportPacketsPerLID[j]; } // count packets; otherwise no send is ever posted
02655           if (numPacketsTo_p > 0) {
02656             Kokkos::View<Packet*, Layout, Device, Mem> tmpSend =
02657               subview_offset(sendArray, size_t(0), numPacketsTo_p);
02658 
02659             if (sendType == Details::DISTRIBUTOR_RSEND) {
02660               readySend<int> (tmpSend,
02661                               as<int> (tmpSend.size ()),
02662                               imagesTo_[p], tag, *comm_);
02663             }
02664             else if (sendType == Details::DISTRIBUTOR_ISEND) {
02665               exports_view_type tmpSendBuf =
02666                 subview_offset (sendArray, size_t(0), numPacketsTo_p);
02667               requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p],
02668                                                tag, *comm_));
02669             }
02670             else if (sendType == Details::DISTRIBUTOR_SSEND) {
02671               ssend<int> (tmpSend,
02672                           as<int> (tmpSend.size ()),
02673                           imagesTo_[p], tag, *comm_);
02674             }
02675             else { // sendType == Details::DISTRIBUTOR_SEND, the default
02676               send<int> (tmpSend,
02677                          as<int> (tmpSend.size ()),
02678                          imagesTo_[p], tag, *comm_);
02679             }
02680           }
02681         }
02682         else { // "Sending" the message to myself
02683           selfNum = p;
02684           selfIndex = startsTo_[p];
02685         }
02686       }
02687 
02688       if (selfMessage_) {
02689         for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
02690           deep_copy_offset(imports, exports, selfReceiveOffset,
02691                            indicesOffsets[selfIndex],
02692                            numExportPacketsPerLID[selfIndex]);
02693           selfReceiveOffset += numExportPacketsPerLID[selfIndex];
02694           ++selfIndex;
02695         }
02696       }
02697       if (debug_) {
02698         std::ostringstream os;
02699         os << myImageID << ": doPosts(4,slow) done" << endl;
02700         *out_ << os.str ();
02701       }
02702     }
02703   }
02704 
02705   template <class Packet, class Layout, class Device, class Mem>
02706   void Distributor::
02707   doReversePostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
02708                           size_t numPackets,
02709                           const Kokkos::View<Packet*, Layout, Device, Mem> &imports)
02710   {
02711     // If the MPI library doesn't support RDMA for communication
02712     // directly to/from GPU memory, stage the exports on the host, do
02713     // the communication there, then copy the imports back to the GPU.
02714     //
02715     // We need to do this here instead of doReversePosts() because the
02716     // copy back to the GPU must occur after the waits.
02717     typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view;
02718     typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view;
02719     if (!enable_cuda_rdma_ && !exports_view::is_hostspace) {
02720       typename exports_view::HostMirror host_exports =
02721         Kokkos::create_mirror_view(exports);
02722       typename imports_view::HostMirror host_imports =
02723         Kokkos::create_mirror_view(imports);
02724       Kokkos::deep_copy (host_exports, exports);
02725       doReversePostsAndWaits(Kokkos::Compat::create_const_view(host_exports),
02726                              numPackets,
02727                              host_imports);
02728       Kokkos::deep_copy (imports, host_imports);
02729       return;
02730     }
02731 
02732     doReversePosts (exports, numPackets, imports);
02733     doReverseWaits ();
02734   }
02735 
02736   template <class Packet, class Layout, class Device, class Mem>
02737   void Distributor::
02738   doReversePostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
02739                           const ArrayView<size_t> &numExportPacketsPerLID,
02740                           const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
02741                           const ArrayView<size_t> &numImportPacketsPerLID)
02742   {
02743     // If the MPI library doesn't support RDMA for communication
02744     // directly to/from GPU memory, stage the exports on the host, do
02745     // the communication there, then copy the imports back to the GPU.
02746     //
02747     // We need to do this here instead of doReversePosts() because the
02748     // copy back to the GPU must occur after the waits.
02749     typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view;
02750     typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view;
02751     if (!enable_cuda_rdma_ && !exports_view::is_hostspace) {
02752       typename exports_view::HostMirror host_exports =
02753         Kokkos::create_mirror_view(exports);
02754       typename imports_view::HostMirror host_imports =
02755         Kokkos::create_mirror_view(imports);
02756       Kokkos::deep_copy (host_exports, exports);
02757       doReversePostsAndWaits(Kokkos::Compat::create_const_view(host_exports),
02758                              numExportPacketsPerLID,
02759                              host_imports,
02760                              numImportPacketsPerLID);
02761       Kokkos::deep_copy (imports, host_imports);
02762       return;
02763     }
02764 
02765     TEUCHOS_TEST_FOR_EXCEPTION(requests_.size() != 0, std::runtime_error,
02766       Teuchos::typeName(*this) << "::doReversePostsAndWaits(): There are "
02767       << requests_.size() << " outstanding nonblocking messages.  It is "
02768       "incorrect to call doReversePostsAndWaits with posts outstanding.");
02769 
02770     doReversePosts (exports, numExportPacketsPerLID, imports,
02771                     numImportPacketsPerLID);
02772     doReverseWaits ();
02773   }
02774 
02775   template <class Packet, class Layout, class Device, class Mem>
02776   void Distributor::
02777   doReversePosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
02778                   size_t numPackets,
02779                   const Kokkos::View<Packet*, Layout, Device, Mem> &imports)
02780   {
02781     // FIXME (mfh 29 Mar 2012) WHY?
02782     TEUCHOS_TEST_FOR_EXCEPTION(! indicesTo_.empty (), std::runtime_error,
02783       Teuchos::typeName (*this) << "::doReversePosts(): Can only do reverse "
02784       "communication when original data are blocked by process.");
02785     if (reverseDistributor_.is_null ()) {
02786       createReverseDistributor ();
02787     }
02788     reverseDistributor_->doPosts (exports, numPackets, imports);
02789   }
02790 
02791   template <class Packet, class Layout, class Device, class Mem>
02792   void Distributor::
02793   doReversePosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
02794                   const ArrayView<size_t> &numExportPacketsPerLID,
02795                   const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
02796                   const ArrayView<size_t> &numImportPacketsPerLID)
02797   {
02798     // FIXME (mfh 29 Mar 2012) WHY?
02799     TEUCHOS_TEST_FOR_EXCEPTION(! indicesTo_.empty (), std::runtime_error,
02800       Teuchos::typeName (*this) << "::doReversePosts(): Can only do reverse "
02801       "communication when original data are blocked by process.");
02802     if (reverseDistributor_.is_null ()) {
02803       createReverseDistributor ();
02804     }
02805     reverseDistributor_->doPosts (exports, numExportPacketsPerLID,
02806                                   imports, numImportPacketsPerLID);
02807   }
02808 
02809 #endif
02810 
02811   template <class OrdinalType>
02812   void Distributor::
02813   computeSends (const ArrayView<const OrdinalType> & importIDs,
02814                 const ArrayView<const int> & importNodeIDs,
02815                 Array<OrdinalType> & exportIDs,
02816                 Array<int> & exportNodeIDs)
02817   {
02818     // NOTE (mfh 19 Apr 2012): There was a note on this code saying:
02819     // "assumes that size_t >= Ordinal".  The code certainly does
02820     // assume that sizeof(size_t) >= sizeof(OrdinalType) as well as
02821     // sizeof(size_t) >= sizeof(int).  This is because it casts the
02822     // OrdinalType elements of importIDs (along with their
02823     // corresponding process IDs, as int) to size_t, and does a
02824     // doPostsAndWaits<size_t>() to send the packed data.
02825     using std::endl;
02826     typedef typename ArrayView<const OrdinalType>::size_type size_type;
02827 
02828     Teuchos::OSTab tab (out_);
02829     const int myRank = comm_->getRank ();
02830     if (debug_) {
02831       std::ostringstream os;
02832       os << myRank << ": computeSends" << endl;
02833       *out_ << os.str ();
02834     }
02835 
02836     const size_type numImports = importNodeIDs.size ();
02837     TEUCHOS_TEST_FOR_EXCEPTION(
02838       importIDs.size () != numImports, std::invalid_argument, "Tpetra::"
02839       "Distributor::computeSends: On Process " << myRank << ": importNodeIDs."
02840       "size() = " << importNodeIDs.size () << " != importIDs.size() = " <<
02841       importIDs.size () << ".");
02842 
02843     Array<size_t> importObjs (2*numImports);
02844     // Pack pairs (importIDs[i], my process ID) to send into importObjs.
02845     for (size_type i = 0; i < numImports; ++i) {
02846       importObjs[2*i]   = static_cast<size_t> (importIDs[i]);
02847       importObjs[2*i+1] = static_cast<size_t> (myRank);
02848     }
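    // Worked example: on process 3 with importIDs = {42, 7}, this
    // packs importObjs = {42, 3, 7, 3}.  Each (GID, PID) pair tells
    // its receiver both which ID was requested and which process to
    // send that ID's data back to.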
02849     //
02850     // Use a temporary Distributor to send the (importIDs[i], myRank)
02851     // pairs to importNodeIDs[i].
02852     //
02853     Distributor tempPlan (comm_, out_);
02854     if (debug_) {
02855       std::ostringstream os;
02856       os << myRank << ": computeSends: tempPlan.createFromSends" << endl;
02857       *out_ << os.str ();
02858     }
02859 
02860     // mfh 20 Mar 2014: An extra-cautious cast from unsigned to
02861     // signed, in order to forestall any possible causes for Bug 6069.
02862     const size_t numExportsAsSizeT = tempPlan.createFromSends (importNodeIDs);
02863     const size_type numExports = static_cast<size_type> (numExportsAsSizeT);
02864     TEUCHOS_TEST_FOR_EXCEPTION(
02865       numExports < 0, std::logic_error, "Tpetra::Distributor::computeSends: "
02866       "tempPlan.createFromSends() returned numExports = " << numExportsAsSizeT
02867       << " as a size_t, which overflows to " << numExports << " when cast to "
02868       << Teuchos::TypeNameTraits<size_type>::name () << ".  "
02869       "Please report this bug to the Tpetra developers.");
02870     TEUCHOS_TEST_FOR_EXCEPTION(
02871       static_cast<size_type> (tempPlan.getTotalReceiveLength ()) != numExports,
02872       std::logic_error, "Tpetra::Distributor::computeSends: tempPlan.getTotal"
02873       "ReceiveLength() = " << tempPlan.getTotalReceiveLength () << " != num"
02874       "Exports = " << numExports  << ".  Please report this bug to the "
02875       "Tpetra developers.");
02876 
02877     if (numExports > 0) {
02878       exportIDs.resize (numExports);
02879       exportNodeIDs.resize (numExports);
02880     }
02881 
02882     // exportObjs: Packed receive buffer.  (exportObjs[2*i],
02883     // exportObjs[2*i+1]) will give the (GID, PID) pair for export i,
02884     // after tempPlan.doPostsAndWaits(...) finishes below.
02885     //
02886     // FIXME (mfh 19 Mar 2014) This only works if OrdinalType fits in
02887     // size_t.  This issue might come up, for example, on a 32-bit
02888     // machine using 64-bit global indices.  I will add a check here
02889     // for that case.
02890     TEUCHOS_TEST_FOR_EXCEPTION(
02891       sizeof (size_t) < sizeof (OrdinalType), std::logic_error,
02892       "Tpetra::Distributor::computeSends: sizeof(size_t) = " << sizeof(size_t)
02893       << " < sizeof(" << Teuchos::TypeNameTraits<OrdinalType>::name () << ") = "
02894       << sizeof (OrdinalType) << ".  This violates an assumption of the "
02895       "method.  It's not hard to work around (just use Array<OrdinalType> as "
02896       "the export buffer, not Array<size_t>), but we haven't done that yet.  "
02897       "Please report this bug to the Tpetra developers.");
02898 
02899     TEUCHOS_TEST_FOR_EXCEPTION(
02900       tempPlan.getTotalReceiveLength () < static_cast<size_t> (numExports),
02901       std::logic_error,
02902       "Tpetra::Distributor::computeSends: tempPlan.getTotalReceiveLength() = "
02903       << tempPlan.getTotalReceiveLength() << " < numExports = " << numExports
02904       << ".  Please report this bug to the Tpetra developers.");
02905 
02906     Array<size_t> exportObjs (tempPlan.getTotalReceiveLength () * 2);
02907     if (debug_) {
02908       std::ostringstream os;
02909       os << myRank << ": computeSends: tempPlan.doPostsAndWaits" << endl;
02910       *out_ << os.str ();
02911     }
02912     tempPlan.doPostsAndWaits<size_t> (importObjs (), 2, exportObjs ());
02913 
02914     // Unpack received (GID, PID) pairs into exportIDs resp. exportNodeIDs.
02915     for (size_type i = 0; i < numExports; ++i) {
02916       exportIDs[i] = static_cast<OrdinalType> (exportObjs[2*i]);
02917       exportNodeIDs[i] = static_cast<int> (exportObjs[2*i+1]);
02918     }
02919 
02920     if (debug_) {
02921       std::ostringstream os;
02922       os << myRank << ": computeSends done" << endl;
02923       *out_ << os.str ();
02924     }
02925   }
02926 
02927   template <class OrdinalType>
02928   void Distributor::
02929   createFromRecvs (const ArrayView<const OrdinalType> &remoteIDs,
02930                    const ArrayView<const int> &remoteImageIDs,
02931                    Array<OrdinalType> &exportGIDs,
02932                    Array<int> &exportNodeIDs)
02933   {
02934     using std::endl;
02935 
02936     Teuchos::OSTab tab (out_);
02937     const int myRank = comm_->getRank();
02938 
02939     if (debug_) {
02940       *out_ << myRank << ": createFromRecvs" << endl;
02941     }
02942 
02943 #ifdef HAVE_TPETRA_DEBUG
02944     using Teuchos::outArg;
02945     using Teuchos::reduceAll;
02946 
02947     // In debug mode, first test locally, then do an all-reduce to
02948     // make sure that all processes passed.
02949     const int errProc =
02950       (remoteIDs.size () != remoteImageIDs.size ()) ? myRank : -1;
02951     int maxErrProc = -1;
02952     reduceAll<int, int> (*comm_, Teuchos::REDUCE_MAX, errProc, outArg (maxErrProc));
02953     TEUCHOS_TEST_FOR_EXCEPTION(maxErrProc != -1, std::runtime_error,
02954       Teuchos::typeName (*this) << "::createFromRecvs(): lists of remote IDs "
02955       "and remote process IDs must have the same size on all participating "
02956       "processes.  Maximum process ID with error: " << maxErrProc << ".");
02957 #else // NOT HAVE_TPETRA_DEBUG
02958 
02959     // In non-debug mode, just test locally.
02960     TEUCHOS_TEST_FOR_EXCEPTION(
02961       remoteIDs.size () != remoteImageIDs.size (), std::invalid_argument,
02962       Teuchos::typeName (*this) << "::createFromRecvs<" <<
02963       Teuchos::TypeNameTraits<OrdinalType>::name () << ">(): On Process " <<
02964       myRank << ": remoteIDs.size() = " << remoteIDs.size () << " != "
02965       "remoteImageIDs.size() = " << remoteImageIDs.size () << ".");
02966 #endif // HAVE_TPETRA_DEBUG
02967 
02968     computeSends (remoteIDs, remoteImageIDs, exportGIDs, exportNodeIDs);
02969 
02970     const size_t numProcsSendingToMe = createFromSends (exportNodeIDs ());
02971 
02972     if (debug_) {
02973       // NOTE (mfh 20 Mar 2014) If remoteImageIDs could contain
02974       // duplicates, then its length might not be the right check here,
02975       // even if we account for selfMessage_.  selfMessage_ is set in
02976       // createFromSends.
02977       std::ostringstream os;
02978       os << "Proc " << myRank << ": {numProcsSendingToMe: "
02979          << numProcsSendingToMe << ", remoteImageIDs.size(): "
02980          << remoteImageIDs.size () << ", selfMessage_: "
02981          << (selfMessage_ ? "true" : "false") << "}" << std::endl;
02982       std::cerr << os.str ();
02983     }
02984 
02985     if (debug_) {
02986       *out_ << myRank << ": createFromRecvs done" << endl;
02987     }
02988 
02989     howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS;
02990   }
02991 
02992 } // namespace Tpetra
02993 
02994 #endif // TPETRA_DISTRIBUTOR_HPP