Tpetra Matrix/Vector Services Version of the Day
Tpetra_Distributor.hpp
00001 // @HEADER
00002 // ***********************************************************************
00003 //
00004 //          Tpetra: Templated Linear Algebra Services Package
00005 //                 Copyright (2008) Sandia Corporation
00006 //
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 //
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00038 //
00039 // ************************************************************************
00040 // @HEADER
00041 
00042 #ifndef TPETRA_DISTRIBUTOR_HPP
00043 #define TPETRA_DISTRIBUTOR_HPP
00044 
00045 #include "Tpetra_Util.hpp"
00046 #include <Teuchos_as.hpp>
00047 #include <Teuchos_Describable.hpp>
00048 #include <Teuchos_ParameterListAcceptorDefaultBase.hpp>
00049 #include <Teuchos_VerboseObject.hpp>
00050 
00051 // If TPETRA_DISTRIBUTOR_TIMERS is defined, Distributor will time
00052 // doPosts (both versions) and doWaits, and register those timers with
00053 // Teuchos::TimeMonitor so that summarize() or report() will show
00054 // results.
00055 
00056 // #ifndef TPETRA_DISTRIBUTOR_TIMERS
00057 // #  define TPETRA_DISTRIBUTOR_TIMERS 1
00058 // #endif // TPETRA_DISTRIBUTOR_TIMERS
00059 
00060 #ifdef TPETRA_DISTRIBUTOR_TIMERS
00061 #  undef TPETRA_DISTRIBUTOR_TIMERS
00062 #endif // TPETRA_DISTRIBUTOR_TIMERS
00063 
00064 #if TPETRA_USE_KOKKOS_DISTOBJECT || defined(TPETRA_HAVE_KOKKOS_REFACTOR)
00065 #include "KokkosCompat_View.hpp"
00066 #include "Kokkos_View.hpp"
00067 #include "Kokkos_TeuchosCommAdapters.hpp"
00068 #endif
00069 
00070 
00071 namespace Tpetra {
00072 
// Implementation details for Distributor; not part of Tpetra's public API.
00073   namespace Details {
          // Which underlying point-to-point primitive Distributor uses to
          // post its sends.  Selected at run time via the "Send type"
          // parameter (see Distributor::setParameterList).
00078     enum EDistributorSendType {
00079       DISTRIBUTOR_ISEND, // Use MPI_Isend (Teuchos::isend)
00080       DISTRIBUTOR_RSEND, // Use MPI_Rsend (Teuchos::readySend)
00081       DISTRIBUTOR_SEND,  // Use MPI_Send (Teuchos::send)
00082       DISTRIBUTOR_SSEND  // Use MPI_Ssend (Teuchos::ssend)
00083     };
00084 
          // Human-readable name of an EDistributorSendType value (e.g., for
          // parameter lists and diagnostic output).
00089     std::string
00090     DistributorSendTypeEnumToString (EDistributorSendType sendType);
00091 
          // How a Distributor instance acquired its communication plan.
          // Useful for debugging output; see Distributor::howInitialized().
00096     enum EDistributorHowInitialized {
00097       DISTRIBUTOR_NOT_INITIALIZED, // Not initialized yet
00098       DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS, // By createFromSends
00099       DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS, // By createFromRecvs
00100       DISTRIBUTOR_INITIALIZED_BY_REVERSE, // By createReverseDistributor
00101       DISTRIBUTOR_INITIALIZED_BY_COPY // By copy constructor
00102     };
00103 
          // Human-readable name of an EDistributorHowInitialized value.
00108     std::string
00109     DistributorHowInitializedEnumToString (EDistributorHowInitialized how);
00110 
00111   } // namespace Details
00112 
        // Valid values for Distributor's "Send type" parameter, as strings
        // (presumably matching the EDistributorSendType enumerators —
        // confirm against the definition).
00119   Array<std::string> distributorSendTypes ();
00120 
// Distributor sets up and executes a distributed-memory communication
// plan: build the plan with createFromSends() or createFromRecvs(), then
// move data with doPosts()/doWaits() (or the combined doPostsAndWaits()),
// in the forward or the reverse direction.  This is a declaration-only
// listing; non-template member definitions live elsewhere.
00188   class Distributor :
00189     public Teuchos::Describable,
00190     public Teuchos::ParameterListAcceptorDefaultBase,
00191     public Teuchos::VerboseObject<Distributor> {
00192   public:
00194 
00195 
          // Constructors: all take the communicator over which to
          // communicate; optionally also an output stream (for debug
          // output) and/or a ParameterList of run-time settings.
00203     explicit Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm);
00204 
00214     Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
00215                  const Teuchos::RCP<Teuchos::FancyOStream>& out);
00216 
00229     Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
00230                  const Teuchos::RCP<Teuchos::ParameterList>& plist);
00231 
00246     Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
00247                  const Teuchos::RCP<Teuchos::FancyOStream>& out,
00248                  const Teuchos::RCP<Teuchos::ParameterList>& plist);
00249 
00251     Distributor (const Distributor &distributor);
00252 
00254     virtual ~Distributor ();
00255 
          // Exchange state with rhs.  NOTE(review): presumably supports a
          // copy-and-swap idiom — confirm against the definition.
00261     void swap (Distributor& rhs);
00262 
00264 
00265 
00266 
          // Teuchos::ParameterListAcceptor overrides.
00271     void setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& plist);
00272 
00277     Teuchos::RCP<const Teuchos::ParameterList> getValidParameters () const;
00278 
00280 
00281 
00282 
          // Plan setup.  createFromSends builds the plan from the list of
          // target process ranks and returns a count (see its definition
          // for the exact meaning); createFromRecvs builds the plan from
          // remote IDs/ranks and fills in the matching export lists.
00302     size_t createFromSends (const ArrayView<const int>& exportNodeIDs);
00303 
00337     template <class Ordinal>
00338     void
00339     createFromRecvs (const ArrayView<const Ordinal>& remoteIDs,
00340                      const ArrayView<const int>& remoteNodeIDs,
00341                      Array<Ordinal>& exportIDs,
00342                      Array<int>& exportNodeIDs);
00343 
00345 
00346 
00347 
          // Read-only queries about the communication plan.
00351     size_t getNumReceives() const;
00352 
00356     size_t getNumSends() const;
00357 
00359     bool hasSelfMessage() const;
00360 
00362     size_t getMaxSendLength() const;
00363 
00365     size_t getTotalReceiveLength() const;
00366 
00371     ArrayView<const int> getImagesFrom() const;
00372 
00377     ArrayView<const int> getImagesTo() const;
00378 
00386     ArrayView<const size_t> getLengthsFrom() const;
00387 
00395     ArrayView<const size_t> getLengthsTo() const;
00396 
          // How this Distributor was initialized (inline accessor for
          // howInitialized_).
00401     Details::EDistributorHowInitialized howInitialized () const {
00402       return howInitialized_;
00403     }
00404 
00406 
00407 
00408 
          // The "reverse" plan, which swaps the roles of senders and
          // receivers (see the mutable reverseDistributor_ cache below).
00419     RCP<Distributor> getReverse() const;
00420 
00422 
00423 
00424 
          // Communication execution.  The 3-argument variants assume the
          // same number of packets per LID; the 4-argument variants take
          // per-LID packet counts.  doPosts() starts the (nonblocking)
          // exchange, doWaits() completes it, and doPostsAndWaits() does
          // both.  The doPosts() buffers are ArrayRCPs because they must
          // persist until the corresponding doWaits() returns.
00445     template <class Packet>
00446     void
00447     doPostsAndWaits (const ArrayView<const Packet> &exports,
00448                      size_t numPackets,
00449                      const ArrayView<Packet> &imports);
00450 
00472     template <class Packet>
00473     void
00474     doPostsAndWaits (const ArrayView<const Packet> &exports,
00475                      const ArrayView<size_t> &numExportPacketsPerLID,
00476                      const ArrayView<Packet> &imports,
00477                      const ArrayView<size_t> &numImportPacketsPerLID);
00478 
00503     template <class Packet>
00504     void
00505     doPosts (const ArrayRCP<const Packet> &exports,
00506              size_t numPackets,
00507              const ArrayRCP<Packet> &imports);
00508 
00527     template <class Packet>
00528     void
00529     doPosts (const ArrayRCP<const Packet> &exports,
00530              const ArrayView<size_t> &numExportPacketsPerLID,
00531              const ArrayRCP<Packet> &imports,
00532              const ArrayView<size_t> &numImportPacketsPerLID);
00533 
00540     void doWaits ();
00541 
          // Reverse-plan analogues of the methods above.
00546     template <class Packet>
00547     void
00548     doReversePostsAndWaits (const ArrayView<const Packet> &exports,
00549                             size_t numPackets,
00550                             const ArrayView<Packet> &imports);
00551 
00556     template <class Packet>
00557     void
00558     doReversePostsAndWaits (const ArrayView<const Packet> &exports,
00559                             const ArrayView<size_t> &numExportPacketsPerLID,
00560                             const ArrayView<Packet> &imports,
00561                             const ArrayView<size_t> &numImportPacketsPerLID);
00562 
00567     template <class Packet>
00568     void
00569     doReversePosts (const ArrayRCP<const Packet> &exports,
00570                     size_t numPackets,
00571                     const ArrayRCP<Packet> &imports);
00572 
00577     template <class Packet>
00578     void
00579     doReversePosts (const ArrayRCP<const Packet> &exports,
00580                     const ArrayView<size_t> &numExportPacketsPerLID,
00581                     const ArrayRCP<Packet> &imports,
00582                     const ArrayView<size_t> &numImportPacketsPerLID);
00583 
00590     void doReverseWaits ();
00591 
        // Kokkos::View overloads of the same operations, compiled in only
        // when the Kokkos-refactor code path is enabled.
00592 #if TPETRA_USE_KOKKOS_DISTOBJECT || defined(TPETRA_HAVE_KOKKOS_REFACTOR)
00593 
00614     template <class Packet, class Layout, class Device, class Mem>
00615     void
00616     doPostsAndWaits (
00617       const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
00618       size_t numPackets,
00619       const Kokkos::View<Packet*, Layout, Device, Mem> &imports);
00620 
00642     template <class Packet, class Layout, class Device, class Mem>
00643     void
00644     doPostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
00645                      const ArrayView<size_t> &numExportPacketsPerLID,
00646                      const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
00647                      const ArrayView<size_t> &numImportPacketsPerLID);
00648 
00673     template <class Packet, class Layout, class Device, class Mem>
00674     void
00675     doPosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
00676              size_t numPackets,
00677              const Kokkos::View<Packet*, Layout, Device, Mem> &imports);
00678 
00697     template <class Packet, class Layout, class Device, class Mem>
00698     void
00699     doPosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
00700              const ArrayView<size_t> &numExportPacketsPerLID,
00701              const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
00702              const ArrayView<size_t> &numImportPacketsPerLID);
00703 
00708     template <class Packet, class Layout, class Device, class Mem>
00709     void
00710     doReversePostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
00711                             size_t numPackets,
00712                             const Kokkos::View<Packet*, Layout, Device, Mem> &imports);
00713 
00718     template <class Packet, class Layout, class Device, class Mem>
00719     void
00720     doReversePostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
00721                             const ArrayView<size_t> &numExportPacketsPerLID,
00722                             const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
00723                             const ArrayView<size_t> &numImportPacketsPerLID);
00724 
00729     template <class Packet, class Layout, class Device, class Mem>
00730     void
00731     doReversePosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
00732                     size_t numPackets,
00733                     const Kokkos::View<Packet*, Layout, Device, Mem> &imports);
00734 
00739     template <class Packet, class Layout, class Device, class Mem>
00740     void
00741     doReversePosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
00742                     const ArrayView<size_t> &numExportPacketsPerLID,
00743                     const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
00744                     const ArrayView<size_t> &numImportPacketsPerLID);
00745 #endif
00746 
          // Bytes sent / received in the most recent communication round
          // (values of lastRoundBytesSend_ / lastRoundBytesRecv_).
00750     void getLastDoStatistics(size_t & bytes_sent,  size_t & bytes_recvd) const{
00751       bytes_sent  = lastRoundBytesSend_;
00752       bytes_recvd = lastRoundBytesRecv_;
00753     }
00754 
00755 
00757 
00758 
00759 
          // Teuchos::Describable overrides.
00761     std::string description() const;
00762 
00764     void describe (Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const;
00765 
00767 
00768   private:
          // Communicator over which all exchanges take place.
00770     RCP<const Comm<int> > comm_;
00771 
          // Stream for debug output (used when debug_ is true).
00773     Teuchos::RCP<Teuchos::FancyOStream> out_;
00774 
00776     Details::EDistributorHowInitialized howInitialized_;
00777 
00779 
00780 
          // Run-time parameters (set via setParameterList).
00782     Details::EDistributorSendType sendType_;
00783 
00785     bool barrierBetween_;
00786 
00788     bool debug_;
00789 
00791 
00800     bool enable_cuda_rdma_;
00801 
00803 
          // "Forward" plan data: whom we send to and how much.
00812     size_t numExports_;
00813 
00817     bool selfMessage_;
00818 
00828     size_t numSends_;
00829 
00834     Array<int> imagesTo_;
00835 
00844     Array<size_t> startsTo_;
00845 
00851     Array<size_t> lengthsTo_;
00852 
00856     size_t maxSendLength_;
00857 
          // When empty, export data are already blocked by destination
          // process ("fast path" in doPosts); otherwise a send buffer is
          // packed per message ("slow path").
00873     Array<size_t> indicesTo_;
00874 
          // "Receive side" plan data: whom we receive from and how much.
          // numReceives_ does NOT count the self-message, if any.
00884     size_t numReceives_;
00885 
00892     size_t totalReceiveLength_;
00893 
00899     Array<size_t> lengthsFrom_;
00900 
00906     Array<int> imagesFrom_;
00907 
00913     Array<size_t> startsFrom_;
00914 
00921     Array<size_t> indicesFrom_;
00922 
          // Outstanding nonblocking requests; requests_.size() serves as
          // the count of messages still in flight.
00929     Array<RCP<Teuchos::CommRequest<int> > > requests_;
00930 
          // Lazily created reverse plan; mutable so the const getReverse()
          // can cache it.
00935     mutable RCP<Distributor> reverseDistributor_;
00936 
00937 
          // Byte counts of the last round, reported by getLastDoStatistics().
00939     size_t lastRoundBytesSend_;
00940 
00942     size_t lastRoundBytesRecv_;
00943 
00944 #ifdef TPETRA_DISTRIBUTOR_TIMERS
00945     Teuchos::RCP<Teuchos::Time> timer_doPosts3_;
00946     Teuchos::RCP<Teuchos::Time> timer_doPosts4_;
00947     Teuchos::RCP<Teuchos::Time> timer_doWaits_;
00948     Teuchos::RCP<Teuchos::Time> timer_doPosts3_recvs_;
00949     Teuchos::RCP<Teuchos::Time> timer_doPosts4_recvs_;
00950     Teuchos::RCP<Teuchos::Time> timer_doPosts3_barrier_;
00951     Teuchos::RCP<Teuchos::Time> timer_doPosts4_barrier_;
00952     Teuchos::RCP<Teuchos::Time> timer_doPosts3_sends_;
00953     Teuchos::RCP<Teuchos::Time> timer_doPosts4_sends_;
00954 
00956     void makeTimers ();
00957 #endif // TPETRA_DISTRIBUTOR_TIMERS
00958 
00970     bool useDistinctTags_;
00971 
          // MPI message tag for the given code path; consulted by
          // doPosts() so that fast- and slow-path processes agree on tags.
00976     int getTag (const int pathTag) const;
00977 
          // Shared constructor implementation.
00991     void
00992     init (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
00993           const Teuchos::RCP<Teuchos::ParameterList>& plist);
00994 
01005     void computeReceives ();
01006 
01019     template <class Ordinal>
01020     void computeSends (const ArrayView<const Ordinal> &importIDs,
01021                        const ArrayView<const int> &importNodeIDs,
01022                        Array<Ordinal> &exportIDs,
01023                        Array<int> &exportNodeIDs);
01024 
          // Builds reverseDistributor_ (called lazily from getReverse()).
01026     void createReverseDistributor() const;
01027 
01028   }; // class Distributor
01029 
01030 
01031   template <class Packet>
01032   void Distributor::
01033   doPostsAndWaits (const ArrayView<const Packet>& exports,
01034                    size_t numPackets,
01035                    const ArrayView<Packet>& imports)
01036   {
01037     using Teuchos::arcp;
01038     using Teuchos::ArrayRCP;
01039     typedef typename ArrayRCP<const Packet>::size_type size_type;
01040 
01041     TEUCHOS_TEST_FOR_EXCEPTION(
01042       requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
01043       "doPostsAndWaits(3 args): There are " << requests_.size () <<
01044       " outstanding nonblocking messages pending.  It is incorrect to call "
01045       "this method with posts outstanding.");
01046 
01047     // doPosts() accepts the exports and imports arrays as ArrayRCPs,
01048     // requiring that the memory location is persisting (as is
01049     // necessary for nonblocking receives).  However, it need only
01050     // persist until doWaits() completes, so it is safe for us to use
01051     // a nonpersisting reference in this case.  The use of a
01052     // nonpersisting reference is purely a performance optimization.
01053 
01054     //const Packet* exportsPtr = exports.getRawPtr();
01055     //ArrayRCP<const Packet> exportsArcp (exportsPtr, static_cast<size_type> (0),
01056     //                                    exports.size(), false);
01057     ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (),
01058                                         static_cast<size_type> (0),
01059                                         exports.size(), false);
01060 
01061     // For some reason, neither of the options below (that use arcp)
01062     // compile for Packet=std::complex<double> with GCC 4.5.1.  The
01063     // issue only arises with the exports array.  This is why we
01064     // construct a separate nonowning ArrayRCP.
01065 
01066     // doPosts (arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false),
01067     //              numPackets,
01068     //              arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false));
01069     // doPosts (arcp<const Packet> (exportsPtr, 0, exports.size(), false),
01070     //              numPackets,
01071     //              arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false));
01072     doPosts (exportsArcp,
01073              numPackets,
01074              arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false));
01075     doWaits ();
01076 
01077     lastRoundBytesSend_ = exports.size () * sizeof (Packet);
01078     lastRoundBytesRecv_ = imports.size () * sizeof (Packet);
01079   }
01080 
01081   template <class Packet>
01082   void Distributor::
01083   doPostsAndWaits (const ArrayView<const Packet>& exports,
01084                    const ArrayView<size_t> &numExportPacketsPerLID,
01085                    const ArrayView<Packet> &imports,
01086                    const ArrayView<size_t> &numImportPacketsPerLID)
01087   {
01088     using Teuchos::arcp;
01089     using Teuchos::ArrayRCP;
01090 
01091     TEUCHOS_TEST_FOR_EXCEPTION(
01092       requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
01093       "doPostsAndWaits: There are " << requests_.size () << " outstanding "
01094       "nonblocking messages pending.  It is incorrect to call doPostsAndWaits "
01095       "with posts outstanding.");
01096 
01097     // doPosts() accepts the exports and imports arrays as ArrayRCPs,
01098     // requiring that the memory location is persisting (as is
01099     // necessary for nonblocking receives).  However, it need only
01100     // persist until doWaits() completes, so it is safe for us to use
01101     // a nonpersisting reference in this case.
01102 
01103     // mfh 04 Apr 2012: For some reason, calling arcp<const Packet>
01104     // for Packet=std::complex<T> (e.g., T=float) fails to compile
01105     // with some versions of GCC.  The issue only arises with the
01106     // exports array.  This is why we construct a separate nonowning
01107     // ArrayRCP.
01108     typedef typename ArrayRCP<const Packet>::size_type size_type;
01109     ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (),
01110                                         static_cast<size_type> (0),
01111                                         exports.size (), false);
01112     // mfh 04 Apr 2012: This is the offending code.  This statement
01113     // would normally be in place of "exportsArcp" in the
01114     // doPosts() call below.
01115     //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false),
01116     doPosts (exportsArcp,
01117              numExportPacketsPerLID,
01118              arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false),
01119              numImportPacketsPerLID);
01120     doWaits ();
01121 
01122     lastRoundBytesSend_ = exports.size () * sizeof (Packet);
01123     lastRoundBytesRecv_ = imports.size () * sizeof (Packet);
01124   }
01125 
01126 
01127   template <class Packet>
01128   void Distributor::
01129   doPosts (const ArrayRCP<const Packet>& exports,
01130            size_t numPackets,
01131            const ArrayRCP<Packet>& imports)
01132   {
01133     using Teuchos::Array;
01134     using Teuchos::as;
01135     using Teuchos::FancyOStream;
01136     using Teuchos::includesVerbLevel;
01137     using Teuchos::ireceive;
01138     using Teuchos::isend;
01139     using Teuchos::OSTab;
01140     using Teuchos::readySend;
01141     using Teuchos::send;
01142     using Teuchos::ssend;
01143     using Teuchos::TypeNameTraits;
01144     using Teuchos::typeName;
01145     using std::endl;
01146     typedef Array<size_t>::size_type size_type;
01147 
01148     Teuchos::OSTab tab (out_);
01149 
01150 #ifdef TPETRA_DISTRIBUTOR_TIMERS
01151     Teuchos::TimeMonitor timeMon (*timer_doPosts3_);
01152 #endif // TPETRA_DISTRIBUTOR_TIMERS
01153 
01154     // Run-time configurable parameters that come from the input
01155     // ParameterList set by setParameterList().
01156     const Details::EDistributorSendType sendType = sendType_;
01157     const bool doBarrier = barrierBetween_;
01158 
01159 // #ifdef HAVE_TEUCHOS_DEBUG
01160 //     // Prepare for verbose output, if applicable.
01161 //     Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
01162 //     (void) verbLevel; // Silence "unused variable" compiler warning.
01163 //     RCP<FancyOStream> out = this->getOStream ();
01164 //     // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
01165 //     //   includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
01166 //     const bool doPrint = out.get () && (comm_->getRank () == 0);
01167 
01168 //     if (doPrint) {
01169 //       // Only need one process to print out parameters.
01170 //       *out << "Distributor::doPosts (3 args)" << endl;
01171 //     }
01172 //     // Add one tab level.  We declare this outside the doPrint scopes
01173 //     // so that the tab persists until the end of this method.
01174 //     OSTab tab = this->getOSTab ();
01175 //     if (doPrint) {
01176 //       *out << "Parameters:" << endl;
01177 //       {
01178 //         OSTab tab2 (out);
01179 //         *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
01180 //              << endl << "barrierBetween: " << doBarrier << endl;
01181 //       }
01182 //     }
01183 // #endif // HAVE_TEUCHOS_DEBUG
01184 
01185     TEUCHOS_TEST_FOR_EXCEPTION(
01186       sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, std::logic_error,
01187       "Tpetra::Distributor::doPosts(3 args): Ready-send version requires a "
01188       "barrier between posting receives and posting ready sends.  This should "
01189       "have been checked before.  "
01190       "Please report this bug to the Tpetra developers.");
01191 
01192     const int myImageID = comm_->getRank ();
01193     size_t selfReceiveOffset = 0;
01194 
01195     // Each message has the same number of packets.
01196     //
01197     // FIXME (mfh 18 Jul 2014): Relaxing this test from strict
01198     // inequality to a less-than seems to have fixed Bug 6170.  It's
01199     // OK for the 'imports' array to be longer than it needs to be;
01200     // I'm just curious why it would be.
01201     const size_t totalNumImportPackets = totalReceiveLength_ * numPackets;
01202     TEUCHOS_TEST_FOR_EXCEPTION(
01203       static_cast<size_t> (imports.size ()) < totalNumImportPackets,
01204       std::invalid_argument, "Tpetra::Distributor::doPosts(3 args): The "
01205       "'imports' array must have enough entries to hold the expected number "
01206       "of import packets.  imports.size() = " << imports.size () << " < "
01207       "totalNumImportPackets = " << totalNumImportPackets << ".");
01208 
01209     // MPI tag for nonblocking receives and blocking sends in this
01210     // method.  Some processes might take the "fast" path
01211     // (indicesTo_.empty()) and others might take the "slow" path for
01212     // the same doPosts() call, so the path tag must be the same for
01213     // both.
01214     const int pathTag = 0;
01215     const int tag = this->getTag (pathTag);
01216 
01217     if (debug_) {
01218       TEUCHOS_TEST_FOR_EXCEPTION(
01219         requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
01220         "doPosts(3 args): Process " << myImageID << ": requests_.size() = "
01221         << requests_.size () << " != 0.");
01222       std::ostringstream os;
01223       os << myImageID << ": doPosts(3,"
01224          << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
01225       *out_ << os.str ();
01226     }
01227 
01228     // Distributor uses requests_.size() as the number of outstanding
01229     // nonblocking message requests, so we resize to zero to maintain
01230     // this invariant.
01231     //
01232     // numReceives_ does _not_ include the self message, if there is
01233     // one.  Here, we do actually send a message to ourselves, so we
01234     // include any self message in the "actual" number of receives to
01235     // post.
01236     //
01237     // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
01238     // doesn't (re)allocate its array of requests.  That happens in
01239     // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
01240     // demand), or Resize_().
01241     const size_type actualNumReceives = as<size_type> (numReceives_) +
01242       as<size_type> (selfMessage_ ? 1 : 0);
01243     requests_.resize (0);
01244 
01245     // Post the nonblocking receives.  It's common MPI wisdom to post
01246     // receives before sends.  In MPI terms, this means favoring
01247     // adding to the "posted queue" (of receive requests) over adding
01248     // to the "unexpected queue" (of arrived messages not yet matched
01249     // with a receive).
01250     {
01251 #ifdef TPETRA_DISTRIBUTOR_TIMERS
01252       Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3_recvs_);
01253 #endif // TPETRA_DISTRIBUTOR_TIMERS
01254 
01255       size_t curBufOffset = 0;
01256       for (size_type i = 0; i < actualNumReceives; ++i) {
01257         const size_t curBufLen = lengthsFrom_[i] * numPackets;
01258         if (imagesFrom_[i] != myImageID) {
01259           // If my process is receiving these packet(s) from another
01260           // process (not a self-receive):
01261           //
01262           // 1. Set up the persisting view (recvBuf) of the imports
01263           //    array, given the offset and size (total number of
01264           //    packets from process imagesFrom_[i]).
01265           // 2. Start the Irecv and save the resulting request.
01266           TEUCHOS_TEST_FOR_EXCEPTION(
01267             curBufOffset + curBufLen > static_cast<size_t> (imports.size ()),
01268             std::logic_error, "Tpetra::Distributor::doPosts(3 args): Exceeded "
01269             "size of 'imports' array in packing loop on Process " << myImageID
01270             << ".  imports.size() = " << imports.size () << " < offset + length"
01271             " = " << (curBufOffset + curBufLen) << ".");
01272 
01273           ArrayRCP<Packet> recvBuf =
01274             imports.persistingView (curBufOffset, curBufLen);
01275           requests_.push_back (ireceive<int, Packet> (recvBuf, imagesFrom_[i],
01276                                                       tag, *comm_));
01277           if (debug_) {
01278             std::ostringstream os;
01279             os << myImageID << ": doPosts(3,"
01280                << (indicesTo_.empty () ? "fast" : "slow") << "): "
01281                << "Posted irecv from Proc " << imagesFrom_[i] << " with "
01282               "specified tag " << tag << endl;
01283             *out_ << os.str ();
01284           }
01285         }
01286         else { // Receiving from myself
01287           selfReceiveOffset = curBufOffset; // Remember the self-recv offset
01288         }
01289         curBufOffset += curBufLen;
01290       }
01291     }
01292 
01293     if (doBarrier) {
01294 #ifdef TPETRA_DISTRIBUTOR_TIMERS
01295       Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3_barrier_);
01296 #endif // TPETRA_DISTRIBUTOR_TIMERS
01297       // If we are using ready sends (MPI_Rsend) below, we need to do
01298       // a barrier before we post the ready sends.  This is because a
01299       // ready send requires that its matching receive has already
01300       // been posted before the send has been posted.  The only way to
01301       // guarantee that in this case is to use a barrier.
01302       comm_->barrier ();
01303     }
01304 
01305 #ifdef TPETRA_DISTRIBUTOR_TIMERS
01306     Teuchos::TimeMonitor timeMonSends (*timer_doPosts3_sends_);
01307 #endif // TPETRA_DISTRIBUTOR_TIMERS
01308 
01309     // setup scan through imagesTo_ list starting with higher numbered images
01310     // (should help balance message traffic)
01311     //
01312     // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
01313     // It doesn't depend on the input at all.
01314     size_t numBlocks = numSends_ + selfMessage_;
01315     size_t imageIndex = 0;
01316     while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) {
01317       ++imageIndex;
01318     }
01319     if (imageIndex == numBlocks) {
01320       imageIndex = 0;
01321     }
01322 
01323     size_t selfNum = 0;
01324     size_t selfIndex = 0;
01325 
01326     if (indicesTo_.empty()) {
01327       if (debug_) {
01328         std::ostringstream os;
01329         os << myImageID << ": doPosts(3,fast): posting sends" << endl;
01330         *out_ << os.str ();
01331       }
01332 
01333       // Data are already blocked (laid out) by process, so we don't
01334       // need a separate send buffer (besides the exports array).
01335       for (size_t i = 0; i < numBlocks; ++i) {
01336         size_t p = i + imageIndex;
01337         if (p > (numBlocks - 1)) {
01338           p -= numBlocks;
01339         }
01340 
01341         if (imagesTo_[p] != myImageID) {
01342           ArrayView<const Packet> tmpSend =
01343             exports.view (startsTo_[p]*numPackets, lengthsTo_[p]*numPackets);
01344 
01345           if (sendType == Details::DISTRIBUTOR_SEND) {
01346             send<int, Packet> (tmpSend.getRawPtr (),
01347                                as<int> (tmpSend.size ()),
01348                                imagesTo_[p], tag, *comm_);
01349           }
01350           else if (sendType == Details::DISTRIBUTOR_ISEND) {
01351             ArrayRCP<const Packet> tmpSendBuf =
01352               exports.persistingView (startsTo_[p] * numPackets,
01353                                       lengthsTo_[p] * numPackets);
01354             requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p],
01355                                                      tag, *comm_));
01356           }
01357           else if (sendType == Details::DISTRIBUTOR_RSEND) {
01358             readySend<int, Packet> (tmpSend.getRawPtr (),
01359                                     as<int> (tmpSend.size ()),
01360                                     imagesTo_[p], tag, *comm_);
01361           }
01362           else if (sendType == Details::DISTRIBUTOR_SSEND) {
01363             ssend<int, Packet> (tmpSend.getRawPtr (),
01364                                 as<int> (tmpSend.size ()),
01365                                 imagesTo_[p], tag, *comm_);
01366           } else {
01367             TEUCHOS_TEST_FOR_EXCEPTION(
01368               true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): "
01369               "Invalid send type.  We should never get here.  "
01370               "Please report this bug to the Tpetra developers.");
01371           }
01372 
01373           if (debug_) {
01374             std::ostringstream os;
01375             os << myImageID << ": doPosts(3,fast): "
01376                << "Posted send to Proc " << imagesTo_[i]
01377                << " w/ specified tag " << tag << endl;
01378             *out_ << os.str ();
01379           }
01380         }
01381         else { // "Sending" the message to myself
01382           selfNum = p;
01383         }
01384       }
01385 
01386       if (selfMessage_) {
01387         // This is how we "send a message to ourself": we copy from
01388         // the export buffer to the import buffer.  That saves
01389         // Teuchos::Comm implementations other than MpiComm (in
01390         // particular, SerialComm) the trouble of implementing self
01391         // messages correctly.  (To do this right, SerialComm would
01392         // need internal buffer space for messages, keyed on the
01393         // message's tag.)
01394         std::copy (exports.begin()+startsTo_[selfNum]*numPackets,
01395                    exports.begin()+startsTo_[selfNum]*numPackets+lengthsTo_[selfNum]*numPackets,
01396                    imports.begin()+selfReceiveOffset);
01397       }
01398       if (debug_) {
01399         std::ostringstream os;
01400         os << myImageID << ": doPosts(3,fast) done" << endl;
01401         *out_ << os.str ();
01402       }
01403     }
01404     else { // data are not blocked by image, use send buffer
01405       if (debug_) {
01406         std::ostringstream os;
01407         os << myImageID << ": doPosts(3,slow): posting sends" << endl;
01408         *out_ << os.str ();
01409       }
01410 
01411       // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking
01412       // sends), because the buffer is only long enough for one send.
01413       ArrayRCP<Packet> sendArray (maxSendLength_ * numPackets); // send buffer
01414 
01415       TEUCHOS_TEST_FOR_EXCEPTION(
01416         sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
01417         "Tpetra::Distributor::doPosts(3 args): The \"send buffer\" code path "
01418         << "doesn't currently work with nonblocking sends.");
01419 
01420       for (size_t i = 0; i < numBlocks; ++i) {
01421         size_t p = i + imageIndex;
01422         if (p > (numBlocks - 1)) {
01423           p -= numBlocks;
01424         }
01425 
01426         if (imagesTo_[p] != myImageID) {
01427           typename ArrayView<const Packet>::iterator srcBegin, srcEnd;
01428           size_t sendArrayOffset = 0;
01429           size_t j = startsTo_[p];
01430           for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
01431             srcBegin = exports.begin() + indicesTo_[j]*numPackets;
01432             srcEnd   = srcBegin + numPackets;
01433             std::copy (srcBegin, srcEnd, sendArray.begin()+sendArrayOffset);
01434             sendArrayOffset += numPackets;
01435           }
01436           ArrayView<const Packet> tmpSend =
01437             sendArray.view (0, lengthsTo_[p]*numPackets);
01438 
01439           if (sendType == Details::DISTRIBUTOR_SEND) {
01440             send<int, Packet> (tmpSend.getRawPtr (),
01441                                as<int> (tmpSend.size ()),
01442                                imagesTo_[p], tag, *comm_);
01443           }
01444           else if (sendType == Details::DISTRIBUTOR_ISEND) {
01445             ArrayRCP<const Packet> tmpSendBuf =
01446               sendArray.persistingView (0, lengthsTo_[p] * numPackets);
01447             requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p],
01448                                                      tag, *comm_));
01449           }
01450           else if (sendType == Details::DISTRIBUTOR_RSEND) {
01451             readySend<int, Packet> (tmpSend.getRawPtr (),
01452                                     as<int> (tmpSend.size ()),
01453                                     imagesTo_[p], tag, *comm_);
01454           }
01455           else if (sendType == Details::DISTRIBUTOR_SSEND) {
01456             ssend<int, Packet> (tmpSend.getRawPtr (),
01457                                 as<int> (tmpSend.size ()),
01458                                 imagesTo_[p], tag, *comm_);
01459           }
01460           else {
01461             TEUCHOS_TEST_FOR_EXCEPTION(
01462               true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): "
01463               "Invalid send type.  We should never get here.  "
01464               "Please report this bug to the Tpetra developers.");
01465           }
01466 
01467           if (debug_) {
01468             std::ostringstream os;
01469             os << myImageID << ": doPosts(3,slow): "
01470                << "Posted send to Proc " << imagesTo_[i]
01471                << " w/ specified tag " << tag << endl;
01472             *out_ << os.str ();
01473           }
01474         }
01475         else { // "Sending" the message to myself
01476           selfNum = p;
01477           selfIndex = startsTo_[p];
01478         }
01479       }
01480 
01481       if (selfMessage_) {
01482         for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
01483           std::copy (exports.begin()+indicesTo_[selfIndex]*numPackets,
01484                      exports.begin()+indicesTo_[selfIndex]*numPackets + numPackets,
01485                      imports.begin() + selfReceiveOffset);
01486           ++selfIndex;
01487           selfReceiveOffset += numPackets;
01488         }
01489       }
01490       if (debug_) {
01491         std::ostringstream os;
01492         os << myImageID << ": doPosts(3,slow) done" << endl;
01493         *out_ << os.str ();
01494       }
01495     }
01496   }
01497 
01498   template <class Packet>
01499   void Distributor::
01500   doPosts (const ArrayRCP<const Packet>& exports,
01501            const ArrayView<size_t>& numExportPacketsPerLID,
01502            const ArrayRCP<Packet>& imports,
01503            const ArrayView<size_t>& numImportPacketsPerLID)
01504   {
01505     using Teuchos::Array;
01506     using Teuchos::as;
01507     using Teuchos::ireceive;
01508     using Teuchos::isend;
01509     using Teuchos::readySend;
01510     using Teuchos::send;
01511     using Teuchos::ssend;
01512     using Teuchos::TypeNameTraits;
01513 #ifdef HAVE_TEUCHOS_DEBUG
01514     using Teuchos::OSTab;
01515 #endif // HAVE_TEUCHOS_DEBUG
01516     using std::endl;
01517     typedef Array<size_t>::size_type size_type;
01518 
01519     Teuchos::OSTab tab (out_);
01520 
01521 #ifdef TPETRA_DISTRIBUTOR_TIMERS
01522     Teuchos::TimeMonitor timeMon (*timer_doPosts4_);
01523 #endif // TPETRA_DISTRIBUTOR_TIMERS
01524 
01525     // Run-time configurable parameters that come from the input
01526     // ParameterList set by setParameterList().
01527     const Details::EDistributorSendType sendType = sendType_;
01528     const bool doBarrier = barrierBetween_;
01529 
01530 // #ifdef HAVE_TEUCHOS_DEBUG
01531 //     // Prepare for verbose output, if applicable.
01532 //     Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
01533 //     RCP<Teuchos::FancyOStream> out = this->getOStream ();
01534 //     const bool doPrint = out.get () && (comm_->getRank () == 0) &&
01535 //       includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
01536 
01537 //     if (doPrint) {
01538 //       // Only need one process to print out parameters.
01539 //       *out << "Distributor::doPosts (4 args)" << endl;
01540 //     }
01541 //     // Add one tab level.  We declare this outside the doPrint scopes
01542 //     // so that the tab persists until the end of this method.
01543 //     Teuchos::OSTab tab = this->getOSTab ();
01544 //     if (doPrint) {
01545 //       *out << "Parameters:" << endl;
01546 //       {
01547 //         OSTab tab2 (out);
01548 //         *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
01549 //              << endl << "barrierBetween: " << doBarrier << endl;
01550 //       }
01551 //     }
01552 // #endif // HAVE_TEUCHOS_DEBUG
01553 
01554     TEUCHOS_TEST_FOR_EXCEPTION(
01555       sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, std::logic_error,
01556       "Tpetra::Distributor::doPosts(4 args): Ready-send version requires a "
01557       "barrier between posting receives and posting ready sends.  This should "
01558       "have been checked before.  "
01559       "Please report this bug to the Tpetra developers.");
01560 
01561     const int myImageID = comm_->getRank ();
01562     size_t selfReceiveOffset = 0;
01563 
01564 #ifdef HAVE_TEUCHOS_DEBUG
01565     // Different messages may have different numbers of packets.
01566     size_t totalNumImportPackets = 0;
01567     for (int ii = 0; ii < numImportPacketsPerLID.size(); ++ii) {
01568       totalNumImportPackets += numImportPacketsPerLID[ii];
01569     }
01570     TEUCHOS_TEST_FOR_EXCEPTION(
01571       static_cast<size_t> (imports.size ()) < totalNumImportPackets,
01572       std::runtime_error, "Tpetra::Distributor::doPosts(4 args): The 'imports' "
01573       "array must have enough entries to hold the expected number of import "
01574       "packets.  imports.size() = " << imports.size() << " < "
01575       "totalNumImportPackets = " << totalNumImportPackets << ".");
01576 #endif // HAVE_TEUCHOS_DEBUG
01577 
01578     // MPI tag for nonblocking receives and blocking sends in this
01579     // method.  Some processes might take the "fast" path
01580     // (indicesTo_.empty()) and others might take the "slow" path for
01581     // the same doPosts() call, so the path tag must be the same for
01582     // both.
01583     const int pathTag = 1;
01584     const int tag = this->getTag (pathTag);
01585 
01586     if (debug_) {
01587       TEUCHOS_TEST_FOR_EXCEPTION(
01588         requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
01589         "doPosts(4 args): Process " << myImageID << ": requests_.size() = "
01590         << requests_.size () << " != 0.");
01591       std::ostringstream os;
01592       os << myImageID << ": doPosts(4,"
01593          << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
01594       *out_ << os.str ();
01595     }
01596 
01597     // Distributor uses requests_.size() as the number of outstanding
01598     // nonblocking message requests, so we resize to zero to maintain
01599     // this invariant.
01600     //
01601     // numReceives_ does _not_ include the self message, if there is
01602     // one.  Here, we do actually send a message to ourselves, so we
01603     // include any self message in the "actual" number of receives to
01604     // post.
01605     //
01606     // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
01607     // doesn't (re)allocate its array of requests.  That happens in
01608     // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
01609     // demand), or Resize_().
01610     const size_type actualNumReceives = as<size_type> (numReceives_) +
01611       as<size_type> (selfMessage_ ? 1 : 0);
01612     requests_.resize (0);
01613 
01614     // Post the nonblocking receives.  It's common MPI wisdom to post
01615     // receives before sends.  In MPI terms, this means favoring
01616     // adding to the "posted queue" (of receive requests) over adding
01617     // to the "unexpected queue" (of arrived messages not yet matched
01618     // with a receive).
01619     {
01620 #ifdef TPETRA_DISTRIBUTOR_TIMERS
01621       Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4_recvs_);
01622 #endif // TPETRA_DISTRIBUTOR_TIMERS
01623 
01624       size_t curBufferOffset = 0;
01625       size_t curLIDoffset = 0;
01626       for (size_type i = 0; i < actualNumReceives; ++i) {
01627         size_t totalPacketsFrom_i = 0;
01628         for (size_t j = 0; j < lengthsFrom_[i]; ++j) {
01629           totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
01630         }
01631         curLIDoffset += lengthsFrom_[i];
01632         if (imagesFrom_[i] != myImageID && totalPacketsFrom_i) {
01633           // If my process is receiving these packet(s) from another
01634           // process (not a self-receive), and if there is at least
01635           // one packet to receive:
01636           //
01637           // 1. Set up the persisting view (recvBuf) into the imports
01638           //    array, given the offset and size (total number of
01639           //    packets from process imagesFrom_[i]).
01640           // 2. Start the Irecv and save the resulting request.
01641           ArrayRCP<Packet> recvBuf =
01642             imports.persistingView (curBufferOffset, totalPacketsFrom_i);
01643           requests_.push_back (ireceive<int, Packet> (recvBuf, imagesFrom_[i],
01644                                                       tag, *comm_));
01645         }
01646         else { // Receiving these packet(s) from myself
01647           selfReceiveOffset = curBufferOffset; // Remember the offset
01648         }
01649         curBufferOffset += totalPacketsFrom_i;
01650       }
01651     }
01652 
01653     if (doBarrier) {
01654 #ifdef TPETRA_DISTRIBUTOR_TIMERS
01655       Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4_barrier_);
01656 #endif // TPETRA_DISTRIBUTOR_TIMERS
01657       // If we are using ready sends (MPI_Rsend) below, we need to do
01658       // a barrier before we post the ready sends.  This is because a
01659       // ready send requires that its matching receive has already
01660       // been posted before the send has been posted.  The only way to
01661       // guarantee that in this case is to use a barrier.
01662       comm_->barrier ();
01663     }
01664 
01665 #ifdef TPETRA_DISTRIBUTOR_TIMERS
01666     Teuchos::TimeMonitor timeMonSends (*timer_doPosts4_sends_);
01667 #endif // TPETRA_DISTRIBUTOR_TIMERS
01668 
01669     // setup arrays containing starting-offsets into exports for each send,
01670     // and num-packets-to-send for each send.
01671     Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0);
01672     size_t maxNumPackets = 0;
01673     size_t curPKToffset = 0;
01674     for (size_t pp=0; pp<numSends_; ++pp) {
01675       sendPacketOffsets[pp] = curPKToffset;
01676       size_t numPackets = 0;
01677       for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) {
01678         numPackets += numExportPacketsPerLID[j];
01679       }
01680       if (numPackets > maxNumPackets) maxNumPackets = numPackets;
01681       packetsPerSend[pp] = numPackets;
01682       curPKToffset += numPackets;
01683     }
01684 
01685     // setup scan through imagesTo_ list starting with higher numbered images
01686     // (should help balance message traffic)
01687     size_t numBlocks = numSends_+ selfMessage_;
01688     size_t imageIndex = 0;
01689     while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) {
01690       ++imageIndex;
01691     }
01692     if (imageIndex == numBlocks) {
01693       imageIndex = 0;
01694     }
01695 
01696     size_t selfNum = 0;
01697     size_t selfIndex = 0;
01698 
01699     if (indicesTo_.empty()) {
01700       if (debug_) {
01701         std::ostringstream os;
01702         os << myImageID << ": doPosts(4,fast): posting sends" << endl;
01703         *out_ << os.str ();
01704       }
01705 
01706       // Data are already blocked (laid out) by process, so we don't
01707       // need a separate send buffer (besides the exports array).
01708       for (size_t i = 0; i < numBlocks; ++i) {
01709         size_t p = i + imageIndex;
01710         if (p > (numBlocks - 1)) {
01711           p -= numBlocks;
01712         }
01713 
01714         if (imagesTo_[p] != myImageID && packetsPerSend[p] > 0) {
01715           ArrayView<const Packet> tmpSend =
01716             exports.view (sendPacketOffsets[p], packetsPerSend[p]);
01717 
01718           if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first
01719             send<int, Packet> (tmpSend.getRawPtr (),
01720                                as<int> (tmpSend.size ()),
01721                                imagesTo_[p], tag, *comm_);
01722           }
01723           else if (sendType == Details::DISTRIBUTOR_RSEND) {
01724             readySend<int, Packet> (tmpSend.getRawPtr (),
01725                                     as<int> (tmpSend.size ()),
01726                                     imagesTo_[p], tag, *comm_);
01727           }
01728           else if (sendType == Details::DISTRIBUTOR_ISEND) {
01729             ArrayRCP<const Packet> tmpSendBuf =
01730               exports.persistingView (sendPacketOffsets[p], packetsPerSend[p]);
01731             requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p],
01732                                                      tag, *comm_));
01733           }
01734           else if (sendType == Details::DISTRIBUTOR_SSEND) {
01735             ssend<int, Packet> (tmpSend.getRawPtr (),
01736                                 as<int> (tmpSend.size ()),
01737                                 imagesTo_[p], tag, *comm_);
01738           }
01739           else {
01740             TEUCHOS_TEST_FOR_EXCEPTION(
01741               true, std::logic_error, "Tpetra::Distributor::doPosts(4 args): "
01742               "Invalid send type.  We should never get here.  Please report "
01743               "this bug to the Tpetra developers.");
01744           }
01745         }
01746         else { // "Sending" the message to myself
01747           selfNum = p;
01748         }
01749       }
01750 
01751       if (selfMessage_) {
01752         std::copy (exports.begin()+sendPacketOffsets[selfNum],
01753                    exports.begin()+sendPacketOffsets[selfNum]+packetsPerSend[selfNum],
01754                    imports.begin()+selfReceiveOffset);
01755       }
01756       if (debug_) {
01757         std::ostringstream os;
01758         os << myImageID << ": doPosts(4,fast) done" << endl;
01759         *out_ << os.str ();
01760       }
01761     }
01762     else { // data are not blocked by image, use send buffer
01763       if (debug_) {
01764         std::ostringstream os;
01765         os << myImageID << ": doPosts(4,slow): posting sends" << endl;
01766         *out_ << os.str ();
01767       }
01768 
01769       // FIXME (mfh 05 Mar 2013) This may be broken for Isend.
01770       ArrayRCP<Packet> sendArray (maxNumPackets); // send buffer
01771 
01772       TEUCHOS_TEST_FOR_EXCEPTION(
01773         sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
01774         "Tpetra::Distributor::doPosts(3 args): The \"send buffer\" "
01775         "code path may not necessarily work with nonblocking sends.");
01776 
01777       Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0);
01778       size_t ioffset = 0;
01779       for (int j=0; j<numExportPacketsPerLID.size(); ++j) {
01780         indicesOffsets[j] = ioffset;
01781         ioffset += numExportPacketsPerLID[j];
01782       }
01783 
01784       for (size_t i = 0; i < numBlocks; ++i) {
01785         size_t p = i + imageIndex;
01786         if (p > (numBlocks - 1)) {
01787           p -= numBlocks;
01788         }
01789 
01790         if (imagesTo_[p] != myImageID) {
01791           typename ArrayView<const Packet>::iterator srcBegin, srcEnd;
01792           size_t sendArrayOffset = 0;
01793           size_t j = startsTo_[p];
01794           size_t numPacketsTo_p = 0;
01795           for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
01796             srcBegin = exports.begin() + indicesOffsets[j];
01797             srcEnd   = srcBegin + numExportPacketsPerLID[j];
01798             numPacketsTo_p += numExportPacketsPerLID[j];
01799             std::copy (srcBegin, srcEnd, sendArray.begin()+sendArrayOffset);
01800             sendArrayOffset += numExportPacketsPerLID[j];
01801           }
01802           if (numPacketsTo_p > 0) {
01803             ArrayView<const Packet> tmpSend =
01804               sendArray.view (0, numPacketsTo_p);
01805 
01806             if (sendType == Details::DISTRIBUTOR_RSEND) {
01807               readySend<int, Packet> (tmpSend.getRawPtr (),
01808                                       as<int> (tmpSend.size ()),
01809                                       imagesTo_[p], tag, *comm_);
01810             }
01811             else if (sendType == Details::DISTRIBUTOR_ISEND) {
01812               ArrayRCP<const Packet> tmpSendBuf =
01813                 sendArray.persistingView (0, numPacketsTo_p);
01814               requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p],
01815                                                        tag, *comm_));
01816             }
01817             else if (sendType == Details::DISTRIBUTOR_SSEND) {
01818               ssend<int, Packet> (tmpSend.getRawPtr (),
01819                                   as<int> (tmpSend.size ()),
01820                                   imagesTo_[p], tag, *comm_);
01821             }
01822             else { // if (sendType == Details::DISTRIBUTOR_SSEND)
01823               send<int, Packet> (tmpSend.getRawPtr (),
01824                                  as<int> (tmpSend.size ()),
01825                                  imagesTo_[p], tag, *comm_);
01826             }
01827           }
01828         }
01829         else { // "Sending" the message to myself
01830           selfNum = p;
01831           selfIndex = startsTo_[p];
01832         }
01833       }
01834 
01835       if (selfMessage_) {
01836         for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
01837           std::copy (exports.begin()+indicesOffsets[selfIndex],
01838                      exports.begin()+indicesOffsets[selfIndex]+numExportPacketsPerLID[selfIndex],
01839                      imports.begin() + selfReceiveOffset);
01840           selfReceiveOffset += numExportPacketsPerLID[selfIndex];
01841           ++selfIndex;
01842         }
01843       }
01844       if (debug_) {
01845         std::ostringstream os;
01846         os << myImageID << ": doPosts(4,slow) done" << endl;
01847         *out_ << os.str ();
01848       }
01849     }
01850   }
01851 
01852   template <class Packet>
01853   void Distributor::
01854   doReversePostsAndWaits (const ArrayView<const Packet>& exports,
01855                           size_t numPackets,
01856                           const ArrayView<Packet>& imports)
01857   {
01858     using Teuchos::as;
01859 
01860     // doReversePosts() takes exports and imports as ArrayRCPs,
01861     // requiring that the memory locations are persisting.  However,
01862     // they need only persist within the scope of that routine, so it
01863     // is safe for us to use nonpersisting references in this case.
01864 
01865     // mfh 04 Apr 2012: For some reason, calling arcp<const Packet>
01866     // for Packet=std::complex<T> (e.g., T=float) fails to compile
01867     // with some versions of GCC.  The issue only arises with the
01868     // exports array.  This is why we construct a separate nonowning
01869     // ArrayRCP.
01870     typedef typename ArrayRCP<const Packet>::size_type size_type;
01871     ArrayRCP<const Packet> exportsArcp (exports.getRawPtr(), as<size_type> (0),
01872                                         exports.size(), false);
01873     // mfh 04 Apr 2012: This is the offending code.  This statement
01874     // would normally be in place of "exportsArcp" in the
01875     // doReversePosts() call below.
01876     //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false)
01877     doReversePosts (exportsArcp,
01878                     numPackets,
01879                     arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false));
01880     doReverseWaits ();
01881 
01882     lastRoundBytesSend_ = exports.size() * sizeof(Packet);
01883     lastRoundBytesRecv_ = imports.size() * sizeof(Packet);
01884   }
01885 
01886   template <class Packet>
01887   void Distributor::
01888   doReversePostsAndWaits (const ArrayView<const Packet>& exports,
01889                           const ArrayView<size_t> &numExportPacketsPerLID,
01890                           const ArrayView<Packet> &imports,
01891                           const ArrayView<size_t> &numImportPacketsPerLID)
01892   {
01893     using Teuchos::as;
01894     using Teuchos::arcp;
01895     using Teuchos::ArrayRCP;
01896 
01897     TEUCHOS_TEST_FOR_EXCEPTION(
01898       requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
01899       "doReversePostsAndWaits(4 args): There are " << requests_.size ()
01900       << " outstanding nonblocking messages pending.  It is incorrect to call "
01901       "this method with posts outstanding.");
01902 
01903     // doReversePosts() accepts the exports and imports arrays as
01904     // ArrayRCPs, requiring that the memory location is persisting (as
01905     // is necessary for nonblocking receives).  However, it need only
01906     // persist until doReverseWaits() completes, so it is safe for us
01907     // to use a nonpersisting reference in this case.  The use of a
01908     // nonpersisting reference is purely a performance optimization.
01909 
01910     // mfh 02 Apr 2012: For some reason, calling arcp<const Packet>
01911     // for Packet=std::complex<double> fails to compile with some
01912     // versions of GCC.  The issue only arises with the exports array.
01913     // This is why we construct a separate nonowning ArrayRCP.
01914     typedef typename ArrayRCP<const Packet>::size_type size_type;
01915     ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (), as<size_type> (0),
01916                                         exports.size (), false);
01917     doReversePosts (exportsArcp,
01918                     numExportPacketsPerLID,
01919                     arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false),
01920                     numImportPacketsPerLID);
01921     doReverseWaits ();
01922 
01923     lastRoundBytesSend_ = exports.size() * sizeof(Packet);
01924     lastRoundBytesRecv_ = imports.size() * sizeof(Packet);
01925   }
01926 
01927   template <class Packet>
01928   void Distributor::
01929   doReversePosts (const ArrayRCP<const Packet>& exports,
01930                   size_t numPackets,
01931                   const ArrayRCP<Packet>& imports)
01932   {
01933     // FIXME (mfh 29 Mar 2012) WHY?
01934     TEUCHOS_TEST_FOR_EXCEPTION(
01935       ! indicesTo_.empty (), std::runtime_error,
01936       "Tpetra::Distributor::doReversePosts(3 args): Can only do reverse "
01937       "communication when original data are blocked by process.");
01938     if (reverseDistributor_.is_null ()) {
01939       createReverseDistributor ();
01940     }
01941     reverseDistributor_->doPosts (exports, numPackets, imports);
01942   }
01943 
01944   template <class Packet>
01945   void Distributor::
01946   doReversePosts (const ArrayRCP<const Packet>& exports,
01947                   const ArrayView<size_t>& numExportPacketsPerLID,
01948                   const ArrayRCP<Packet>& imports,
01949                   const ArrayView<size_t>& numImportPacketsPerLID)
01950   {
01951     // FIXME (mfh 29 Mar 2012) WHY?
01952     TEUCHOS_TEST_FOR_EXCEPTION(
01953       ! indicesTo_.empty (), std::runtime_error,
01954       "Tpetra::Distributor::doReversePosts(3 args): Can only do reverse "
01955       "communication when original data are blocked by process.");
01956     if (reverseDistributor_.is_null ()) {
01957       createReverseDistributor ();
01958     }
01959     reverseDistributor_->doPosts (exports, numExportPacketsPerLID,
01960                                   imports, numImportPacketsPerLID);
01961   }
01962 
01963 #if TPETRA_USE_KOKKOS_DISTOBJECT || defined(TPETRA_HAVE_KOKKOS_REFACTOR)
01964 
01965   template <class Packet, class Layout, class Device, class Mem>
01966   void Distributor::
01967   doPostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
01968                    size_t numPackets,
01969                    const Kokkos::View<Packet*, Layout, Device, Mem> &imports)
01970   {
01971     // If the MPI library doesn't support RDMA for communication
01972     // directly to or from the GPU's memory, we must transfer exports
01973     // to the host, do the communication, then transfer imports back
01974     // to the GPU.
01975     //
01976     // We need to do this here instead of doPosts() because the copy
01977     // back to the GPU must occur after the waits.
01978     typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view;
01979     typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view;
01980     if (! enable_cuda_rdma_ && ! exports_view::is_hostspace) {
01981       typename exports_view::HostMirror host_exports =
01982         Kokkos::create_mirror_view (exports);
01983       typename imports_view::HostMirror host_imports =
01984         Kokkos::create_mirror_view (imports);
01985       Kokkos::deep_copy (host_exports, exports);
01986       doPostsAndWaits (Kokkos::Compat::create_const_view (host_exports),
01987                        numPackets,
01988                        host_imports);
01989       Kokkos::deep_copy (imports, host_imports);
01990       return;
01991     }
01992 
01993     TEUCHOS_TEST_FOR_EXCEPTION(
01994       requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
01995       "doPostsAndWaits(3 args): There are " << requests_.size () <<
01996       " outstanding nonblocking messages pending.  It is incorrect to call "
01997       "this method with posts outstanding.");
01998 
01999     doPosts (exports, numPackets, imports);
02000     doWaits ();
02001   }
02002 
02003   template <class Packet, class Layout, class Device, class Mem>
02004   void Distributor::
02005   doPostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
02006                    const ArrayView<size_t> &numExportPacketsPerLID,
02007                    const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
02008                    const ArrayView<size_t> &numImportPacketsPerLID)
02009   {
02010     // If MPI library doesn't support RDMA for communication directly
02011     // to/from GPU, transfer exports to the host, do the communication, then
02012     // transfer imports back to the GPU
02013     //
02014     // We need to do this here instead of doPosts() because the copy back
02015     // to the GPU must occur after the waits.
02016     typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view;
02017     typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view;
02018     if (!enable_cuda_rdma_ && !exports_view::is_hostspace) {
02019       typename exports_view::HostMirror host_exports =
02020         Kokkos::create_mirror_view(exports);
02021       typename imports_view::HostMirror host_imports =
02022         Kokkos::create_mirror_view(imports);
02023       Kokkos::deep_copy (host_exports, exports);
02024       doPostsAndWaits(Kokkos::Compat::create_const_view(host_exports),
02025                       numExportPacketsPerLID,
02026                       host_imports,
02027                       numImportPacketsPerLID);
02028       Kokkos::deep_copy (imports, host_imports);
02029       return;
02030     }
02031 
02032     TEUCHOS_TEST_FOR_EXCEPTION(
02033       requests_.size () != 0, std::runtime_error,
02034       "Tpetra::Distributor::doPostsAndWaits(4 args): There are "
02035       << requests_.size () << " outstanding nonblocking messages pending.  "
02036       "It is incorrect to call this method with posts outstanding.");
02037 
02038     doPosts (exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
02039     doWaits ();
02040   }
02041 
02042 
02043   template <class Packet, class Layout, class Device, class Mem>
02044   void Distributor::
02045   doPosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
02046            size_t numPackets,
02047            const Kokkos::View<Packet*, Layout, Device, Mem> &imports)
02048   {
02049     using Teuchos::Array;
02050     using Teuchos::as;
02051     using Teuchos::FancyOStream;
02052     using Teuchos::includesVerbLevel;
02053     using Teuchos::ireceive;
02054     using Teuchos::isend;
02055     using Teuchos::OSTab;
02056     using Teuchos::readySend;
02057     using Teuchos::send;
02058     using Teuchos::ssend;
02059     using Teuchos::TypeNameTraits;
02060     using Teuchos::typeName;
02061     using std::endl;
02062     using Kokkos::Compat::create_const_view;
02063     using Kokkos::Compat::create_view;
02064     using Kokkos::Compat::subview_offset;
02065     using Kokkos::Compat::deep_copy_offset;
02066     typedef Array<size_t>::size_type size_type;
02067     typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view_type;
02068     typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view_type;
02069 
02070     Teuchos::OSTab tab (out_);
02071 
02072 #ifdef TPETRA_DISTRIBUTOR_TIMERS
02073     Teuchos::TimeMonitor timeMon (*timer_doPosts3_);
02074 #endif // TPETRA_DISTRIBUTOR_TIMERS
02075 
02076     // Run-time configurable parameters that come from the input
02077     // ParameterList set by setParameterList().
02078     const Details::EDistributorSendType sendType = sendType_;
02079     const bool doBarrier = barrierBetween_;
02080 
02081 // #ifdef HAVE_TEUCHOS_DEBUG
02082 //     // Prepare for verbose output, if applicable.
02083 //     Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
02084 //     (void) verbLevel; // Silence "unused variable" compiler warning.
02085 //     RCP<FancyOStream> out = this->getOStream ();
02086 //     // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
02087 //     //   includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
02088 //     const bool doPrint = out.get () && (comm_->getRank () == 0);
02089 
02090 //     if (doPrint) {
02091 //       // Only need one process to print out parameters.
02092 //       *out << "Distributor::doPosts (3 args)" << endl;
02093 //     }
02094 //     // Add one tab level.  We declare this outside the doPrint scopes
02095 //     // so that the tab persists until the end of this method.
02096 //     OSTab tab = this->getOSTab ();
02097 //     if (doPrint) {
02098 //       *out << "Parameters:" << endl;
02099 //       {
02100 //         OSTab tab2 (out);
02101 //         *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
02102 //              << endl << "barrierBetween: " << doBarrier << endl;
02103 //       }
02104 //     }
02105 // #endif // HAVE_TEUCHOS_DEBUG
02106 
02107     TEUCHOS_TEST_FOR_EXCEPTION(
02108       sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, std::logic_error,
02109       "Tpetra::Distributor::doPosts(3 args): Ready-send version requires a "
02110       "barrier between posting receives and posting ready sends.  This should "
02111       "have been checked before.  "
02112       "Please report this bug to the Tpetra developers.");
02113 
02114     const int myImageID = comm_->getRank ();
02115     size_t selfReceiveOffset = 0;
02116 
02117     // Each message has the same number of packets.
02118     const size_t totalNumImportPackets = totalReceiveLength_ * numPackets;
02119     TEUCHOS_TEST_FOR_EXCEPTION(
02120       static_cast<size_t> (imports.dimension_0 ()) < totalNumImportPackets,
02121       std::runtime_error, "Tpetra::Distributor::doPosts(3 args): The 'imports' "
02122       "array must have enough entries to hold the expected number of import "
02123       "packets.  imports.dimension_0() = " << imports.dimension_0 () << " < "
02124       "totalNumImportPackets = " << totalNumImportPackets << ".");
02125 
02126     // MPI tag for nonblocking receives and blocking sends in this
02127     // method.  Some processes might take the "fast" path
02128     // (indicesTo_.empty()) and others might take the "slow" path for
02129     // the same doPosts() call, so the path tag must be the same for
02130     // both.
02131     const int pathTag = 0;
02132     const int tag = this->getTag (pathTag);
02133 
02134     if (debug_) {
02135       TEUCHOS_TEST_FOR_EXCEPTION(
02136         requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
02137         "doPosts(3 args): Process " << myImageID << ": requests_.size() = "
02138         << requests_.size () << " != 0.");
02139       std::ostringstream os;
02140       os << myImageID << ": doPosts(3,"
02141          << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
02142       *out_ << os.str ();
02143     }
02144 
02145     // Distributor uses requests_.size() as the number of outstanding
02146     // nonblocking message requests, so we resize to zero to maintain
02147     // this invariant.
02148     //
02149     // numReceives_ does _not_ include the self message, if there is
02150     // one.  Here, we do actually send a message to ourselves, so we
02151     // include any self message in the "actual" number of receives to
02152     // post.
02153     //
02154     // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
02155     // doesn't (re)allocate its array of requests.  That happens in
02156     // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
02157     // demand), or Resize_().
02158     const size_type actualNumReceives = as<size_type> (numReceives_) +
02159       as<size_type> (selfMessage_ ? 1 : 0);
02160     requests_.resize (0);
02161 
02162     // Post the nonblocking receives.  It's common MPI wisdom to post
02163     // receives before sends.  In MPI terms, this means favoring
02164     // adding to the "posted queue" (of receive requests) over adding
02165     // to the "unexpected queue" (of arrived messages not yet matched
02166     // with a receive).
02167     {
02168 #ifdef TPETRA_DISTRIBUTOR_TIMERS
02169       Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3_recvs_);
02170 #endif // TPETRA_DISTRIBUTOR_TIMERS
02171 
02172       size_t curBufferOffset = 0;
02173       for (size_type i = 0; i < actualNumReceives; ++i) {
02174         if (imagesFrom_[i] != myImageID) {
02175           // If my process is receiving these packet(s) from another
02176           // process (not a self-receive):
02177           //
02178           // 1. Set up the persisting view (recvBuf) of the imports
02179           //    array, given the offset and size (total number of
02180           //    packets from process imagesFrom_[i]).
02181           // 2. Start the Irecv and save the resulting request.
02182           imports_view_type recvBuf =
02183             subview_offset (imports, curBufferOffset, lengthsFrom_[i]*numPackets);
02184           requests_.push_back (ireceive<int> (recvBuf, imagesFrom_[i],
02185                                               tag, *comm_));
02186           if (debug_) {
02187             std::ostringstream os;
02188             os << myImageID << ": doPosts(3,"
02189                << (indicesTo_.empty () ? "fast" : "slow") << "): "
02190                << "Posted irecv from Proc " << imagesFrom_[i] << " with "
02191               "specified tag " << tag << endl;
02192             *out_ << os.str ();
02193           }
02194         }
02195         else { // Receiving from myself
02196           selfReceiveOffset = curBufferOffset; // Remember the self-recv offset
02197         }
02198         curBufferOffset += lengthsFrom_[i]*numPackets;
02199       }
02200     }
02201 
02202     if (doBarrier) {
02203 #ifdef TPETRA_DISTRIBUTOR_TIMERS
02204       Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3_barrier_);
02205 #endif // TPETRA_DISTRIBUTOR_TIMERS
02206       // If we are using ready sends (MPI_Rsend) below, we need to do
02207       // a barrier before we post the ready sends.  This is because a
02208       // ready send requires that its matching receive has already
02209       // been posted before the send has been posted.  The only way to
02210       // guarantee that in this case is to use a barrier.
02211       comm_->barrier ();
02212     }
02213 
02214 #ifdef TPETRA_DISTRIBUTOR_TIMERS
02215     Teuchos::TimeMonitor timeMonSends (*timer_doPosts3_sends_);
02216 #endif // TPETRA_DISTRIBUTOR_TIMERS
02217 
02218     // setup scan through imagesTo_ list starting with higher numbered images
02219     // (should help balance message traffic)
02220     //
02221     // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
02222     // It doesn't depend on the input at all.
02223     size_t numBlocks = numSends_ + selfMessage_;
02224     size_t imageIndex = 0;
02225     while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) {
02226       ++imageIndex;
02227     }
02228     if (imageIndex == numBlocks) {
02229       imageIndex = 0;
02230     }
02231 
02232     size_t selfNum = 0;
02233     size_t selfIndex = 0;
02234 
02235     if (indicesTo_.empty()) {
02236       if (debug_) {
02237         std::ostringstream os;
02238         os << myImageID << ": doPosts(3,fast): posting sends" << endl;
02239         *out_ << os.str ();
02240       }
02241 
02242       // Data are already blocked (laid out) by process, so we don't
02243       // need a separate send buffer (besides the exports array).
02244       for (size_t i = 0; i < numBlocks; ++i) {
02245         size_t p = i + imageIndex;
02246         if (p > (numBlocks - 1)) {
02247           p -= numBlocks;
02248         }
02249 
02250         if (imagesTo_[p] != myImageID) {
02251           exports_view_type tmpSend = subview_offset(
02252             exports, startsTo_[p]*numPackets, lengthsTo_[p]*numPackets);
02253 
02254           if (sendType == Details::DISTRIBUTOR_SEND) {
02255             send<int> (tmpSend,
02256                        as<int> (tmpSend.size ()),
02257                        imagesTo_[p], tag, *comm_);
02258           }
02259           else if (sendType == Details::DISTRIBUTOR_ISEND) {
02260             exports_view_type tmpSendBuf =
02261               subview_offset (exports, startsTo_[p] * numPackets,
02262                               lengthsTo_[p] * numPackets);
02263             requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p],
02264                                              tag, *comm_));
02265           }
02266           else if (sendType == Details::DISTRIBUTOR_RSEND) {
02267             readySend<int> (tmpSend,
02268                             as<int> (tmpSend.size ()),
02269                             imagesTo_[p], tag, *comm_);
02270           }
02271           else if (sendType == Details::DISTRIBUTOR_SSEND) {
02272             ssend<int> (tmpSend,
02273                         as<int> (tmpSend.size ()),
02274                         imagesTo_[p], tag, *comm_);
02275           } else {
02276             TEUCHOS_TEST_FOR_EXCEPTION(
02277               true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): "
02278               "Invalid send type.  We should never get here.  "
02279               "Please report this bug to the Tpetra developers.");
02280           }
02281 
02282           if (debug_) {
02283             std::ostringstream os;
02284             os << myImageID << ": doPosts(3,fast): "
02285                << "Posted send to Proc " << imagesTo_[i]
02286                << " w/ specified tag " << tag << endl;
02287             *out_ << os.str ();
02288           }
02289         }
02290         else { // "Sending" the message to myself
02291           selfNum = p;
02292         }
02293       }
02294 
02295       if (selfMessage_) {
02296         // This is how we "send a message to ourself": we copy from
02297         // the export buffer to the import buffer.  That saves
02298         // Teuchos::Comm implementations other than MpiComm (in
02299         // particular, SerialComm) the trouble of implementing self
02300         // messages correctly.  (To do this right, SerialComm would
02301         // need internal buffer space for messages, keyed on the
02302         // message's tag.)
02303         deep_copy_offset(imports, exports, selfReceiveOffset,
02304                          startsTo_[selfNum]*numPackets,
02305                          lengthsTo_[selfNum]*numPackets);
02306       }
02307       if (debug_) {
02308         std::ostringstream os;
02309         os << myImageID << ": doPosts(3,fast) done" << endl;
02310         *out_ << os.str ();
02311       }
02312     }
02313     else { // data are not blocked by image, use send buffer
02314       if (debug_) {
02315         std::ostringstream os;
02316         os << myImageID << ": doPosts(3,slow): posting sends" << endl;
02317         *out_ << os.str ();
02318       }
02319 
02320       // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking
02321       // sends), because the buffer is only long enough for one send.
02322       Kokkos::View<Packet*,Layout,Device,Mem> sendArray =
02323         create_view<Packet,Layout,Device,Mem> ("sendArray",
02324                                                maxSendLength_ * numPackets);
02325 
02326       TEUCHOS_TEST_FOR_EXCEPTION(
02327         sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
02328         "Tpetra::Distributor::doPosts(3 args): The \"send buffer\" code path "
02329         "doesn't currently work with nonblocking sends.");
02330 
02331       for (size_t i = 0; i < numBlocks; ++i) {
02332         size_t p = i + imageIndex;
02333         if (p > (numBlocks - 1)) {
02334           p -= numBlocks;
02335         }
02336 
02337         if (imagesTo_[p] != myImageID) {
02338           size_t sendArrayOffset = 0;
02339           size_t j = startsTo_[p];
02340           for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
02341             deep_copy_offset(sendArray, exports, sendArrayOffset,
02342                              indicesTo_[j]*numPackets, numPackets);
02343             sendArrayOffset += numPackets;
02344           }
02345           Kokkos::View<Packet*, Layout, Device, Mem> tmpSend =
02346             subview_offset(sendArray, size_t(0), lengthsTo_[p]*numPackets);
02347 
02348           if (sendType == Details::DISTRIBUTOR_SEND) {
02349             send<int> (tmpSend,
02350                        as<int> (tmpSend.size ()),
02351                        imagesTo_[p], tag, *comm_);
02352           }
02353           else if (sendType == Details::DISTRIBUTOR_ISEND) {
02354             exports_view_type tmpSendBuf =
02355               subview_offset (sendArray, size_t(0), lengthsTo_[p] * numPackets);
02356             requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p],
02357                                              tag, *comm_));
02358           }
02359           else if (sendType == Details::DISTRIBUTOR_RSEND) {
02360             readySend<int> (tmpSend,
02361                             as<int> (tmpSend.size ()),
02362                             imagesTo_[p], tag, *comm_);
02363           }
02364           else if (sendType == Details::DISTRIBUTOR_SSEND) {
02365             ssend<int> (tmpSend,
02366                         as<int> (tmpSend.size ()),
02367                         imagesTo_[p], tag, *comm_);
02368           }
02369           else {
02370             TEUCHOS_TEST_FOR_EXCEPTION(
02371               true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): "
02372               "Invalid send type.  We should never get here.  "
02373               "Please report this bug to the Tpetra developers.");
02374           }
02375 
02376           if (debug_) {
02377             std::ostringstream os;
02378             os << myImageID << ": doPosts(3,slow): "
02379                << "Posted send to Proc " << imagesTo_[i]
02380                << " w/ specified tag " << tag << endl;
02381             *out_ << os.str ();
02382           }
02383         }
02384         else { // "Sending" the message to myself
02385           selfNum = p;
02386           selfIndex = startsTo_[p];
02387         }
02388       }
02389 
02390       if (selfMessage_) {
02391         for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
02392           deep_copy_offset(imports, exports, selfReceiveOffset,
02393                            indicesTo_[selfIndex]*numPackets, numPackets);
02394           ++selfIndex;
02395           selfReceiveOffset += numPackets;
02396         }
02397       }
02398       if (debug_) {
02399         std::ostringstream os;
02400         os << myImageID << ": doPosts(3,slow) done" << endl;
02401         *out_ << os.str ();
02402       }
02403     }
02404   }
02405 
02406   template <class Packet, class Layout, class Device, class Mem>
02407   void Distributor::
02408   doPosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
02409            const ArrayView<size_t> &numExportPacketsPerLID,
02410            const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
02411            const ArrayView<size_t> &numImportPacketsPerLID)
02412   {
02413     using Teuchos::Array;
02414     using Teuchos::as;
02415     using Teuchos::ireceive;
02416     using Teuchos::isend;
02417     using Teuchos::readySend;
02418     using Teuchos::send;
02419     using Teuchos::ssend;
02420     using Teuchos::TypeNameTraits;
02421 #ifdef HAVE_TEUCHOS_DEBUG
02422     using Teuchos::OSTab;
02423 #endif // HAVE_TEUCHOS_DEBUG
02424     using std::endl;
02425     using Kokkos::Compat::create_const_view;
02426     using Kokkos::Compat::create_view;
02427     using Kokkos::Compat::subview_offset;
02428     using Kokkos::Compat::deep_copy_offset;
02429     typedef Array<size_t>::size_type size_type;
02430     typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view_type;
02431     typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view_type;
02432 
02433     Teuchos::OSTab tab (out_);
02434 
02435 #ifdef TPETRA_DISTRIBUTOR_TIMERS
02436     Teuchos::TimeMonitor timeMon (*timer_doPosts4_);
02437 #endif // TPETRA_DISTRIBUTOR_TIMERS
02438 
02439     // Run-time configurable parameters that come from the input
02440     // ParameterList set by setParameterList().
02441     const Details::EDistributorSendType sendType = sendType_;
02442     const bool doBarrier = barrierBetween_;
02443 
02444 // #ifdef HAVE_TEUCHOS_DEBUG
02445 //     // Prepare for verbose output, if applicable.
02446 //     Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
02447 //     RCP<Teuchos::FancyOStream> out = this->getOStream ();
02448 //     const bool doPrint = out.get () && (comm_->getRank () == 0) &&
02449 //       includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
02450 
02451 //     if (doPrint) {
02452 //       // Only need one process to print out parameters.
02453 //       *out << "Distributor::doPosts (4 args)" << endl;
02454 //     }
02455 //     // Add one tab level.  We declare this outside the doPrint scopes
02456 //     // so that the tab persists until the end of this method.
02457 //     Teuchos::OSTab tab = this->getOSTab ();
02458 //     if (doPrint) {
02459 //       *out << "Parameters:" << endl;
02460 //       {
02461 //         OSTab tab2 (out);
02462 //         *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
02463 //              << endl << "barrierBetween: " << doBarrier << endl;
02464 //       }
02465 //     }
02466 // #endif // HAVE_TEUCHOS_DEBUG
02467 
02468     TEUCHOS_TEST_FOR_EXCEPTION(
02469       sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
02470       std::logic_error, "Tpetra::Distributor::doPosts(4 args): Ready-send "
02471       "version requires a barrier between posting receives and posting ready "
02472       "sends.  This should have been checked before.  "
02473       "Please report this bug to the Tpetra developers.");
02474 
02475     const int myImageID = comm_->getRank ();
02476     size_t selfReceiveOffset = 0;
02477 
02478 #ifdef HAVE_TEUCHOS_DEBUG
02479     // Different messages may have different numbers of packets.
02480     size_t totalNumImportPackets = 0;
02481     for (size_type ii = 0; ii < numImportPacketsPerLID.size (); ++ii) {
02482       totalNumImportPackets += numImportPacketsPerLID[ii];
02483     }
02484     TEUCHOS_TEST_FOR_EXCEPTION(
02485       imports.dimension_0 () < totalNumImportPackets, std::runtime_error,
02486       "Tpetra::Distributor::doPosts(4 args): The 'imports' array must have "
02487       "enough entries to hold the expected number of import packets.  "
02488       "imports.dimension_0() = " << imports.dimension_0 () << " < "
02489       "totalNumImportPackets = " << totalNumImportPackets << ".");
02490 #endif // HAVE_TEUCHOS_DEBUG
02491 
02492     // MPI tag for nonblocking receives and blocking sends in this
02493     // method.  Some processes might take the "fast" path
02494     // (indicesTo_.empty()) and others might take the "slow" path for
02495     // the same doPosts() call, so the path tag must be the same for
02496     // both.
02497     const int pathTag = 1;
02498     const int tag = this->getTag (pathTag);
02499 
02500     if (debug_) {
02501       TEUCHOS_TEST_FOR_EXCEPTION(
02502         requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
02503         "doPosts(4 args): Process " << myImageID << ": requests_.size () = "
02504         << requests_.size () << " != 0.");
02505       std::ostringstream os;
02506       os << myImageID << ": doPosts(4,"
02507          << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
02508       *out_ << os.str ();
02509     }
02510 
02511     // Distributor uses requests_.size() as the number of outstanding
02512     // nonblocking message requests, so we resize to zero to maintain
02513     // this invariant.
02514     //
02515     // numReceives_ does _not_ include the self message, if there is
02516     // one.  Here, we do actually send a message to ourselves, so we
02517     // include any self message in the "actual" number of receives to
02518     // post.
02519     //
02520     // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
02521     // doesn't (re)allocate its array of requests.  That happens in
02522     // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
02523     // demand), or Resize_().
02524     const size_type actualNumReceives = as<size_type> (numReceives_) +
02525       as<size_type> (selfMessage_ ? 1 : 0);
02526     requests_.resize (0);
02527 
02528     // Post the nonblocking receives.  It's common MPI wisdom to post
02529     // receives before sends.  In MPI terms, this means favoring
02530     // adding to the "posted queue" (of receive requests) over adding
02531     // to the "unexpected queue" (of arrived messages not yet matched
02532     // with a receive).
02533     {
02534 #ifdef TPETRA_DISTRIBUTOR_TIMERS
02535       Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4_recvs_);
02536 #endif // TPETRA_DISTRIBUTOR_TIMERS
02537 
02538       size_t curBufferOffset = 0;
02539       size_t curLIDoffset = 0;
02540       for (size_type i = 0; i < actualNumReceives; ++i) {
02541         size_t totalPacketsFrom_i = 0;
02542         for (size_t j = 0; j < lengthsFrom_[i]; ++j) {
02543           totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
02544         }
02545         curLIDoffset += lengthsFrom_[i];
02546         if (imagesFrom_[i] != myImageID && totalPacketsFrom_i) {
02547           // If my process is receiving these packet(s) from another
02548           // process (not a self-receive), and if there is at least
02549           // one packet to receive:
02550           //
02551           // 1. Set up the persisting view (recvBuf) into the imports
02552           //    array, given the offset and size (total number of
02553           //    packets from process imagesFrom_[i]).
02554           // 2. Start the Irecv and save the resulting request.
02555           imports_view_type recvBuf =
02556             subview_offset (imports, curBufferOffset, totalPacketsFrom_i);
02557           requests_.push_back (ireceive<int> (recvBuf, imagesFrom_[i],
02558                                               tag, *comm_));
02559         }
02560         else { // Receiving these packet(s) from myself
02561           selfReceiveOffset = curBufferOffset; // Remember the offset
02562         }
02563         curBufferOffset += totalPacketsFrom_i;
02564       }
02565     }
02566 
02567     if (doBarrier) {
02568 #ifdef TPETRA_DISTRIBUTOR_TIMERS
02569       Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4_barrier_);
02570 #endif // TPETRA_DISTRIBUTOR_TIMERS
02571       // If we are using ready sends (MPI_Rsend) below, we need to do
02572       // a barrier before we post the ready sends.  This is because a
02573       // ready send requires that its matching receive has already
02574       // been posted before the send has been posted.  The only way to
02575       // guarantee that in this case is to use a barrier.
02576       comm_->barrier ();
02577     }
02578 
02579 #ifdef TPETRA_DISTRIBUTOR_TIMERS
02580     Teuchos::TimeMonitor timeMonSends (*timer_doPosts4_sends_);
02581 #endif // TPETRA_DISTRIBUTOR_TIMERS
02582 
02583     // setup arrays containing starting-offsets into exports for each send,
02584     // and num-packets-to-send for each send.
02585     Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0);
02586     size_t maxNumPackets = 0;
02587     size_t curPKToffset = 0;
02588     for (size_t pp=0; pp<numSends_; ++pp) {
02589       sendPacketOffsets[pp] = curPKToffset;
02590       size_t numPackets = 0;
02591       for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) {
02592         numPackets += numExportPacketsPerLID[j];
02593       }
02594       if (numPackets > maxNumPackets) maxNumPackets = numPackets;
02595       packetsPerSend[pp] = numPackets;
02596       curPKToffset += numPackets;
02597     }
02598 
02599     // setup scan through imagesTo_ list starting with higher numbered images
02600     // (should help balance message traffic)
02601     size_t numBlocks = numSends_+ selfMessage_;
02602     size_t imageIndex = 0;
02603     while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) {
02604       ++imageIndex;
02605     }
02606     if (imageIndex == numBlocks) {
02607       imageIndex = 0;
02608     }
02609 
02610     size_t selfNum = 0;
02611     size_t selfIndex = 0;
02612 
02613     if (indicesTo_.empty()) {
02614       if (debug_) {
02615         std::ostringstream os;
02616         os << myImageID << ": doPosts(4,fast): posting sends" << endl;
02617         *out_ << os.str ();
02618       }
02619 
02620       // Data are already blocked (laid out) by process, so we don't
02621       // need a separate send buffer (besides the exports array).
02622       for (size_t i = 0; i < numBlocks; ++i) {
02623         size_t p = i + imageIndex;
02624         if (p > (numBlocks - 1)) {
02625           p -= numBlocks;
02626         }
02627 
02628         if (imagesTo_[p] != myImageID && packetsPerSend[p] > 0) {
02629           exports_view_type tmpSend =
02630             subview_offset(exports, sendPacketOffsets[p], packetsPerSend[p]);
02631 
02632           if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first
02633             send<int> (tmpSend,
02634                        as<int> (tmpSend.size ()),
02635                        imagesTo_[p], tag, *comm_);
02636           }
02637           else if (sendType == Details::DISTRIBUTOR_RSEND) {
02638             readySend<int> (tmpSend,
02639                             as<int> (tmpSend.size ()),
02640                             imagesTo_[p], tag, *comm_);
02641           }
02642           else if (sendType == Details::DISTRIBUTOR_ISEND) {
02643             exports_view_type tmpSendBuf =
02644               subview_offset (exports, sendPacketOffsets[p], packetsPerSend[p]);
02645             requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p],
02646                                              tag, *comm_));
02647           }
02648           else if (sendType == Details::DISTRIBUTOR_SSEND) {
02649             ssend<int> (tmpSend,
02650                         as<int> (tmpSend.size ()),
02651                         imagesTo_[p], tag, *comm_);
02652           }
02653           else {
02654             TEUCHOS_TEST_FOR_EXCEPTION(
02655               true, std::logic_error, "Tpetra::Distributor::doPosts(4 args): "
02656               "Invalid send type.  We should never get here.  "
02657               "Please report this bug to the Tpetra developers.");
02658           }
02659         }
02660         else { // "Sending" the message to myself
02661           selfNum = p;
02662         }
02663       }
02664 
02665       if (selfMessage_) {
02666         deep_copy_offset(imports, exports, selfReceiveOffset,
02667                          sendPacketOffsets[selfNum], packetsPerSend[selfNum]);
02668       }
02669       if (debug_) {
02670         std::ostringstream os;
02671         os << myImageID << ": doPosts(4,fast) done" << endl;
02672         *out_ << os.str ();
02673       }
02674     }
02675     else { // data are not blocked by image, use send buffer
02676       if (debug_) {
02677         std::ostringstream os;
02678         os << myImageID << ": doPosts(4,slow): posting sends" << endl;
02679         *out_ << os.str ();
02680       }
02681 
02682       // FIXME (mfh 05 Mar 2013) This may be broken for Isend.
02683       Kokkos::View<Packet*,Layout,Device,Mem> sendArray =
02684         create_view<Packet,Layout,Device,Mem>("sendArray", maxNumPackets); // send buffer
02685 
02686       TEUCHOS_TEST_FOR_EXCEPTION(
02687         sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
02688         "Tpetra::Distributor::doPosts(3 args): The \"send buffer\" code path "
02689         "may not necessarily work with nonblocking sends.");
02690 
02691       Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0);
02692       size_t ioffset = 0;
02693       for (int j=0; j<numExportPacketsPerLID.size(); ++j) {
02694         indicesOffsets[j] = ioffset;
02695         ioffset += numExportPacketsPerLID[j];
02696       }
02697 
02698       for (size_t i = 0; i < numBlocks; ++i) {
02699         size_t p = i + imageIndex;
02700         if (p > (numBlocks - 1)) {
02701           p -= numBlocks;
02702         }
02703 
02704         if (imagesTo_[p] != myImageID) {
02705           size_t sendArrayOffset = 0;
02706           size_t j = startsTo_[p];
02707           size_t numPacketsTo_p = 0;
02708           for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
02709             deep_copy_offset(sendArray, exports, sendArrayOffset,
02710                              indicesOffsets[j], numExportPacketsPerLID[j]);
02711             sendArrayOffset += numExportPacketsPerLID[j];
02712           }
02713           if (numPacketsTo_p > 0) {
02714             Kokkos::View<Packet*, Layout, Device, Mem> tmpSend =
02715               subview_offset(sendArray, size_t(0), numPacketsTo_p);
02716 
02717             if (sendType == Details::DISTRIBUTOR_RSEND) {
02718               readySend<int> (tmpSend,
02719                               as<int> (tmpSend.size ()),
02720                               imagesTo_[p], tag, *comm_);
02721             }
02722             else if (sendType == Details::DISTRIBUTOR_ISEND) {
02723               exports_view_type tmpSendBuf =
02724                 subview_offset (sendArray, size_t(0), numPacketsTo_p);
02725               requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p],
02726                                                tag, *comm_));
02727             }
02728             else if (sendType == Details::DISTRIBUTOR_SSEND) {
02729               ssend<int> (tmpSend,
02730                           as<int> (tmpSend.size ()),
02731                           imagesTo_[p], tag, *comm_);
02732             }
02733             else { // if (sendType == Details::DISTRIBUTOR_SSEND)
02734               send<int> (tmpSend,
02735                          as<int> (tmpSend.size ()),
02736                          imagesTo_[p], tag, *comm_);
02737             }
02738           }
02739         }
02740         else { // "Sending" the message to myself
02741           selfNum = p;
02742           selfIndex = startsTo_[p];
02743         }
02744       }
02745 
02746       if (selfMessage_) {
02747         for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
02748           deep_copy_offset(imports, exports, selfReceiveOffset,
02749                            indicesOffsets[selfIndex],
02750                            numExportPacketsPerLID[selfIndex]);
02751           selfReceiveOffset += numExportPacketsPerLID[selfIndex];
02752           ++selfIndex;
02753         }
02754       }
02755       if (debug_) {
02756         std::ostringstream os;
02757         os << myImageID << ": doPosts(4,slow) done" << endl;
02758         *out_ << os.str ();
02759       }
02760     }
02761   }
02762 
02763   template <class Packet, class Layout, class Device, class Mem>
02764   void Distributor::
02765   doReversePostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
02766                           size_t numPackets,
02767                           const Kokkos::View<Packet*, Layout, Device, Mem> &imports)
02768   {
02769     // If MPI library doesn't support RDMA for communication directly
02770     // to/from GPU, transfer exports to the host, do the communication, then
02771     // transfer imports back to the GPU
02772     //
02773     // We need to do this here instead of doPosts() because the copy back
02774     // to the GPU must occur after the waits.
02775     typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view;
02776     typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view;
02777     if (!enable_cuda_rdma_ && !exports_view::is_hostspace) {
02778       typename exports_view::HostMirror host_exports =
02779         Kokkos::create_mirror_view(exports);
02780       typename imports_view::HostMirror host_imports =
02781         Kokkos::create_mirror_view(imports);
02782       Kokkos::deep_copy (host_exports, exports);
02783       doPostsAndWaits(Kokkos::Compat::create_const_view(host_exports),
02784                       numPackets,
02785                       host_imports);
02786       Kokkos::deep_copy (imports, host_imports);
02787       return;
02788     }
02789 
02790     doReversePosts (exports, numPackets, imports);
02791     doReverseWaits ();
02792   }
02793 
02794   template <class Packet, class Layout, class Device, class Mem>
02795   void Distributor::
02796   doReversePostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
02797                           const ArrayView<size_t> &numExportPacketsPerLID,
02798                           const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
02799                           const ArrayView<size_t> &numImportPacketsPerLID)
02800   {
02801     // If MPI library doesn't support RDMA for communication directly
02802     // to/from GPU, transfer exports to the host, do the communication, then
02803     // transfer imports back to the GPU
02804     //
02805     // We need to do this here instead of doPosts() because the copy back
02806     // to the GPU must occur after the waits.
02807     typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view;
02808     typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view;
02809     if (!enable_cuda_rdma_ && !exports_view::is_hostspace) {
02810       typename exports_view::HostMirror host_exports =
02811         Kokkos::create_mirror_view(exports);
02812       typename imports_view::HostMirror host_imports =
02813         Kokkos::create_mirror_view(imports);
02814       Kokkos::deep_copy (host_exports, exports);
02815       doPostsAndWaits(Kokkos::Compat::create_const_view(host_exports),
02816                       numExportPacketsPerLID,
02817                       host_imports,
02818                       numImportPacketsPerLID);
02819       Kokkos::deep_copy (imports, host_imports);
02820       return;
02821     }
02822 
02823     TEUCHOS_TEST_FOR_EXCEPTION(requests_.size() != 0, std::runtime_error,
02824       "Tpetra::Distributor::doReversePostsAndWaits(4 args): There are "
02825       << requests_.size() << " outstanding nonblocking messages pending.  It "
02826       "is incorrect to call this method with posts outstanding.");
02827 
02828     doReversePosts (exports, numExportPacketsPerLID, imports,
02829                     numImportPacketsPerLID);
02830     doReverseWaits ();
02831   }
02832 
02833   template <class Packet, class Layout, class Device, class Mem>
02834   void Distributor::
02835   doReversePosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
02836                   size_t numPackets,
02837                   const  Kokkos::View<Packet*, Layout, Device, Mem> &imports)
02838   {
02839     // FIXME (mfh 29 Mar 2012) WHY?
02840     TEUCHOS_TEST_FOR_EXCEPTION(
02841       ! indicesTo_.empty (), std::runtime_error,
02842       "Tpetra::Distributor::doReversePosts(3 args): Can only do "
02843       "reverse communication when original data are blocked by process.");
02844     if (reverseDistributor_.is_null ()) {
02845       createReverseDistributor ();
02846     }
02847     reverseDistributor_->doPosts (exports, numPackets, imports);
02848   }
02849 
02850   template <class Packet, class Layout, class Device, class Mem>
02851   void Distributor::
02852   doReversePosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
02853                   const ArrayView<size_t> &numExportPacketsPerLID,
02854                   const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
02855                   const ArrayView<size_t> &numImportPacketsPerLID)
02856   {
02857     // FIXME (mfh 29 Mar 2012) WHY?
02858     TEUCHOS_TEST_FOR_EXCEPTION(
02859       ! indicesTo_.empty (), std::runtime_error,
02860       "Tpetra::Distributor::doReversePosts(3 args): Can only do "
02861       "reverse communication when original data are blocked by process.");
02862     if (reverseDistributor_.is_null ()) {
02863       createReverseDistributor ();
02864     }
02865     reverseDistributor_->doPosts (exports, numExportPacketsPerLID,
02866                                   imports, numImportPacketsPerLID);
02867   }
02868 
02869 #endif
02870 
  // Given the global IDs (importIDs) that this process will receive,
  // and the ranks they come from (importNodeIDs), compute the mirror
  // information: the global IDs this process must send (exportIDs)
  // and the ranks to which to send them (exportNodeIDs).  This works
  // by sending each (GID, myRank) pair to the process that owns the
  // GID, through a temporary Distributor built with createFromSends.
  template <class OrdinalType>
  void Distributor::
  computeSends (const ArrayView<const OrdinalType> & importIDs,
                const ArrayView<const int> & importNodeIDs,
                Array<OrdinalType> & exportIDs,
                Array<int> & exportNodeIDs)
  {
    // NOTE (mfh 19 Apr 2012): There was a note on this code saying:
    // "assumes that size_t >= Ordinal".  The code certainly does
    // assume that sizeof(size_t) >= sizeof(OrdinalType) as well as
    // sizeof(size_t) >= sizeof(int).  This is because it casts the
    // OrdinalType elements of importIDs (along with their
    // corresponding process IDs, as int) to size_t, and does a
    // doPostsAndWaits<size_t>() to send the packed data.
    using std::endl;
    typedef typename ArrayView<const OrdinalType>::size_type size_type;

    Teuchos::OSTab tab (out_);
    const int myRank = comm_->getRank ();
    if (debug_) {
      std::ostringstream os;
      os << myRank << ": computeSends" << endl;
      *out_ << os.str ();
    }

    // importIDs and importNodeIDs are parallel arrays; they must have
    // the same length.
    TEUCHOS_TEST_FOR_EXCEPTION(
      importIDs.size () != importNodeIDs.size (), std::invalid_argument,
      "Tpetra::Distributor::computeSends: On Process " << myRank << ": "
      "importNodeIDs.size() = " << importNodeIDs.size ()
      << " != importIDs.size() = " << importIDs.size () << ".");

    const size_type numImports = importNodeIDs.size ();
    Array<size_t> importObjs (2*numImports);
    // Pack pairs (importIDs[i], my process ID) to send into importObjs.
    for (size_type i = 0; i < numImports; ++i) {
      importObjs[2*i]   = static_cast<size_t> (importIDs[i]);
      importObjs[2*i+1] = static_cast<size_t> (myRank);
    }
    //
    // Use a temporary Distributor to send the (importIDs[i], myRank)
    // pairs to importNodeIDs[i].
    //
    Distributor tempPlan (comm_, out_);
    if (debug_) {
      std::ostringstream os;
      os << myRank << ": computeSends: tempPlan.createFromSends" << endl;
      *out_ << os.str ();
    }

    // mfh 20 Mar 2014: An extra-cautious cast from unsigned to
    // signed, in order to forestall any possible causes for Bug 6069.
    const size_t numExportsAsSizeT = tempPlan.createFromSends (importNodeIDs);
    const size_type numExports = static_cast<size_type> (numExportsAsSizeT);
    TEUCHOS_TEST_FOR_EXCEPTION(
      numExports < 0, std::logic_error, "Tpetra::Distributor::computeSends: "
      "tempPlan.createFromSends() returned numExports = " << numExportsAsSizeT
      << " as a size_t, which overflows to " << numExports << " when cast to "
      << Teuchos::TypeNameTraits<size_type>::name () << ".  "
      "Please report this bug to the Tpetra developers.");
    TEUCHOS_TEST_FOR_EXCEPTION(
      static_cast<size_type> (tempPlan.getTotalReceiveLength ()) != numExports,
      std::logic_error, "Tpetra::Distributor::computeSends: tempPlan.getTotal"
      "ReceiveLength() = " << tempPlan.getTotalReceiveLength () << " != num"
      "Exports = " << numExports  << ".  Please report this bug to the "
      "Tpetra developers.");

    if (numExports > 0) {
      exportIDs.resize (numExports);
      exportNodeIDs.resize (numExports);
    }

    // exportObjs: Packed receive buffer.  (exportObjs[2*i],
    // exportObjs[2*i+1]) will give the (GID, PID) pair for export i,
    // after tempPlan.doPostsAndWaits(...) finishes below.
    //
    // FIXME (mfh 19 Mar 2014) This only works if OrdinalType fits in
    // size_t.  This issue might come up, for example, on a 32-bit
    // machine using 64-bit global indices.  I will add a check here
    // for that case.
    TEUCHOS_TEST_FOR_EXCEPTION(
      sizeof (size_t) < sizeof (OrdinalType), std::logic_error,
      "Tpetra::Distributor::computeSends: sizeof(size_t) = " << sizeof(size_t)
      << " < sizeof(" << Teuchos::TypeNameTraits<OrdinalType>::name () << ") = "
      << sizeof (OrdinalType) << ".  This violates an assumption of the "
      "method.  It's not hard to work around (just use Array<OrdinalType> as "
      "the export buffer, not Array<size_t>), but we haven't done that yet.  "
      "Please report this bug to the Tpetra developers.");

    // NOTE(review): this check looks implied by the equality test
    // above; it is presumably kept as cheap defensive redundancy.
    TEUCHOS_TEST_FOR_EXCEPTION(
      tempPlan.getTotalReceiveLength () < static_cast<size_t> (numExports),
      std::logic_error,
      "Tpetra::Distributor::computeSends: tempPlan.getTotalReceiveLength() = "
      << tempPlan.getTotalReceiveLength() << " < numExports = " << numExports
      << ".  Please report this bug to the Tpetra developers.");

    Array<size_t> exportObjs (tempPlan.getTotalReceiveLength () * 2);
    if (debug_) {
      std::ostringstream os;
      os << myRank << ": computeSends: tempPlan.doPostsAndWaits" << endl;
      *out_ << os.str ();
    }
    // Each LID carries exactly 2 packets: (GID, sending rank).
    tempPlan.doPostsAndWaits<size_t> (importObjs (), 2, exportObjs ());

    // Unpack received (GID, PID) pairs into exportIDs resp. exportNodeIDs.
    for (size_type i = 0; i < numExports; ++i) {
      exportIDs[i] = static_cast<OrdinalType> (exportObjs[2*i]);
      exportNodeIDs[i] = static_cast<int> (exportObjs[2*i+1]);
    }

    if (debug_) {
      std::ostringstream os;
      os << myRank << ": computeSends done" << endl;
      *out_ << os.str ();
    }
  }
02986 
  // Set up this Distributor from the *receive* side: given the remote
  // global IDs this process wants (remoteIDs) and the ranks that own
  // them (remoteImageIDs), compute the corresponding send lists
  // (exportGIDs, exportNodeIDs) via computeSends(), then initialize
  // the forward plan with createFromSends().
  template <class OrdinalType>
  void Distributor::
  createFromRecvs (const ArrayView<const OrdinalType> &remoteIDs,
                   const ArrayView<const int> &remoteImageIDs,
                   Array<OrdinalType> &exportGIDs,
                   Array<int> &exportNodeIDs)
  {
    using std::endl;

    Teuchos::OSTab tab (out_);
    const int myRank = comm_->getRank();

    if (debug_) {
      *out_ << myRank << ": createFromRecvs" << endl;
    }

#ifdef HAVE_TPETRA_DEBUG
    using Teuchos::outArg;
    using Teuchos::reduceAll;

    // In debug mode, first test locally, then do an all-reduce to
    // make sure that all processes passed.
    const int errProc =
      (remoteIDs.size () != remoteImageIDs.size ()) ? myRank : -1;
    int maxErrProc = -1;
    reduceAll<int, int> (*comm_, Teuchos::REDUCE_MAX, errProc, outArg (maxErrProc));
    TEUCHOS_TEST_FOR_EXCEPTION(maxErrProc != -1, std::runtime_error,
      Teuchos::typeName (*this) << "::createFromRecvs(): lists of remote IDs "
      "and remote process IDs must have the same size on all participating "
      "processes.  Maximum process ID with error: " << maxErrProc << ".");
#else // NOT HAVE_TPETRA_DEBUG

    // In non-debug mode, just test locally.
    TEUCHOS_TEST_FOR_EXCEPTION(
      remoteIDs.size () != remoteImageIDs.size (), std::invalid_argument,
      Teuchos::typeName (*this) << "::createFromRecvs<" <<
      Teuchos::TypeNameTraits<OrdinalType>::name () << ">(): On Process " <<
      myRank << ": remoteIDs.size() = " << remoteIDs.size () << " != "
      "remoteImageIDs.size() = " << remoteImageIDs.size () << ".");
#endif // HAVE_TPETRA_DEBUG

    // Fill exportGIDs / exportNodeIDs with the (GID, rank) pairs that
    // this process must send, by exchanging with the owning processes.
    computeSends (remoteIDs, remoteImageIDs, exportGIDs, exportNodeIDs);

    const size_t numProcsSendingToMe = createFromSends (exportNodeIDs ());

    if (debug_) {
      // NOTE (mfh 20 Mar 2014) If remoteImageIDs could contain
      // duplicates, then its length might not be the right check here,
      // even if we account for selfMessage_.  selfMessage_ is set in
      // createFromSends.
      // NOTE(review): this diagnostic goes to std::cerr, unlike the
      // other debug output in this method, which goes to *out_.
      std::ostringstream os;
      os << "Proc " << myRank << ": {numProcsSendingToMe: "
         << numProcsSendingToMe << ", remoteImageIDs.size(): "
         << remoteImageIDs.size () << ", selfMessage_: "
         << (selfMessage_ ? "true" : "false") << "}" << std::endl;
      std::cerr << os.str ();
    }

    if (debug_) {
      *out_ << myRank << ": createFromRecvs done" << endl;
    }

    // Record how this Distributor was initialized, for diagnostics.
    howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS;
  }
03051 
03052 } // namespace Tpetra
03053 
03054 #endif // TPETRA_DISTRIBUTOR_HPP
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines