Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_CrsMatrix_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_CRSMATRIX_DEF_HPP
11#define TPETRA_CRSMATRIX_DEF_HPP
12
20
23#include "Tpetra_RowMatrix.hpp"
24#include "Tpetra_LocalCrsMatrixOperator.hpp"
25
32#include "Tpetra_Details_getDiagCopyWithoutOffsets.hpp"
40#include "Tpetra_Details_packCrsMatrix.hpp"
41#include "Tpetra_Details_unpackCrsMatrixAndCombine.hpp"
43#include "Teuchos_FancyOStream.hpp"
44#include "Teuchos_RCP.hpp"
45#include "Teuchos_DataAccess.hpp"
46#include "Teuchos_SerialDenseMatrix.hpp" // unused here, could delete
47#include "KokkosBlas1_scal.hpp"
48#include "KokkosSparse_getDiagCopy.hpp"
49#include "KokkosSparse_spmv.hpp"
50
51#include <memory>
52#include <sstream>
53#include <typeinfo>
54#include <utility>
55#include <vector>
56
57namespace Tpetra {
58
59namespace { // (anonymous)
60
// Atomically replace *dest with f(*dest, inputVal) using a
// compare-and-exchange retry loop.
//
// dest:     location to update.
// inputVal: second argument passed to f on every attempt.
// f:        binary function; called as f(currentValue, inputVal).  It may
//           be called more than once if the exchange has to be retried.
//
// Returns the last value handed back by Kokkos::atomic_compare_exchange,
// which at loop exit compares equal to the value that f was applied to.
// NOTE(review): this presumes atomic_compare_exchange returns the value
// previously stored at dest -- confirm against the Kokkos documentation.
61 template<class T, class BinaryFunction>
62 T atomic_binary_function_update (volatile T* const dest,
63 const T& inputVal,
64 BinaryFunction f)
65 {
66 T oldVal = *dest;
67 T assume;
68
69 // NOTE (mfh 30 Nov 2015) I do NOT need a fence here for IBM
70 // POWER architectures, because 'newVal' depends on 'assume',
71 // which depends on 'oldVal', which depends on '*dest'. This
72 // sets up a chain of read dependencies that should ensure
73 // correct behavior given a sane memory model.
74 do {
// Snapshot the value we believe is stored, compute the replacement
// from it, and try to install the replacement.
75 assume = oldVal;
76 T newVal = f (assume, inputVal);
77 oldVal = Kokkos::atomic_compare_exchange (dest, assume, newVal);
// Retry while the value handed back differs from our snapshot.
78 } while (assume != oldVal);
79
80 return oldVal;
81 }
82} // namespace (anonymous)
83
84//
85// Users must never rely on anything in the Details namespace.
86//
87namespace Details {
88
98template<class Scalar>
99struct AbsMax {
101 Scalar operator() (const Scalar& x, const Scalar& y) {
102 typedef Teuchos::ScalarTraits<Scalar> STS;
103 return std::max (STS::magnitude (x), STS::magnitude (y));
104 }
105};
106
107} // namespace Details
108} // namespace Tpetra
109
110namespace Tpetra {
111
// Constructor: row Map plus a single per-row entry count.
//
// Creates a new crs_graph_type from (rowMap, maxNumEntriesPerRow, params),
// stores it in both myGraph_ and staticGraph_, and then calls
// resumeFill(params).  If the graph constructor throws a std::exception,
// its message is reported through TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
// as a std::runtime_error.
// NOTE(review): the embedded listing numbers jump from 137 to 139, so a
// statement before the closing brace may be missing from this listing.
112 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
113 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
114 CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
115 size_t maxNumEntriesPerRow,
116 const Teuchos::RCP<Teuchos::ParameterList>& params) :
117 dist_object_type (rowMap)
118 {
// Not referenced by name below; presumably consumed by the
// TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro -- TODO confirm.
119 const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, size_t "
120 "[, RCP<ParameterList>]): ";
121 Teuchos::RCP<crs_graph_type> graph;
122 try {
123 graph = Teuchos::rcp (new crs_graph_type (rowMap, maxNumEntriesPerRow,
124 params));
125 }
126 catch (std::exception& e) {
127 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
128 (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
129 "size_t [, RCP<ParameterList>]) threw an exception: "
130 << e.what ());
131 }
132 // myGraph_ not null means that the matrix owns the graph. That's
133 // different than the const CrsGraph constructor, where the matrix
134 // does _not_ own the graph.
135 myGraph_ = graph;
136 staticGraph_ = myGraph_;
137 resumeFill (params);
139 }
140
// Constructor: row Map plus an array of per-row entry counts.
//
// Creates a new crs_graph_type from (rowMap, numEntPerRowToAlloc, params),
// stores it in both myGraph_ and staticGraph_, and then calls
// resumeFill(params).  A std::exception from the graph constructor is
// reported through TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC as a
// std::runtime_error carrying the original what() text.
// NOTE(review): the embedded listing numbers jump from 168 to 170, so a
// statement before the closing brace may be missing from this listing.
141 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
142 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
143 CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
144 const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
145 const Teuchos::RCP<Teuchos::ParameterList>& params) :
146 dist_object_type (rowMap)
147 {
// Not referenced by name below; presumably consumed by the
// TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro -- TODO confirm.
148 const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
149 "ArrayView<const size_t>[, RCP<ParameterList>]): ";
150 Teuchos::RCP<crs_graph_type> graph;
151 try {
152 using Teuchos::rcp;
153 graph = rcp(new crs_graph_type(rowMap, numEntPerRowToAlloc,
154 params));
155 }
156 catch (std::exception& e) {
157 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
158 (true, std::runtime_error, "CrsGraph constructor "
159 "(RCP<const Map>, ArrayView<const size_t>"
160 "[, RCP<ParameterList>]) threw an exception: "
161 << e.what ());
162 }
163 // myGraph_ not null means that the matrix owns the graph. That's
164 // different than the const CrsGraph constructor, where the matrix
165 // does _not_ own the graph.
166 myGraph_ = graph;
167 staticGraph_ = graph;
168 resumeFill (params);
170 }
171
// Constructor: row Map, column Map, and a single per-row entry count.
//
// First asserts (std::logic_error) that staticGraph_ and myGraph_ are
// still null, then creates a new crs_graph_type from
// (rowMap, colMap, maxNumEntPerRow, params), stores it in both myGraph_
// and staticGraph_, and calls resumeFill(params).  A std::exception from
// the graph constructor is reported as a std::runtime_error.
// NOTE(review): the embedded listing numbers jump from 211 to 213, so a
// statement before the closing brace may be missing from this listing.
172 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
173 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
174 CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
175 const Teuchos::RCP<const map_type>& colMap,
176 const size_t maxNumEntPerRow,
177 const Teuchos::RCP<Teuchos::ParameterList>& params) :
178 dist_object_type (rowMap)
179 {
// Not referenced by name below; presumably consumed by the
// TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro -- TODO confirm.
180 const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
181 "RCP<const Map>, size_t[, RCP<ParameterList>]): ";
// Appended to the messages of the two internal-consistency checks below.
182 const char suffix[] =
183 " Please report this bug to the Tpetra developers.";
184
185 // An artifact of debugging something a while back.
186 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
187 (! staticGraph_.is_null (), std::logic_error,
188 "staticGraph_ is not null at the beginning of the constructor."
189 << suffix);
190 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
191 (! myGraph_.is_null (), std::logic_error,
192 "myGraph_ is not null at the beginning of the constructor."
193 << suffix);
194 Teuchos::RCP<crs_graph_type> graph;
195 try {
196 graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
197 maxNumEntPerRow,
198 params));
199 }
200 catch (std::exception& e) {
201 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
202 (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
203 "RCP<const Map>, size_t[, RCP<ParameterList>]) threw an "
204 "exception: " << e.what ());
205 }
206 // myGraph_ not null means that the matrix owns the graph. That's
207 // different than the const CrsGraph constructor, where the matrix
208 // does _not_ own the graph.
209 myGraph_ = graph;
210 staticGraph_ = myGraph_;
211 resumeFill (params);
213 }
214
// Constructor: row Map, column Map, and an array of per-row entry counts.
//
// Creates a new crs_graph_type from
// (rowMap, colMap, numEntPerRowToAlloc, params), stores it in both
// myGraph_ and staticGraph_, and calls resumeFill(params).  A
// std::exception from the graph constructor is reported as a
// std::runtime_error carrying the original what() text.
// NOTE(review): the embedded listing numbers jump from 243 to 245, so a
// statement before the closing brace may be missing from this listing.
215 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
216 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
217 CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
218 const Teuchos::RCP<const map_type>& colMap,
219 const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
220 const Teuchos::RCP<Teuchos::ParameterList>& params) :
221 dist_object_type (rowMap)
222 {
// Not referenced by name below; presumably consumed by the
// TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro -- TODO confirm.
223 const char tfecfFuncName[] =
224 "CrsMatrix(RCP<const Map>, RCP<const Map>, "
225 "ArrayView<const size_t>[, RCP<ParameterList>]): ";
226 Teuchos::RCP<crs_graph_type> graph;
227 try {
228 graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
229 numEntPerRowToAlloc,
230 params));
231 }
232 catch (std::exception& e) {
233 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
234 (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
235 "RCP<const Map>, ArrayView<const size_t>[, "
236 "RCP<ParameterList>]) threw an exception: " << e.what ());
237 }
238 // myGraph_ not null means that the matrix owns the graph. That's
239 // different than the const CrsGraph constructor, where the matrix
240 // does _not_ own the graph.
241 myGraph_ = graph;
242 staticGraph_ = graph;
243 resumeFill (params);
245 }
246
247
// Constructor: existing (const) graph.
//
// Stores the given graph in staticGraph_ only (myGraph_ is not assigned
// here), sets storageStatus_ to Details::STORAGE_1D_PACKED, and allocates
// a values view with graph->lclIndsPacked_wdv.extent(0) entries that is
// used for both valuesPacked_wdv and valuesUnpacked_wdv.  The
// ParameterList argument is unnamed and unused.
//
// Reports std::runtime_error if the graph is null or not fill complete.
// When Details::Behavior::verbose("CrsMatrix") is true, progress messages
// are written to std::cerr.
//
// NOTE(review): the member-initializer list evaluates graph->getRowMap()
// before the body's graph.is_null() check runs, so a null graph is
// dereferenced before that check can report it.
// NOTE(review): the embedded listing numbers skip 297, so a statement
// after the values assignment may be missing from this listing.
248 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
249 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
250 CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
251 const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
252 dist_object_type (graph->getRowMap ()),
253 staticGraph_ (graph),
254 storageStatus_ (Details::STORAGE_1D_PACKED)
255 {
256 using std::endl;
257 typedef typename local_matrix_device_type::values_type values_type;
258 const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>[, "
259 "RCP<ParameterList>]): ";
260 const bool verbose = Details::Behavior::verbose("CrsMatrix");
261
// prefix is only created (and only dereferenced) when verbose is true.
262 std::unique_ptr<std::string> prefix;
263 if (verbose) {
264 prefix = this->createPrefix("CrsMatrix", "CrsMatrix(graph,params)");
265 std::ostringstream os;
266 os << *prefix << "Start" << endl;
267 std::cerr << os.str ();
268 }
269
270 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
271 (graph.is_null (), std::runtime_error, "Input graph is null.");
272 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
273 (! graph->isFillComplete (), std::runtime_error, "Input graph "
274 "is not fill complete. You must call fillComplete on the "
275 "graph before using it to construct a CrsMatrix. Note that "
276 "calling resumeFill on the graph makes it not fill complete, "
277 "even if you had previously called fillComplete. In that "
278 "case, you must call fillComplete on the graph again.");
279
280 // The graph is fill complete, so it is locally indexed and has a
281 // fixed structure. This means we can allocate the (1-D) array of
282 // values and build the local matrix right now. Note that the
283 // local matrix's number of columns comes from the column Map, not
284 // the domain Map.
285
// One value slot per packed local column index of the graph.
286 const size_t numEnt = graph->lclIndsPacked_wdv.extent (0);
287 if (verbose) {
288 std::ostringstream os;
289 os << *prefix << "Allocate values: " << numEnt << endl;
290 std::cerr << os.str ();
291 }
292
293 values_type val ("Tpetra::CrsMatrix::values", numEnt);
294 valuesPacked_wdv = values_wdv_type(val);
295 valuesUnpacked_wdv = valuesPacked_wdv;
296
298
299 if (verbose) {
300 std::ostringstream os;
301 os << *prefix << "Done" << endl;
302 std::cerr << os.str ();
303 }
304 }
305
// Constructor: another matrix's value storage combined with a given
// (const) graph.
//
// Stores the graph in staticGraph_, then builds valuesPacked_wdv and
// valuesUnpacked_wdv from the corresponding members of `matrix`, passing
// offset 0 and a length taken from graph->lclIndsPacked_wdv.extent(0) and
// graph->lclIndsUnpacked_wdv.extent(0) respectively.
// NOTE(review): presumably the (wdv, offset, count) form of
// values_wdv_type creates a sub-view sharing storage with `matrix` --
// confirm against its definition.
//
// Reports std::runtime_error if the graph is null or not fill complete.
// `params` is not used in the visible body.
//
// NOTE(review): tfecfFuncName describes a
// (RCP<const CrsGraph>, values_type[, params]) signature, but this
// constructor takes (CrsMatrix&, RCP<const CrsGraph>, params); the text
// looks copy-pasted from another constructor.
// NOTE(review): graph->getRowMap() in the initializer list is evaluated
// before the body's graph.is_null() check.
// NOTE(review): the embedded listing numbers skip 313 and 334.  The
// initializer list ends with a trailing comma, so at least one
// initializer is missing from this listing.
306 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
307 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
308 CrsMatrix(CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& matrix,
309 const Teuchos::RCP<const crs_graph_type>& graph,
310 const Teuchos::RCP<Teuchos::ParameterList>& params) :
311 dist_object_type (graph->getRowMap ()),
312 staticGraph_ (graph),
314 {
315 const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
316 "local_matrix_device_type::values_type, "
317 "[,RCP<ParameterList>]): ";
318 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
319 (graph.is_null (), std::runtime_error, "Input graph is null.");
320 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
321 (! graph->isFillComplete (), std::runtime_error, "Input graph "
322 "is not fill complete. You must call fillComplete on the "
323 "graph before using it to construct a CrsMatrix. Note that "
324 "calling resumeFill on the graph makes it not fill complete, "
325 "even if you had previously called fillComplete. In that "
326 "case, you must call fillComplete on the graph again.");
327
328 size_t numValuesPacked = graph->lclIndsPacked_wdv.extent(0);
329 valuesPacked_wdv = values_wdv_type(matrix.valuesPacked_wdv, 0, numValuesPacked);
330
331 size_t numValuesUnpacked = graph->lclIndsUnpacked_wdv.extent(0);
332 valuesUnpacked_wdv = values_wdv_type(matrix.valuesUnpacked_wdv, 0, numValuesUnpacked);
333
335 }
336
337
// Constructor: existing (const) graph plus a caller-supplied values view.
//
// Stores the graph in staticGraph_ only, sets storageStatus_ to
// Details::STORAGE_1D_PACKED, and wraps `values` in valuesPacked_wdv,
// which valuesUnpacked_wdv is then assigned from.  The ParameterList
// argument is unnamed and unused.
//
// Reports std::runtime_error if the graph is null or not fill complete.
// No check that the length of `values` matches the graph is visible here.
//
// NOTE(review): graph->getRowMap() in the initializer list is evaluated
// before the body's graph.is_null() check.
// NOTE(review): the embedded listing numbers skip 375, so a statement
// before the closing brace may be missing from this listing.
338 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
339 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
340 CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
341 const typename local_matrix_device_type::values_type& values,
342 const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
343 dist_object_type (graph->getRowMap ()),
344 staticGraph_ (graph),
345 storageStatus_ (Details::STORAGE_1D_PACKED)
346 {
347 const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
348 "local_matrix_device_type::values_type, "
349 "[,RCP<ParameterList>]): ";
350 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
351 (graph.is_null (), std::runtime_error, "Input graph is null.");
352 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
353 (! graph->isFillComplete (), std::runtime_error, "Input graph "
354 "is not fill complete. You must call fillComplete on the "
355 "graph before using it to construct a CrsMatrix. Note that "
356 "calling resumeFill on the graph makes it not fill complete, "
357 "even if you had previously called fillComplete. In that "
358 "case, you must call fillComplete on the graph again.");
359
360 // The graph is fill complete, so it is locally indexed and has a
361 // fixed structure. This means we can allocate the (1-D) array of
362 // values and build the local matrix right now. Note that the
363 // local matrix's number of columns comes from the column Map, not
364 // the domain Map.
365
366 valuesPacked_wdv = values_wdv_type(values);
367 valuesUnpacked_wdv = valuesPacked_wdv;
368
369 // FIXME (22 Jun 2016) I would very much like to get rid of
370 // k_values1D_ at some point. I find it confusing to have all
371 // these extra references lying around.
372 // KDDKDD ALMOST THERE, MARK!
373// k_values1D_ = valuesUnpacked_wdv.getDeviceView(Access::ReadWrite);
374
376 }
377
// Constructor: row Map, column Map, and the three CRS arrays (row
// offsets, column indices, values) given as views.
//
// Steps, in order:
//  1. Reports std::invalid_argument if values.extent(0) differs from
//     columnIndices.extent(0).
//  2. Only when Details::Behavior::debug("CrsMatrix") is true and
//     rowPointers is nonempty: reads the last entry of rowPointers with
//     getEntryOnHost and reports std::invalid_argument if it does not
//     equal the lengths of columnIndices and values.
//  3. Creates a crs_graph_type from
//     (rowMap, colMap, rowPointers, columnIndices, params); a
//     std::exception from that constructor is reported as
//     std::runtime_error.
//  4. Reports std::logic_error if the new graph's local graph does not
//     have the same extents as the inputs.
//  5. Stores the graph in myGraph_ and staticGraph_ and wraps `values`
//     in valuesPacked_wdv / valuesUnpacked_wdv.
//
// When Details::Behavior::verbose("CrsMatrix") is true, "Start" and
// "Done" messages are written to std::cerr.
// NOTE(review): the embedded listing numbers skip 481, so a statement
// before the final verbose block may be missing from this listing.
378 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
379 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
380 CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
381 const Teuchos::RCP<const map_type>& colMap,
382 const typename local_graph_device_type::row_map_type& rowPointers,
383 const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
384 const typename local_matrix_device_type::values_type& values,
385 const Teuchos::RCP<Teuchos::ParameterList>& params) :
386 dist_object_type (rowMap),
387 storageStatus_ (Details::STORAGE_1D_PACKED)
388 {
389 using Details::getEntryOnHost;
390 using Teuchos::RCP;
391 using std::endl;
392 const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
393 "RCP<const Map>, ptr, ind, val[, params]): ";
394 const char suffix[] =
395 ". Please report this bug to the Tpetra developers.";
396 const bool debug = Details::Behavior::debug("CrsMatrix");
397 const bool verbose = Details::Behavior::verbose("CrsMatrix");
398
// prefix is only created (and only dereferenced) when verbose is true.
399 std::unique_ptr<std::string> prefix;
400 if (verbose) {
401 prefix = this->createPrefix(
402 "CrsMatrix", "CrsMatrix(rowMap,colMap,ptr,ind,val[,params])");
403 std::ostringstream os;
404 os << *prefix << "Start" << endl;
405 std::cerr << os.str ();
406 }
407
408 // Check the user's input. Note that this might throw only on
409 // some processes but not others, causing deadlock. We prefer
410 // deadlock due to exceptions to segfaults, because users can
411 // catch exceptions.
412 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
413 (values.extent(0) != columnIndices.extent(0),
414 std::invalid_argument, "values.extent(0)=" << values.extent(0)
415 << " != columnIndices.extent(0) = " << columnIndices.extent(0)
416 << ".");
// Debug-only check: the last row offset must equal the entry count.
417 if (debug && rowPointers.extent(0) != 0) {
418 const size_t numEnt =
419 getEntryOnHost(rowPointers, rowPointers.extent(0) - 1);
420 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
421 (numEnt != size_t(columnIndices.extent(0)) ||
422 numEnt != size_t(values.extent(0)),
423 std::invalid_argument, "Last entry of rowPointers says that "
424 "the matrix has " << numEnt << " entr"
425 << (numEnt != 1 ? "ies" : "y") << ", but the dimensions of "
426 "columnIndices and values don't match this. "
427 "columnIndices.extent(0)=" << columnIndices.extent (0)
428 << " and values.extent(0)=" << values.extent (0) << ".");
429 }
430
431 RCP<crs_graph_type> graph;
432 try {
433 graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, rowPointers,
434 columnIndices, params));
435 }
436 catch (std::exception& e) {
437 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
438 (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
439 "RCP<const Map>, ptr, ind[, params]) threw an exception: "
440 << e.what ());
441 }
442 // The newly created CrsGraph _must_ have a local graph at this
443 // point. We don't really care whether CrsGraph's constructor
444 // deep-copies or shallow-copies the input, but the dimensions
445 // have to be right. That's how we tell whether the CrsGraph has
446 // a local graph.
447 auto lclGraph = graph->getLocalGraphDevice ();
448 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
449 (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
450 lclGraph.entries.extent (0) != columnIndices.extent (0),
451 std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, "
452 "ind[, params]) did not set the local graph correctly." << suffix);
453 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
454 (lclGraph.entries.extent (0) != values.extent (0),
455 std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, ind[, "
456 "params]) did not set the local graph correctly. "
457 "lclGraph.entries.extent(0) = " << lclGraph.entries.extent (0)
458 << " != values.extent(0) = " << values.extent (0) << suffix);
459
460 // myGraph_ not null means that the matrix owns the graph. This
461 // is true because the column indices come in as nonconst,
462 // implying shared ownership.
463 myGraph_ = graph;
464 staticGraph_ = graph;
465
466 // The graph may not be fill complete yet. However, it is locally
467 // indexed (since we have a column Map) and has a fixed structure
468 // (due to the input arrays). This means we can allocate the
469 // (1-D) array of values and build the local matrix right now.
470 // Note that the local matrix's number of columns comes from the
471 // column Map, not the domain Map.
472
473 valuesPacked_wdv = values_wdv_type(values);
474 valuesUnpacked_wdv = valuesPacked_wdv;
475
476 // FIXME (22 Jun 2016) I would very much like to get rid of
477 // k_values1D_ at some point. I find it confusing to have all
478 // these extra references lying around.
479// this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
480
482 if (verbose) {
483 std::ostringstream os;
484 os << *prefix << "Done" << endl;
485 std::cerr << os.str();
486 }
487 }
488
// Constructor: row Map, column Map, and the three CRS arrays given as
// Teuchos::ArrayRCP.
//
// Creates a crs_graph_type from (rowMap, colMap, ptr, ind, params) and
// stores it in myGraph_ and staticGraph_.  Reports std::logic_error if
// the graph's local graph does not have the same sizes as ptr and ind.
// The values are reinterpreted as impl_scalar_type and passed through
// getKokkosViewDeepCopy<device_type>; the resulting view backs both
// valuesPacked_wdv and valuesUnpacked_wdv.
//
// A std::exception from the graph constructor is reported as
// std::runtime_error.  No check of val.size() against ind.size() is
// visible here.
// NOTE(review): the embedded listing numbers skip 554, so a statement
// before the closing brace may be missing from this listing.
489 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
490 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
491 CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
492 const Teuchos::RCP<const map_type>& colMap,
493 const Teuchos::ArrayRCP<size_t>& ptr,
494 const Teuchos::ArrayRCP<LocalOrdinal>& ind,
495 const Teuchos::ArrayRCP<Scalar>& val,
496 const Teuchos::RCP<Teuchos::ParameterList>& params) :
497 dist_object_type (rowMap),
498 storageStatus_ (Details::STORAGE_1D_PACKED)
499 {
500 using Kokkos::Compat::getKokkosViewDeepCopy;
501 using Teuchos::av_reinterpret_cast;
502 using Teuchos::RCP;
503 using values_type = typename local_matrix_device_type::values_type;
504 using IST = impl_scalar_type;
505 const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
506 "RCP<const Map>, ptr, ind, val[, params]): ";
507
508 RCP<crs_graph_type> graph;
509 try {
510 graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, ptr,
511 ind, params));
512 }
513 catch (std::exception& e) {
514 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
515 (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
516 "RCP<const Map>, ArrayRCP<size_t>, ArrayRCP<LocalOrdinal>[, "
517 "RCP<ParameterList>]) threw an exception: " << e.what ());
518 }
519 // myGraph_ not null means that the matrix owns the graph. This
520 // is true because the column indices come in as nonconst,
521 // implying shared ownership.
522 myGraph_ = graph;
523 staticGraph_ = graph;
524
525 // The graph may not be fill complete yet. However, it is locally
526 // indexed (since we have a column Map) and has a fixed structure
527 // (due to the input arrays). This means we can allocate the
528 // (1-D) array of values and build the local matrix right now.
529 // Note that the local matrix's number of columns comes from the
530 // column Map, not the domain Map.
531
532 // The graph _must_ have a local graph at this point. We don't
533 // really care whether CrsGraph's constructor deep-copies or
534 // shallow-copies the input, but the dimensions have to be right.
535 // That's how we tell whether the CrsGraph has a local graph.
536 auto lclGraph = staticGraph_->getLocalGraphDevice ();
537 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
538 (size_t (lclGraph.row_map.extent (0)) != size_t (ptr.size ()) ||
539 size_t (lclGraph.entries.extent (0)) != size_t (ind.size ()),
540 std::logic_error, "CrsGraph's constructor (rowMap, colMap, "
541 "ptr, ind[, params]) did not set the local graph correctly. "
542 "Please report this bug to the Tpetra developers.");
543
// Copy the caller's values into a view of impl_scalar_type.
544 values_type valIn =
545 getKokkosViewDeepCopy<device_type> (av_reinterpret_cast<IST> (val ()));
546 valuesPacked_wdv = values_wdv_type(valIn);
547 valuesUnpacked_wdv = valuesPacked_wdv;
548
549 // FIXME (22 Jun 2016) I would very much like to get rid of
550 // k_values1D_ at some point. I find it confusing to have all
551 // these extra references lying around.
552// this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
553
555 }
556
// Constructor: row Map, column Map, and a local (device) matrix.
//
// Initializes fillComplete_ to true and storageStatus_ to
// Details::STORAGE_1D_PACKED, creates a crs_graph_type from
// (rowMap, colMap, lclMatrix.graph, params), and requires that graph to
// be fill complete (std::logic_error otherwise).  The graph is stored in
// myGraph_ and staticGraph_, and lclMatrix.values is wrapped in
// valuesPacked_wdv / valuesUnpacked_wdv.
//
// Before returning, reports std::logic_error if isFillActive() is true
// or isFillComplete() is false.  A std::exception from the graph
// constructor is reported as std::runtime_error.
// NOTE(review): the embedded listing numbers skip 605, so a statement
// before the closing brace may be missing from this listing.
557 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
558 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
559 CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
560 const Teuchos::RCP<const map_type>& colMap,
561 const local_matrix_device_type& lclMatrix,
562 const Teuchos::RCP<Teuchos::ParameterList>& params) :
563 dist_object_type (rowMap),
564 storageStatus_ (Details::STORAGE_1D_PACKED),
565 fillComplete_ (true)
566 {
567 const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
568 "RCP<const Map>, local_matrix_device_type[, RCP<ParameterList>]): ";
// Appended to the two end-of-constructor consistency messages.
569 const char suffix[] =
570 " Please report this bug to the Tpetra developers.";
571
572 Teuchos::RCP<crs_graph_type> graph;
573 try {
574 graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
575 lclMatrix.graph, params));
576 }
577 catch (std::exception& e) {
578 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
579 (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
580 "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) threw an "
581 "exception: " << e.what ());
582 }
583 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
584 (!graph->isFillComplete (), std::logic_error, "CrsGraph constructor (RCP"
585 "<const Map>, RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) "
586 "did not produce a fill-complete graph. Please report this bug to the "
587 "Tpetra developers.");
588 // myGraph_ not null means that the matrix owns the graph. This
589 // is true because the column indices come in as nonconst through
590 // the matrix, implying shared ownership.
591 myGraph_ = graph;
592 staticGraph_ = graph;
593
594 valuesPacked_wdv = values_wdv_type(lclMatrix.values);
595 valuesUnpacked_wdv = valuesPacked_wdv;
596
597 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
598 (isFillActive (), std::logic_error,
599 "At the end of a CrsMatrix constructor that should produce "
600 "a fillComplete matrix, isFillActive() is true." << suffix);
601 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
602 (! isFillComplete (), std::logic_error, "At the end of a "
603 "CrsMatrix constructor that should produce a fillComplete "
604 "matrix, isFillComplete() is false." << suffix);
606 }
607
// Constructor: local (device) matrix plus row, column, domain and range
// Maps.
//
// Initializes fillComplete_ to true and storageStatus_ to
// Details::STORAGE_1D_PACKED, creates a crs_graph_type from
// (lclMatrix.graph, rowMap, colMap, domainMap, rangeMap, params), and
// requires that graph to be fill complete (std::logic_error otherwise).
// The graph is stored in myGraph_ and staticGraph_, and lclMatrix.values
// is wrapped in valuesPacked_wdv / valuesUnpacked_wdv.
//
// Before returning, reports std::logic_error if isFillActive() is true
// or isFillComplete() is false.  A std::exception from the graph
// constructor is reported as std::runtime_error.
// NOTE(review): tfecfFuncName and the graph-constructor messages list
// the four Maps first and the local matrix/graph last, whereas this
// signature takes the local matrix first.
// NOTE(review): the embedded listing numbers skip 659, so a statement
// before the closing brace may be missing from this listing.
608 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
609 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
610 CrsMatrix (const local_matrix_device_type& lclMatrix,
611 const Teuchos::RCP<const map_type>& rowMap,
612 const Teuchos::RCP<const map_type>& colMap,
613 const Teuchos::RCP<const map_type>& domainMap,
614 const Teuchos::RCP<const map_type>& rangeMap,
615 const Teuchos::RCP<Teuchos::ParameterList>& params) :
616 dist_object_type (rowMap),
617 storageStatus_ (Details::STORAGE_1D_PACKED),
618 fillComplete_ (true)
619 {
620 const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
621 "RCP<const Map>, RCP<const Map>, RCP<const Map>, "
622 "local_matrix_device_type[, RCP<ParameterList>]): ";
// Appended to the internal-consistency messages below.
623 const char suffix[] =
624 " Please report this bug to the Tpetra developers.";
625
626 Teuchos::RCP<crs_graph_type> graph;
627 try {
628 graph = Teuchos::rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
629 domainMap, rangeMap, params));
630 }
631 catch (std::exception& e) {
632 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
633 (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
634 "RCP<const Map>, RCP<const Map>, RCP<const Map>, local_graph_device_type[, "
635 "RCP<ParameterList>]) threw an exception: " << e.what ());
636 }
637 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
638 (! graph->isFillComplete (), std::logic_error, "CrsGraph "
639 "constructor (RCP<const Map>, RCP<const Map>, RCP<const Map>, "
640 "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) did "
641 "not produce a fillComplete graph." << suffix);
642 // myGraph_ not null means that the matrix owns the graph. This
643 // is true because the column indices come in as nonconst through
644 // the matrix, implying shared ownership.
645 myGraph_ = graph;
646 staticGraph_ = graph;
647
648 valuesPacked_wdv = values_wdv_type(lclMatrix.values);
649 valuesUnpacked_wdv = valuesPacked_wdv;
650
651 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
652 (isFillActive (), std::logic_error,
653 "At the end of a CrsMatrix constructor that should produce "
654 "a fillComplete matrix, isFillActive() is true." << suffix);
655 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
656 (! isFillComplete (), std::logic_error, "At the end of a "
657 "CrsMatrix constructor that should produce a fillComplete "
658 "matrix, isFillComplete() is false." << suffix);
660 }
661
// Constructor: local (device) matrix plus row, column, domain and range
// Maps and precomputed Import/Export objects.
//
// Initializes fillComplete_ to true and storageStatus_ to
// Details::STORAGE_1D_PACKED, creates a crs_graph_type from
// (lclMatrix.graph, rowMap, colMap, domainMap, rangeMap, importer,
// exporter, params), and requires that graph to be fill complete
// (std::logic_error otherwise).  The graph is stored in myGraph_ and
// staticGraph_, and lclMatrix.values is wrapped in valuesPacked_wdv /
// valuesUnpacked_wdv.
//
// Before returning, reports std::logic_error if isFillActive() is true
// or isFillComplete() is false.  A std::exception from the graph
// constructor is reported as std::runtime_error.
// NOTE(review): the embedded listing numbers skip 716, so a statement
// before the closing brace may be missing from this listing.
662 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
663 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
664 CrsMatrix (const local_matrix_device_type& lclMatrix,
665 const Teuchos::RCP<const map_type>& rowMap,
666 const Teuchos::RCP<const map_type>& colMap,
667 const Teuchos::RCP<const map_type>& domainMap,
668 const Teuchos::RCP<const map_type>& rangeMap,
669 const Teuchos::RCP<const import_type>& importer,
670 const Teuchos::RCP<const export_type>& exporter,
671 const Teuchos::RCP<Teuchos::ParameterList>& params) :
672 dist_object_type (rowMap),
673 storageStatus_ (Details::STORAGE_1D_PACKED),
674 fillComplete_ (true)
675 {
676 using Teuchos::rcp;
677 const char tfecfFuncName[] = "Tpetra::CrsMatrix"
678 "(lclMat,Map,Map,Map,Map,Import,Export,params): ";
// Appended to the two end-of-constructor consistency messages.
679 const char suffix[] =
680 " Please report this bug to the Tpetra developers.";
681
682 Teuchos::RCP<crs_graph_type> graph;
683 try {
684 graph = rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
685 domainMap, rangeMap, importer,
686 exporter, params));
687 }
688 catch (std::exception& e) {
689 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
690 (true, std::runtime_error, "CrsGraph constructor "
691 "(local_graph_device_type, Map, Map, Map, Map, Import, Export, "
692 "params) threw: " << e.what ());
693 }
694 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
695 (!graph->isFillComplete (), std::logic_error, "CrsGraph "
696 "constructor (local_graph_device_type, Map, Map, Map, Map, Import, "
697 "Export, params) did not produce a fill-complete graph. "
698 "Please report this bug to the Tpetra developers.");
699 // myGraph_ not null means that the matrix owns the graph. This
700 // is true because the column indices come in as nonconst through
701 // the matrix, implying shared ownership.
702 myGraph_ = graph;
703 staticGraph_ = graph;
704
705 valuesPacked_wdv = values_wdv_type(lclMatrix.values);
706 valuesUnpacked_wdv = valuesPacked_wdv;
707
708 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
709 (isFillActive (), std::logic_error,
710 "At the end of a CrsMatrix constructor that should produce "
711 "a fillComplete matrix, isFillActive() is true." << suffix);
712 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
713 (! isFillComplete (), std::logic_error, "At the end of a "
714 "CrsMatrix constructor that should produce a fillComplete "
715 "matrix, isFillComplete() is false." << suffix);
717 }
718
// Constructor: copy or view of another matrix.
//
// Shares the source's graph (staticGraph_ is initialized from
// source.getCrsGraph()) and requires source.isFillComplete()
// (std::invalid_argument otherwise).
//  - Teuchos::Copy: allocates a new values view of the same length as
//    the source's device values, deep-copies into it, and uses it for
//    both valuesPacked_wdv and valuesUnpacked_wdv.
//  - Teuchos::View: builds valuesPacked_wdv / valuesUnpacked_wdv from
//    the source's corresponding members (no deep_copy is issued here).
//  - Any other value: reports std::invalid_argument.
// Both valid modes finish by calling fillComplete with the source's
// domain and range Maps.
//
// NOTE(review): the check is on the source *matrix*, but the message
// says "Source graph must be fillComplete()".
// NOTE(review): the embedded listing numbers skip 725 and 758.  The
// initializer list ends with a trailing comma, so at least one
// initializer is missing from this listing.
719 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
720 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
721 CrsMatrix (const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& source,
722 const Teuchos::DataAccess copyOrView):
723 dist_object_type (source.getCrsGraph()->getRowMap ()),
724 staticGraph_ (source.getCrsGraph()),
726 {
727 const char tfecfFuncName[] = "Tpetra::CrsMatrix("
728 "const CrsMatrix&, const Teuchos::DataAccess): ";
729 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
730 (! source.isFillComplete (), std::invalid_argument,
731 "Source graph must be fillComplete().");
732
733 if (copyOrView == Teuchos::Copy) {
734 using values_type = typename local_matrix_device_type::values_type;
735 auto vals = source.getLocalValuesDevice (Access::ReadOnly);
736 using Kokkos::view_alloc;
737 using Kokkos::WithoutInitializing;
// Allocation skips initialization; the deep_copy below fills every entry.
738 values_type newvals (view_alloc ("val", WithoutInitializing),
739 vals.extent (0));
740 // DEEP_COPY REVIEW - DEVICE-TO_DEVICE
741 Kokkos::deep_copy (newvals, vals);
742 valuesPacked_wdv = values_wdv_type(newvals);
743 valuesUnpacked_wdv = valuesPacked_wdv;
744 fillComplete (source.getDomainMap (), source.getRangeMap ());
745 }
746 else if (copyOrView == Teuchos::View) {
747 valuesPacked_wdv = values_wdv_type(source.valuesPacked_wdv);
748 valuesUnpacked_wdv = values_wdv_type(source.valuesUnpacked_wdv);
749 fillComplete (source.getDomainMap (), source.getRangeMap ());
750 }
751 else {
752 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
753 (true, std::invalid_argument, "Second argument 'copyOrView' "
754 "has an invalid value " << copyOrView << ". Valid values "
755 "include Teuchos::Copy = " << Teuchos::Copy << " and "
756 "Teuchos::View = " << Teuchos::View << ".");
757 }
759 }
760
// Exchange state with another matrix.
//
// Swaps, member by member: importMV_, exportMV_, staticGraph_, myGraph_,
// valuesPacked_wdv, valuesUnpacked_wdv, storageStatus_, fillComplete_,
// and nonlocals_.  Only these nine members are exchanged here.
// NOTE(review): whether the class (or its base) has further state that
// should also be swapped cannot be determined from this listing.
// NOTE(review): the embedded listing numbers skip 763; the
// class-qualifier line of this definition is missing from this listing.
761 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
762 void
764 swap(CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> & crs_matrix)
765 {
766 std::swap(crs_matrix.importMV_, this->importMV_);
767 std::swap(crs_matrix.exportMV_, this->exportMV_);
768 std::swap(crs_matrix.staticGraph_, this->staticGraph_);
769 std::swap(crs_matrix.myGraph_, this->myGraph_);
770 std::swap(crs_matrix.valuesPacked_wdv, this->valuesPacked_wdv);
771 std::swap(crs_matrix.valuesUnpacked_wdv, this->valuesUnpacked_wdv);
772 std::swap(crs_matrix.storageStatus_, this->storageStatus_);
773 std::swap(crs_matrix.fillComplete_, this->fillComplete_);
774 std::swap(crs_matrix.nonlocals_, this->nonlocals_);
775 }
776
// Return the communicator reported by the graph obtained from
// getCrsGraphRef().
// NOTE(review): listing line 779 (the class-qualifier line) is missing
// from this listing.
777 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
778 Teuchos::RCP<const Teuchos::Comm<int> >
780 getComm () const {
781 return getCrsGraphRef ().getComm ();
782 }
783
784 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
785 bool
790
791 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
792 bool
797
// Return the graph's isStorageOptimized() result; the matrix keeps no
// separate flag for this in the visible code.
// NOTE(review): listing line 800 (the class-qualifier line) is missing
// from this listing.
798 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
799 bool
801 isStorageOptimized () const {
802 return this->getCrsGraphRef ().isStorageOptimized ();
803 }
804
// Return the graph's isLocallyIndexed() result.
// NOTE(review): listing line 807 (the class-qualifier line) is missing
// from this listing.
805 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
806 bool
808 isLocallyIndexed () const {
809 return getCrsGraphRef ().isLocallyIndexed ();
810 }
811
// Return the graph's isGloballyIndexed() result.
// NOTE(review): listing line 814 (the class-qualifier line) is missing
// from this listing.
812 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
813 bool
815 isGloballyIndexed () const {
816 return getCrsGraphRef ().isGloballyIndexed ();
817 }
818
// Return the graph's hasColMap() result.
// NOTE(review): listing line 821 (the class-qualifier line) is missing
// from this listing.
819 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
820 bool
822 hasColMap () const {
823 return getCrsGraphRef ().hasColMap ();
824 }
825
// Return the graph's getGlobalNumEntries() result.
// NOTE(review): listing lines 827-828 (return type and class qualifier)
// are missing from this listing, so the return type is not visible here.
826 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
829 getGlobalNumEntries () const {
830 return getCrsGraphRef ().getGlobalNumEntries ();
831 }
832
// Return the graph's getLocalNumEntries() result.
// NOTE(review): listing line 835 (the class-qualifier line) is missing
// from this listing.
833 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
834 size_t
836 getLocalNumEntries () const {
837 return getCrsGraphRef ().getLocalNumEntries ();
838 }
839
// Return the graph's getGlobalNumRows() result.
// NOTE(review): listing lines 841-842 (return type and class qualifier)
// are missing from this listing, so the return type is not visible here.
840 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
843 getGlobalNumRows () const {
844 return getCrsGraphRef ().getGlobalNumRows ();
845 }
846
// Return the graph's getGlobalNumCols() result.
// NOTE(review): listing lines 848-849 (return type and class qualifier)
// are missing from this listing, so the return type is not visible here.
847 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
850 getGlobalNumCols () const {
851 return getCrsGraphRef ().getGlobalNumCols ();
852 }
853
// Returns the graph's local row count by forwarding to
// getCrsGraphRef().getLocalNumRows().
854 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
855 size_t
857 getLocalNumRows () const {
858 return getCrsGraphRef ().getLocalNumRows ();
859 }
860
861
// Returns the graph's local column count by forwarding to
// getCrsGraphRef().getLocalNumCols().
862 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
863 size_t
865 getLocalNumCols () const {
866 return getCrsGraphRef ().getLocalNumCols ();
867 }
868
869
// Returns the graph's entry count for the row with global index
// `globalRow`, forwarding the argument unchanged to
// getCrsGraphRef().getNumEntriesInGlobalRow(). No validation of
// `globalRow` is done at this level.
870 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
871 size_t
873 getNumEntriesInGlobalRow (GlobalOrdinal globalRow) const {
874 return getCrsGraphRef ().getNumEntriesInGlobalRow (globalRow);
875 }
876
// Returns the graph's entry count for the row with local index
// `localRow`, forwarding the argument unchanged to
// getCrsGraphRef().getNumEntriesInLocalRow(). No validation of
// `localRow` is done at this level.
877 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
878 size_t
880 getNumEntriesInLocalRow (LocalOrdinal localRow) const {
881 return getCrsGraphRef ().getNumEntriesInLocalRow (localRow);
882 }
883
// Forwards to getCrsGraphRef().getGlobalMaxNumRowEntries().
// NOTE(review): the qualifier/name line of this definition is absent from
// this listing; presumably this is
// CrsMatrix::getGlobalMaxNumRowEntries() const -- confirm.
884 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
885 size_t
888 return getCrsGraphRef ().getGlobalMaxNumRowEntries ();
889 }
890
// Forwards to getCrsGraphRef().getLocalMaxNumRowEntries().
// NOTE(review): the qualifier/name line of this definition is absent from
// this listing; presumably this is
// CrsMatrix::getLocalMaxNumRowEntries() const -- confirm.
891 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
892 size_t
895 return getCrsGraphRef ().getLocalMaxNumRowEntries ();
896 }
897
// Returns the index base of the row Map (getRowMap()->getIndexBase()).
// The RCP returned by getRowMap() is dereferenced without a null check.
898 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
899 GlobalOrdinal
901 getIndexBase () const {
902 return getRowMap ()->getIndexBase ();
903 }
904
// Returns the row Map held by the graph (getCrsGraphRef().getRowMap()).
905 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
906 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
908 getRowMap () const {
909 return getCrsGraphRef ().getRowMap ();
910 }
911
// Returns the column Map held by the graph (getCrsGraphRef().getColMap()).
912 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
913 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
915 getColMap () const {
916 return getCrsGraphRef ().getColMap ();
917 }
918
// Returns the domain Map held by the graph
// (getCrsGraphRef().getDomainMap()).
919 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
920 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
922 getDomainMap () const {
923 return getCrsGraphRef ().getDomainMap ();
924 }
925
// Returns the range Map held by the graph
// (getCrsGraphRef().getRangeMap()).
926 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
927 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
929 getRangeMap () const {
930 return getCrsGraphRef ().getRangeMap ();
931 }
932
// Returns the matrix's graph as an RCP to the RowGraph interface.
// staticGraph_ is preferred when it is non-null; otherwise myGraph_ is
// returned as-is, so the result is null if both members are null (no
// check is made here).
933 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
934 Teuchos::RCP<const RowGraph<LocalOrdinal, GlobalOrdinal, Node> >
936 getGraph () const {
937 if (staticGraph_ != Teuchos::null) {
938 return staticGraph_;
939 }
940 return myGraph_;
941 }
942
// Same selection logic as getGraph(), but typed as RCP<const CrsGraph>:
// staticGraph_ when non-null, otherwise myGraph_ (which may itself be
// null; no check is made here).
943 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
944 Teuchos::RCP<const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >
946 getCrsGraph () const {
947 if (staticGraph_ != Teuchos::null) {
948 return staticGraph_;
949 }
950 return myGraph_;
951 }
952
// Returns the graph by reference (dereferencing an RCP) rather than as an
// RCP: *staticGraph_ when it is non-null, otherwise *myGraph_.
//
// Only when HAVE_TPETRA_DEBUG is defined is myGraph_ tested for null
// before being dereferenced; in that case the test macro is given
// std::logic_error. In other builds a null myGraph_ is dereferenced
// unchecked.
// NOTE(review): the return-type and class-qualifier lines are absent from
// this listing; confirm them against the class declaration.
953 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
956 getCrsGraphRef () const
957 {
// Compile-time switch: `debug` is a constant, so the check below can be
// discarded entirely in non-debug builds.
958#ifdef HAVE_TPETRA_DEBUG
959 constexpr bool debug = true;
960#else
961 constexpr bool debug = false;
962#endif // HAVE_TPETRA_DEBUG
963
964 if (! this->staticGraph_.is_null ()) {
965 return * (this->staticGraph_);
966 }
967 else {
968 if (debug) {
// tfecfFuncName is not referenced by name below; presumably the
// TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro picks it up -- confirm.
969 const char tfecfFuncName[] = "getCrsGraphRef: ";
970 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
971 (this->myGraph_.is_null (), std::logic_error,
972 "Both staticGraph_ and myGraph_ are null. "
973 "Please report this bug to the Tpetra developers.");
974 }
975 return * (this->myGraph_);
976 }
977 }
978
// Builds and returns a local_matrix_device_type labeled
// "Tpetra::CrsMatrix::lclMatrixDevice" from three pieces:
//   - the number of local elements of staticGraph_'s column Map,
//   - the device view of valuesPacked_wdv, requested with
//     Access::ReadWrite,
//   - staticGraph_->getLocalGraphDevice().
// staticGraph_ and its column Map are dereferenced without null checks.
// NOTE(review): the return-type/qualifier/name lines are absent from this
// listing; presumably this is CrsMatrix::getLocalMatrixDevice() const
// (that name is called elsewhere in this file) -- confirm.
979 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
983 {
984 auto numCols = staticGraph_->getColMap()->getLocalNumElements();
985 return local_matrix_device_type("Tpetra::CrsMatrix::lclMatrixDevice",
986 numCols,
987 valuesPacked_wdv.getDeviceView(Access::ReadWrite),
988 staticGraph_->getLocalGraphDevice());
989 }
990
// Host counterpart of the device accessor: builds and returns a
// local_matrix_host_type labeled "Tpetra::CrsMatrix::lclMatrixHost" from
// the number of local elements of staticGraph_'s column Map, the host
// view of valuesPacked_wdv (requested with Access::ReadWrite even though
// this method is const), and staticGraph_->getLocalGraphHost().
// staticGraph_ and its column Map are dereferenced without null checks.
991 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
992 typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type
994 getLocalMatrixHost () const
995 {
996 auto numCols = staticGraph_->getColMap()->getLocalNumElements();
997 return local_matrix_host_type("Tpetra::CrsMatrix::lclMatrixHost", numCols,
998 valuesPacked_wdv.getHostView(Access::ReadWrite),
999 staticGraph_->getLocalGraphHost());
1000 }
1001
// Compiled only when KOKKOSKERNELS_VERSION < 40299.
//
// Returns a shared_ptr to a local_multiply_op_type wrapping a copy of the
// local device matrix. When one of the cuSPARSE / rocSPARSE / MKL TPL
// macros is defined and the local entry count does not exceed the maximum
// LocalOrdinal value, the operator is additionally given `ordinalRowptrs`,
// a LocalOrdinal-typed copy of the row offsets. That copy is created on
// the first call (detected via a null data pointer) and reused afterwards.
//
// NOTE(review): the qualifier/name lines are absent from this listing;
// presumably this is CrsMatrix::getLocalMultiplyOperator() const (see the
// kernel label below) -- confirm. The line closing the
// `if(getLocalNumEntries() <= ...)` block also appears to be missing here.
1002#if KOKKOSKERNELS_VERSION < 40299
1003// KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
1004 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1005 std::shared_ptr<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_multiply_op_type>
1008 {
1009 auto localMatrix = getLocalMatrixDevice();
1010#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) || defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) || defined(KOKKOSKERNELS_ENABLE_TPL_MKL)
// Only take the TPL path if every row offset is representable as a
// LocalOrdinal (offsets never exceed the local entry count).
1011 if(this->getLocalNumEntries() <= size_t(Teuchos::OrdinalTraits<LocalOrdinal>::max()))
1012 {
// Null data pointer means the converted offsets have not been built yet.
1013 if(this->ordinalRowptrs.data() == nullptr)
1014 {
1015 auto originalRowptrs = localMatrix.graph.row_map;
1016 //create LocalOrdinal-typed copy of the local graph's rowptrs.
1017 //This enables the LocalCrsMatrixOperator to use cuSPARSE SpMV.
1018 this->ordinalRowptrs = ordinal_rowptrs_type(
1019 Kokkos::ViewAllocateWithoutInitializing("CrsMatrix::ordinalRowptrs"), originalRowptrs.extent(0));
1020 auto ordinalRowptrs_ = this->ordinalRowptrs; //don't want to capture 'this'
// Element-wise copy with an implicit conversion to LocalOrdinal; every
// entry of the uninitialized allocation above is written here.
1021 Kokkos::parallel_for("CrsMatrix::getLocalMultiplyOperator::convertRowptrs",
1022 Kokkos::RangePolicy<execution_space>(0, originalRowptrs.extent(0)),
1023 KOKKOS_LAMBDA(LocalOrdinal i)
1024 {
1025 ordinalRowptrs_(i) = originalRowptrs(i);
1026 });
1027 }
1028 //return local operator using ordinalRowptrs
1029 return std::make_shared<local_multiply_op_type>(
1030 std::make_shared<local_matrix_device_type>(localMatrix), this->ordinalRowptrs);
1032#endif
// Fallback: no TPL enabled, or too many entries for LocalOrdinal offsets.
1033// KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
1034 return std::make_shared<local_multiply_op_type>(
1035 std::make_shared<local_matrix_device_type>(localMatrix));
1036 }
1037#endif
1038
// Returns true exactly when myGraph_ is null. Other code in this file
// treats a non-null myGraph_ as "the matrix owns its graph", so a null
// myGraph_ is what identifies a static (const) graph.
1039 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1040 bool
1042 isStaticGraph () const {
1043 return myGraph_.is_null ();
1044 }
1045
1046 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1047 bool
1052
1053 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1054 bool
1059
// Allocates the unpacked values array (valuesUnpacked_wdv) with one slot
// per entry of the static graph's local allocation, first allocating the
// graph's indices if the caller says they are not yet allocated.
//
// Parameters:
//   lg      - LocalIndices or GlobalIndices; passed through to
//             myGraph_->allocateIndices() and echoed in verbose output.
//   gas     - the caller's assertion about whether the graph's indices
//             are already allocated. With GraphNotYetAllocated this method
//             calls myGraph_->allocateIndices(lg, verbose).
//   verbose - when true, prints progress lines to std::cerr.
//
// Errors: exceptions from allocateIndices() are caught and re-reported as
// std::runtime_error. When Behavior::debug("CrsMatrix") is true, several
// consistency checks are made and reported as std::logic_error.
1060 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1061 void
1063 allocateValues (ELocalGlobal lg, GraphAllocationStatus gas,
1064 const bool verbose)
1065 {
1066 using Details::Behavior;
1068 using std::endl;
1069 const char tfecfFuncName[] = "allocateValues: ";
1070 const char suffix[] =
1071 " Please report this bug to the Tpetra developers.";
1072 ProfilingRegion region("Tpetra::CrsMatrix::allocateValues");
1073
1074 std::unique_ptr<std::string> prefix;
1075 if (verbose) {
1076 prefix = this->createPrefix("CrsMatrix", "allocateValues");
1077 std::ostringstream os;
1078 os << *prefix << "lg: "
1079 << (lg == LocalIndices ? "Local" : "Global") << "Indices"
1080 << ", gas: Graph"
1081 << (gas == GraphAlreadyAllocated ? "Already" : "NotYet")
1082 << "Allocated" << endl;
1083 std::cerr << os.str();
1084 }
1085
1086 const bool debug = Behavior::debug("CrsMatrix");
1087 if (debug) {
1088 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1089 (this->staticGraph_.is_null (), std::logic_error,
1090 "staticGraph_ is null." << suffix);
1091
1092 // If the graph indices are already allocated, then gas should be
1093 // GraphAlreadyAllocated. Otherwise, gas should be
1094 // GraphNotYetAllocated.
1095 if ((gas == GraphAlreadyAllocated) !=
1096 staticGraph_->indicesAreAllocated ()) {
1097 const char err1[] = "The caller has asserted that the graph "
1098 "is ";
1099 const char err2[] = "already allocated, but the static graph "
1100 "says that its indices are ";
1101 const char err3[] = "already allocated. ";
1102 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1103 (gas == GraphAlreadyAllocated &&
1104 ! staticGraph_->indicesAreAllocated (), std::logic_error,
1105 err1 << err2 << "not " << err3 << suffix);
1106 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1107 (gas != GraphAlreadyAllocated &&
1108 staticGraph_->indicesAreAllocated (), std::logic_error,
1109 err1 << "not " << err2 << err3 << suffix);
1110 }
1111
1112 // If the graph is unallocated, then it had better be a
1113 // matrix-owned graph. ("Matrix-owned graph" means that the
1114 // matrix gets to define the graph structure. If the CrsMatrix
1115 // constructor that takes an RCP<const CrsGraph> was used, then
1116 // the matrix does _not_ own the graph.)
1117 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1118 (! this->staticGraph_->indicesAreAllocated () &&
1119 this->myGraph_.is_null (), std::logic_error,
1120 "The static graph says that its indices are not allocated, "
1121 "but the graph is not owned by the matrix." << suffix);
1122 }
1123
1124 if (gas == GraphNotYetAllocated) {
1125 if (debug) {
1126 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1127 (this->myGraph_.is_null (), std::logic_error,
1128 "gas = GraphNotYetAllocated, but myGraph_ is null." << suffix);
1129 }
1130 try {
1131 this->myGraph_->allocateIndices (lg, verbose);
1132 }
// Catch by const reference: the handler only reads e.what().
1133 catch (const std::exception& e) {
1134 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1135 (true, std::runtime_error, "CrsGraph::allocateIndices "
1136 "threw an exception: " << e.what ());
1137 }
1138 catch (...) {
1139 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1140 (true, std::runtime_error, "CrsGraph::allocateIndices "
1141 "threw an exception not a subclass of std::exception.");
1142 }
1143 }
1144
1145 // Allocate matrix values.
1146 const size_t lclTotalNumEntries = this->staticGraph_->getLocalAllocationSize();
1147 if (debug) {
// The last unpacked row offset must equal the allocation size.
1148 const size_t lclNumRows = this->staticGraph_->getLocalNumRows ();
1149 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1150 (this->staticGraph_->getRowPtrsUnpackedHost()(lclNumRows) != lclTotalNumEntries, std::logic_error,
1151 "length of staticGraph's lclIndsUnpacked does not match final entry of rowPtrsUnpacked_host." << suffix);
1152 }
1153
1154 // Allocate the array of unpacked matrix values (valuesUnpacked_wdv).
1155 using values_type = typename local_matrix_device_type::values_type;
1156 if (verbose) {
1157 std::ostringstream os;
1158 os << *prefix << "Allocate values_wdv: Pre "
1159 << valuesUnpacked_wdv.extent(0) << ", post "
1160 << lclTotalNumEntries << endl;
1161 std::cerr << os.str();
1162 }
1164 valuesUnpacked_wdv = values_wdv_type(
1165 values_type("Tpetra::CrsMatrix::values",
1166 lclTotalNumEntries));
1167 }
1168
1169
// Finalizes the local (per-process) compressed-sparse-row storage of both
// the matrix-owned graph (myGraph_) and the matrix values.
//
// Steps, as implemented below:
//   1. If the graph's local entry count differs from its allocation size,
//      compute packed row offsets from the per-row entry counts, allocate
//      packed index and value arrays, and copy the valid entries of each
//      row into them. Otherwise the packed members are simply set to the
//      unpacked ones.
//   2. Decide whether to optimize storage. `params` may carry a
//      "Optimize Storage" entry; the default is
//      `! isStaticGraph () || staticGraph_->isStorageOptimized ()`, which
//      is also used when `params` is null.
//   3. If optimizing, release k_numRowEntries_, make the unpacked members
//      refer to the packed data, and mark both graph and matrix as
//      Details::STORAGE_1D_PACKED.
//
// Precondition: myGraph_ is non-null (only checked when
// Details::Behavior::debug("CrsMatrix") is true). All consistency checks
// in debug mode are reported as std::logic_error.
1170 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1171 void
1173 fillLocalGraphAndMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1174 {
1176 using ::Tpetra::Details::getEntryOnHost;
1177 using Teuchos::arcp_const_cast;
1178 using Teuchos::Array;
1179 using Teuchos::ArrayRCP;
1180 using Teuchos::null;
1181 using Teuchos::RCP;
1182 using Teuchos::rcp;
1183 using std::endl;
1184 using row_map_type = typename local_graph_device_type::row_map_type;
1185 using lclinds_1d_type = typename Graph::local_graph_device_type::entries_type::non_const_type;
1186 using values_type = typename local_matrix_device_type::values_type;
1187 Details::ProfilingRegion regionFLGAM
1188 ("Tpetra::CrsMatrix::fillLocalGraphAndMatrix");
1189
1190 const char tfecfFuncName[] = "fillLocalGraphAndMatrix (called from "
1191 "fillComplete or expertStaticFillComplete): ";
1192 const char suffix[] =
1193 " Please report this bug to the Tpetra developers.";
1194 const bool debug = Details::Behavior::debug("CrsMatrix");
1195 const bool verbose = Details::Behavior::verbose("CrsMatrix");
1196
1197 std::unique_ptr<std::string> prefix;
1198 if (verbose) {
1199 prefix = this->createPrefix("CrsMatrix", "fillLocalGraphAndMatrix");
1200 std::ostringstream os;
1201 os << *prefix << endl;
1202 std::cerr << os.str ();
1203 }
1204
1205 if (debug) {
1206 // fillComplete() only calls fillLocalGraphAndMatrix() if the
1207 // matrix owns the graph, which means myGraph_ is not null.
1208 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1209 (myGraph_.is_null (), std::logic_error, "The nonconst graph "
1210 "(myGraph_) is null. This means that the matrix has a "
1211 "const (a.k.a. \"static\") graph. fillComplete or "
1212 "expertStaticFillComplete should never call "
1213 "fillLocalGraphAndMatrix in that case." << suffix);
1214 }
1215
1216 const size_t lclNumRows = this->getLocalNumRows ();
1217
1218 // This method's goal is to fill in the three arrays (compressed
1219 // sparse row format) that define the sparse graph's and matrix's
1220 // structure, and the sparse matrix's values.
1221 //
1222 // Get references to the data in myGraph_, so we can modify them
1223 // as well. Note that we only call fillLocalGraphAndMatrix() if
1224 // the matrix owns the graph, which means myGraph_ is not null.
1225
1226 // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1227 // See: https://github.com/trilinos/Trilinos/issues/12237
1228 //using row_entries_type = decltype (myGraph_->k_numRowEntries_);
1229 using row_entries_type = typename crs_graph_type::num_row_entries_type;
1230
1231 typename Graph::local_graph_device_type::row_map_type curRowOffsets =
1232 myGraph_->rowPtrsUnpacked_dev_;
1233
1234 if (debug) {
1235 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1236 (curRowOffsets.extent (0) == 0, std::logic_error,
1237 "curRowOffsets.extent(0) == 0.");
1238 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1239 (curRowOffsets.extent (0) != lclNumRows + 1, std::logic_error,
1240 "curRowOffsets.extent(0) = "
1241 << curRowOffsets.extent (0) << " != lclNumRows + 1 = "
1242 << (lclNumRows + 1) << ".");
// The first check above guarantees numOffsets >= 1, so indexing entry
// numOffsets - 1 is safe here.
1243 const size_t numOffsets = curRowOffsets.extent (0);
1244 const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1245 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1246 (numOffsets != 0 &&
1247 myGraph_->lclIndsUnpacked_wdv.extent (0) != valToCheck,
1248 std::logic_error, "numOffsets = " <<
1249 numOffsets << " != 0 and myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1250 << myGraph_->lclIndsUnpacked_wdv.extent (0) << " != curRowOffsets("
1251 << (numOffsets - 1) << ") = " << valToCheck << ".");
1252 }
1253
1254 if (myGraph_->getLocalNumEntries() !=
1255 myGraph_->getLocalAllocationSize()) {
1256
1257 // Use the nonconst version of row_map_type for k_ptrs,
1258 // because row_map_type is const and we need to modify k_ptrs here.
1259 typename row_map_type::non_const_type k_ptrs;
1260 row_map_type k_ptrs_const;
1261 lclinds_1d_type k_inds;
1262 values_type k_vals;
1263
1264 if (verbose) {
1265 std::ostringstream os;
1266 const auto numEnt = myGraph_->getLocalNumEntries();
1267 const auto allocSize = myGraph_->getLocalAllocationSize();
1268 os << *prefix << "Unpacked 1-D storage: numEnt=" << numEnt
1269 << ", allocSize=" << allocSize << endl;
1270 std::cerr << os.str ();
1271 }
1272 // The matrix's current 1-D storage is "unpacked." This means
1273 // the row offsets may differ from what the final row offsets
1274 // should be. This could happen, for example, if the user
1275 // set an upper
1276 // bound on the number of entries per row, but didn't fill all
1277 // those entries.
1278 if (debug && curRowOffsets.extent (0) != 0) {
1279 const size_t numOffsets =
1280 static_cast<size_t> (curRowOffsets.extent (0));
1281 const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1282 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1283 (static_cast<size_t> (valToCheck) !=
1284 static_cast<size_t> (valuesUnpacked_wdv.extent (0)),
1285 std::logic_error, "(unpacked branch) Before "
1286 "allocating or packing, curRowOffsets(" << (numOffsets-1)
1287 << ") = " << valToCheck << " != valuesUnpacked_wdv.extent(0)"
1288 " = " << valuesUnpacked_wdv.extent (0) << ".");
1289 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1290 (static_cast<size_t> (valToCheck) !=
1291 static_cast<size_t> (myGraph_->lclIndsUnpacked_wdv.extent (0)),
1292 std::logic_error, "(unpacked branch) Before "
1293 "allocating or packing, curRowOffsets(" << (numOffsets-1)
1294 << ") = " << valToCheck
1295 << " != myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1296 << myGraph_->lclIndsUnpacked_wdv.extent (0) << ".");
1297 }
1298 // Pack the row offsets into k_ptrs, by doing a sum-scan of
1299 // the array of valid entry counts per row.
1300
1301 // Total number of entries in the matrix on the calling
1302 // process. computeOffsetsFromCounts returns it below. It's
1303 // cheap to compute and useful as a sanity check.
1304 size_t lclTotalNumEntries = 0;
1305 {
1306 // Allocate the packed row offsets array in a nonconst
1307 // temporary (packedRowOffsets), fill it, and then assign it
1308 // to both k_ptrs (nonconst) and k_ptrs_const (const) below.
1309 if (verbose) {
1310 std::ostringstream os;
1311 os << *prefix << "Allocate packed row offsets: "
1312 << (lclNumRows+1) << endl;
1313 std::cerr << os.str ();
1314 }
1315 typename row_map_type::non_const_type
1316 packedRowOffsets ("Tpetra::CrsGraph::ptr", lclNumRows + 1);
1317 typename row_entries_type::const_type numRowEnt_h =
1318 myGraph_->k_numRowEntries_;
1319 // We're computing offsets on device. This function can
1320 // handle numRowEnt_h being a host View.
1321 lclTotalNumEntries =
1322 computeOffsetsFromCounts (packedRowOffsets, numRowEnt_h);
1323 // k_ptrs is handed to the pack functors below; k_ptrs_const
1324 // is what the graph receives via setRowPtrsPacked().
1325 k_ptrs = packedRowOffsets;
1326 k_ptrs_const = k_ptrs;
1327 }
1328
1329 if (debug) {
1330 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1331 (static_cast<size_t> (k_ptrs.extent (0)) != lclNumRows + 1,
1332 std::logic_error,
1333 "(unpacked branch) After packing k_ptrs, "
1334 "k_ptrs.extent(0) = " << k_ptrs.extent (0) << " != "
1335 "lclNumRows+1 = " << (lclNumRows+1) << ".");
1336 const auto valToCheck = getEntryOnHost (k_ptrs, lclNumRows);
1337 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1338 (valToCheck != lclTotalNumEntries, std::logic_error,
1339 "(unpacked branch) After filling k_ptrs, "
1340 "k_ptrs(lclNumRows=" << lclNumRows << ") = " << valToCheck
1341 << " != total number of entries on the calling process = "
1342 << lclTotalNumEntries << ".");
1343 }
1344
1345 // Allocate the arrays of packed column indices and values.
1346 if (verbose) {
1347 std::ostringstream os;
1348 os << *prefix << "Allocate packed local column indices: "
1349 << lclTotalNumEntries << endl;
1350 std::cerr << os.str ();
1351 }
1352 k_inds = lclinds_1d_type ("Tpetra::CrsGraph::lclInds", lclTotalNumEntries);
1353 if (verbose) {
1354 std::ostringstream os;
1355 os << *prefix << "Allocate packed values: "
1356 << lclTotalNumEntries << endl;
1357 std::cerr << os.str ();
1358 }
1359 k_vals = values_type ("Tpetra::CrsMatrix::values", lclTotalNumEntries);
1360
1361 // curRowOffsets (myGraph_->rowPtrsUnpacked_) (???), lclIndsUnpacked_wdv,
1362 // and valuesUnpacked_wdv are currently unpacked. Pack them, using
1363 // the packed row offsets array k_ptrs that we created above.
1364 //
1365 // FIXME (mfh 06 Aug 2014) If "Optimize Storage" is false, we
1366 // need to keep around the unpacked row offsets, column
1367 // indices, and values arrays.
1368
1369 // Pack the column indices from unpacked lclIndsUnpacked_wdv into
1370 // packed k_inds. We will replace lclIndsUnpacked_wdv below.
1371 using inds_packer_type = pack_functor<
1372 typename Graph::local_graph_device_type::entries_type::non_const_type,
1373 typename Graph::local_inds_dualv_type::t_dev::const_type,
1374 typename Graph::local_graph_device_type::row_map_type::non_const_type,
1375 typename Graph::local_graph_device_type::row_map_type>;
1376 inds_packer_type indsPacker (
1377 k_inds,
1378 myGraph_->lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly),
1379 k_ptrs, curRowOffsets);
1380 using exec_space = typename decltype (k_inds)::execution_space;
1381 using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1382 Kokkos::parallel_for
1383 ("Tpetra::CrsMatrix pack column indices",
1384 range_type (0, lclNumRows), indsPacker);
1385
1386 // Pack the values from unpacked valuesUnpacked_wdv into packed
1387 // k_vals. We will replace valuesPacked_wdv below.
1388 using vals_packer_type = pack_functor<
1389 typename values_type::non_const_type,
1390 typename values_type::const_type,
1391 typename row_map_type::non_const_type,
1392 typename row_map_type::const_type>;
1393 vals_packer_type valsPacker (
1394 k_vals,
1395 this->valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1396 k_ptrs, curRowOffsets);
1397 Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1398 range_type (0, lclNumRows), valsPacker);
1399
1400 if (debug) {
1401 const char myPrefix[] = "(\"Optimize Storage\""
1402 "=true branch) After packing, ";
1403 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1404 (k_ptrs.extent (0) == 0, std::logic_error, myPrefix
1405 << "k_ptrs.extent(0) = 0. This probably means that "
1406 "rowPtrsUnpacked_ was never allocated.");
1407 if (k_ptrs.extent (0) != 0) {
1408 const size_t numOffsets (k_ptrs.extent (0));
1409 const auto valToCheck =
1410 getEntryOnHost (k_ptrs, numOffsets - 1);
1411 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1412 (size_t (valToCheck) != k_vals.extent (0),
1413 std::logic_error, myPrefix <<
1414 "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1415 " != k_vals.extent(0) = " << k_vals.extent (0) << ".");
1416 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1417 (size_t (valToCheck) != k_inds.extent (0),
1418 std::logic_error, myPrefix <<
1419 "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1420 " != k_inds.extent(0) = " << k_inds.extent (0) << ".");
1421 }
1422 }
1423 // Build the local graph.
1424 myGraph_->setRowPtrsPacked(k_ptrs_const);
1425 myGraph_->lclIndsPacked_wdv =
1426 typename crs_graph_type::local_inds_wdv_type(k_inds);
1427 valuesPacked_wdv = values_wdv_type(k_vals);
1428 }
1429 else { // We don't have to pack, so just set the pointers.
1430 // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1431 // FIXME? This is already done in the graph fill call - need to avoid the memcpy to host
1432 myGraph_->rowPtrsPacked_dev_ = myGraph_->rowPtrsUnpacked_dev_;
1433 myGraph_->rowPtrsPacked_host_ = myGraph_->rowPtrsUnpacked_host_;
1434 myGraph_->packedUnpackedRowPtrsMatch_ = true;
1435 myGraph_->lclIndsPacked_wdv = myGraph_->lclIndsUnpacked_wdv;
1436 valuesPacked_wdv = valuesUnpacked_wdv;
1437
1438 if (verbose) {
1439 std::ostringstream os;
1440 os << *prefix << "Storage already packed: rowPtrsUnpacked_: "
1441 << myGraph_->getRowPtrsUnpackedHost().extent(0) << ", lclIndsUnpacked_wdv: "
1442 << myGraph_->lclIndsUnpacked_wdv.extent(0) << ", valuesUnpacked_wdv: "
1443 << valuesUnpacked_wdv.extent(0) << endl;
1444 std::cerr << os.str();
1445 }
1446
1447 if (debug) {
1448 const char myPrefix[] =
1449 "(\"Optimize Storage\"=false branch) ";
1450 auto rowPtrsUnpackedHost = myGraph_->getRowPtrsUnpackedHost();
1451 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1452 (myGraph_->rowPtrsUnpacked_dev_.extent (0) == 0, std::logic_error, myPrefix
1453 << "myGraph->rowPtrsUnpacked_dev_.extent(0) = 0. This probably means "
1454 "that rowPtrsUnpacked_ was never allocated.");
1455 if (myGraph_->rowPtrsUnpacked_dev_.extent (0) != 0) {
1456 const size_t numOffsets = rowPtrsUnpackedHost.extent (0);
1457 const auto valToCheck = rowPtrsUnpackedHost(numOffsets - 1);
1458 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1459 (size_t (valToCheck) != valuesPacked_wdv.extent (0),
1460 std::logic_error, myPrefix <<
1461 "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1462 << " != valuesPacked_wdv.extent(0) = "
1463 << valuesPacked_wdv.extent (0) << ".");
1464 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1465 (size_t (valToCheck) != myGraph_->lclIndsPacked_wdv.extent (0),
1466 std::logic_error, myPrefix <<
1467 "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1468 << " != myGraph_->lclIndsPacked.extent(0) = "
1469 << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1470 }
1471 }
1472 }
1473
1474 if (debug) {
1475 const char myPrefix[] = "After packing, ";
1476 auto rowPtrsPackedHost = myGraph_->getRowPtrsPackedHost();
1477 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1478 (size_t (rowPtrsPackedHost.extent (0)) != size_t (lclNumRows + 1),
1479 std::logic_error, myPrefix << "myGraph_->rowPtrsPacked_host_.extent(0) = "
1480 << rowPtrsPackedHost.extent (0) << " != lclNumRows+1 = " <<
1481 (lclNumRows+1) << ".");
1482 if (rowPtrsPackedHost.extent (0) != 0) {
1483 const size_t numOffsets (rowPtrsPackedHost.extent (0));
1484 const size_t valToCheck = rowPtrsPackedHost(numOffsets-1);
1485 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1486 (valToCheck != size_t (valuesPacked_wdv.extent (0)),
1487 std::logic_error, myPrefix << "k_ptrs_const(" <<
1488 (numOffsets-1) << ") = " << valToCheck
1489 << " != valuesPacked_wdv.extent(0) = "
1490 << valuesPacked_wdv.extent (0) << ".");
1491 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1492 (valToCheck != size_t (myGraph_->lclIndsPacked_wdv.extent (0)),
1493 std::logic_error, myPrefix << "k_ptrs_const(" <<
1494 (numOffsets-1) << ") = " << valToCheck
1495 << " != myGraph_->lclIndsPacked_wdv.extent(0) = "
1496 << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1497 }
1498 }
1499
1500 // May we ditch the old allocations for the packed (and otherwise
1501 // "optimized") allocations, later in this routine? Optimize
1502 // storage if the graph is not static, or if the graph already has
1503 // optimized storage.
1504 const bool defaultOptStorage =
1505 ! isStaticGraph () || staticGraph_->isStorageOptimized ();
1506 const bool requestOptimizedStorage =
1507 (! params.is_null () &&
1508 params->get ("Optimize Storage", defaultOptStorage)) ||
1509 (params.is_null () && defaultOptStorage);
1510
1511 // The graph has optimized storage when indices are allocated,
1512 // myGraph_->k_numRowEntries_ is empty, and there are more than
1513 // zero rows on this process.
1514 if (requestOptimizedStorage) {
1515 // Free the old, unpacked, unoptimized allocations.
1516 // Free graph data structures that are only needed for
1517 // unpacked 1-D storage.
1518 if (verbose) {
1519 std::ostringstream os;
1520 os << *prefix << "Optimizing storage: free k_numRowEntries_: "
1521 << myGraph_->k_numRowEntries_.extent(0) << endl;
1522 std::cerr << os.str();
1523 }
1524
1525 myGraph_->k_numRowEntries_ = row_entries_type ();
1526
1527 // Keep the new 1-D packed allocations.
1528 // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1529 // We directly set the memory spaces to avoid a memcpy from device to host
1530 myGraph_->rowPtrsUnpacked_dev_ = myGraph_->rowPtrsPacked_dev_;
1531 myGraph_->rowPtrsUnpacked_host_ = myGraph_->rowPtrsPacked_host_;
1532 myGraph_->packedUnpackedRowPtrsMatch_ = true;
1533 myGraph_->lclIndsUnpacked_wdv = myGraph_->lclIndsPacked_wdv;
1534 valuesUnpacked_wdv = valuesPacked_wdv;
1535
1536 myGraph_->storageStatus_ = Details::STORAGE_1D_PACKED;
1537 this->storageStatus_ = Details::STORAGE_1D_PACKED;
1538 }
1539 else {
1540 if (verbose) {
1541 std::ostringstream os;
1542 os << *prefix << "User requested NOT to optimize storage"
1543 << endl;
1544 std::cerr << os.str();
1545 }
1546 }
1547 }
1548
1549 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1550 void
1552 fillLocalMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1553 {
1555 using Teuchos::ArrayRCP;
1556 using Teuchos::Array;
1557 using Teuchos::null;
1558 using Teuchos::RCP;
1559 using Teuchos::rcp;
1560 using std::endl;
1561 using row_map_type = typename Graph::local_graph_device_type::row_map_type;
1562 using non_const_row_map_type = typename row_map_type::non_const_type;
1563 using values_type = typename local_matrix_device_type::values_type;
1564 ProfilingRegion regionFLM("Tpetra::CrsMatrix::fillLocalMatrix");
1565 const size_t lclNumRows = getLocalNumRows();
1566
1567 const bool verbose = Details::Behavior::verbose("CrsMatrix");
1568 std::unique_ptr<std::string> prefix;
1569 if (verbose) {
1570 prefix = this->createPrefix("CrsMatrix", "fillLocalMatrix");
1571 std::ostringstream os;
1572 os << *prefix << "lclNumRows: " << lclNumRows << endl;
1573 std::cerr << os.str ();
1574 }
1575
1576 // The goals of this routine are first, to allocate and fill
1577 // packed 1-D storage (see below for an explanation) in the vals
1578 // array, and second, to give vals to the local matrix and
1579 // finalize the local matrix. We only need k_ptrs, the packed 1-D
1580 // row offsets, within the scope of this routine, since we're only
1581 // filling the local matrix here (use fillLocalGraphAndMatrix() to
1582 // fill both the graph and the matrix at the same time).
1583
1584 // get data from staticGraph_
1585 size_t nodeNumEntries = staticGraph_->getLocalNumEntries ();
1586 size_t nodeNumAllocated = staticGraph_->getLocalAllocationSize ();
1587 row_map_type k_rowPtrs = staticGraph_->rowPtrsPacked_dev_;
1588
1589 row_map_type k_ptrs; // "packed" row offsets array
1590 values_type k_vals; // "packed" values array
1591
1592 // May we ditch the old allocations for the packed (and otherwise
1593 // "optimized") allocations, later in this routine? Request
1594 // optimized storage by default.
1595 bool requestOptimizedStorage = true;
1596 const bool default_OptimizeStorage =
1597 ! isStaticGraph() || staticGraph_->isStorageOptimized();
1598 if (! params.is_null() &&
1599 ! params->get("Optimize Storage", default_OptimizeStorage)) {
1600 requestOptimizedStorage = false;
1601 }
1602 // If we're not allowed to change a static graph, then we can't
1603 // change the storage of the matrix, either. This means that if
1604 // the graph's storage isn't already optimized, we can't optimize
1605 // the matrix's storage either. Check and give warning, as
1606 // appropriate.
1607 if (! staticGraph_->isStorageOptimized () &&
1608 requestOptimizedStorage) {
1610 (true, std::runtime_error, "You requested optimized storage "
1611 "by setting the \"Optimize Storage\" flag to \"true\" in "
1612 "the ParameterList, or by virtue of default behavior. "
1613 "However, the associated CrsGraph was filled separately and "
1614 "requested not to optimize storage. Therefore, the "
1615 "CrsMatrix cannot optimize storage.");
1616 requestOptimizedStorage = false;
1617 }
1618
1619 // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1620 // See: https://github.com/trilinos/Trilinos/issues/12237
1621 //using row_entries_type = decltype (staticGraph_->k_numRowEntries_);
1622 using row_entries_type = typename crs_graph_type::num_row_entries_type;
1623
1624 // The matrix's values are currently
1625 // stored in a 1-D format. However, this format is "unpacked";
1626 // it doesn't necessarily have the same row offsets as indicated
1627 // by the ptrs array returned by allocRowPtrs. This could
1628 // happen, for example, if the user
1629 // fixed the number of matrix entries in
1630 // each row, but didn't fill all those entries.
1631 //
1632 // As above, we don't need to keep the "packed" row offsets
1633 // array ptrs here, but we do need it here temporarily, so we
1634 // have to allocate it. We'll free ptrs later in this method.
1635 //
1636 // Note that this routine checks whether storage has already
1637 // been packed. This is a common case for solution of nonlinear
1638 // PDEs using the finite element method, as long as the
1639 // structure of the sparse matrix does not change between linear
1640 // solves.
1641 if (nodeNumEntries != nodeNumAllocated) {
1642 if (verbose) {
1643 std::ostringstream os;
1644 os << *prefix << "Unpacked 1-D storage: numEnt="
1645 << nodeNumEntries << ", allocSize=" << nodeNumAllocated
1646 << endl;
1647 std::cerr << os.str();
1648 }
1649 // We have to pack the 1-D storage, since the user didn't fill
1650 // up all requested storage.
1651 if (verbose) {
1652 std::ostringstream os;
1653 os << *prefix << "Allocate packed row offsets: "
1654 << (lclNumRows+1) << endl;
1655 std::cerr << os.str();
1656 }
1657 non_const_row_map_type tmpk_ptrs ("Tpetra::CrsGraph::ptr",
1658 lclNumRows+1);
1659 // Total number of entries in the matrix on the calling
1660 // process. We will compute this in the loop below. It's
1661 // cheap to compute and useful as a sanity check.
1662 size_t lclTotalNumEntries = 0;
1663 k_ptrs = tmpk_ptrs;
1664 {
1665 typename row_entries_type::const_type numRowEnt_h =
1666 staticGraph_->k_numRowEntries_;
1667 // This function can handle the counts being a host View.
1668 lclTotalNumEntries =
1669 Details::computeOffsetsFromCounts (tmpk_ptrs, numRowEnt_h);
1670 }
1671
1672 // Allocate the "packed" values array.
1673 // It has exactly the right number of entries.
1674 if (verbose) {
1675 std::ostringstream os;
1676 os << *prefix << "Allocate packed values: "
1677 << lclTotalNumEntries << endl;
1678 std::cerr << os.str ();
1679 }
1680 k_vals = values_type ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
1681
1682 // Pack values_wdv into k_vals. We will replace values_wdv below.
1683 pack_functor<
1684 typename values_type::non_const_type,
1685 typename values_type::const_type,
1686 typename row_map_type::non_const_type,
1687 typename row_map_type::const_type> valsPacker
1688 (k_vals, valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1689 tmpk_ptrs, k_rowPtrs);
1690
1691 using exec_space = typename decltype (k_vals)::execution_space;
1692 using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1693 Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1694 range_type (0, lclNumRows), valsPacker);
1695 valuesPacked_wdv = values_wdv_type(k_vals);
1696 }
1697 else { // We don't have to pack, so just set the pointer.
1698 valuesPacked_wdv = valuesUnpacked_wdv;
1699 if (verbose) {
1700 std::ostringstream os;
1701 os << *prefix << "Storage already packed: "
1702 << "valuesUnpacked_wdv: " << valuesUnpacked_wdv.extent(0) << endl;
1703 std::cerr << os.str();
1704 }
1705 }
1706
1707 // May we ditch the old allocations for the packed one?
1708 if (requestOptimizedStorage) {
1709 // The user requested optimized storage, so we can dump the
1710 // unpacked 1-D storage, and keep the packed storage.
1711 valuesUnpacked_wdv = valuesPacked_wdv;
1712// k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
1713 this->storageStatus_ = Details::STORAGE_1D_PACKED;
1714 }
1715 }
1716
1717 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1718 void
1721 RowInfo& rowInfo,
1722 const typename crs_graph_type::SLocalGlobalViews& newInds,
1723 const Teuchos::ArrayView<impl_scalar_type>& oldRowVals,
1724 const Teuchos::ArrayView<const impl_scalar_type>& newRowVals,
1725 const ELocalGlobal lg,
1726 const ELocalGlobal I)
1727 {
1728 const size_t oldNumEnt = rowInfo.numEntries;
1729 const size_t numInserted = graph.insertIndices (rowInfo, newInds, lg, I);
1730
1731 // Use of memcpy here works around an issue with GCC >= 4.9.0,
1732 // that probably relates to scalar_type vs. impl_scalar_type
1733 // aliasing. See history of Tpetra_CrsGraph_def.hpp for
1734 // details; look for GCC_WORKAROUND macro definition.
1735 if (numInserted > 0) {
1736 const size_t startOffset = oldNumEnt;
1737 memcpy ((void*) &oldRowVals[startOffset], &newRowVals[0],
1738 numInserted * sizeof (impl_scalar_type));
1739 }
1740 }
1741
1742 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1743 void
1745 insertLocalValues (const LocalOrdinal lclRow,
1746 const Teuchos::ArrayView<const LocalOrdinal>& indices,
1747 const Teuchos::ArrayView<const Scalar>& values,
1748 const CombineMode CM)
1749 {
1750 using std::endl;
1751 const char tfecfFuncName[] = "insertLocalValues: ";
1752
1753 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1754 (! this->isFillActive (), std::runtime_error,
1755 "Fill is not active. After calling fillComplete, you must call "
1756 "resumeFill before you may insert entries into the matrix again.");
1757 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1758 (this->isStaticGraph (), std::runtime_error,
1759 "Cannot insert indices with static graph; use replaceLocalValues() "
1760 "instead.");
1761 // At this point, we know that myGraph_ is nonnull.
1762 crs_graph_type& graph = * (this->myGraph_);
1763 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1764 (graph.colMap_.is_null (), std::runtime_error,
1765 "Cannot insert local indices without a column map.");
1766 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1767 (graph.isGloballyIndexed (),
1768 std::runtime_error, "Graph indices are global; use "
1769 "insertGlobalValues().");
1770 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1771 (values.size () != indices.size (), std::runtime_error,
1772 "values.size() = " << values.size ()
1773 << " != indices.size() = " << indices.size () << ".");
1774 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1775 ! graph.rowMap_->isNodeLocalElement (lclRow), std::runtime_error,
1776 "Local row index " << lclRow << " does not belong to this process.");
1777
1778 if (! graph.indicesAreAllocated ()) {
1779 // We only allocate values at most once per process, so it's OK
1780 // to check TPETRA_VERBOSE here.
1781 const bool verbose = Details::Behavior::verbose("CrsMatrix");
1782 this->allocateValues (LocalIndices, GraphNotYetAllocated, verbose);
1783 }
1784
1785#ifdef HAVE_TPETRA_DEBUG
1786 const size_t numEntriesToAdd = static_cast<size_t> (indices.size ());
1787 // In a debug build, test whether any of the given column indices
1788 // are not in the column Map. Keep track of the invalid column
1789 // indices so we can tell the user about them.
1790 {
1791 using Teuchos::toString;
1792
1793 const map_type& colMap = * (graph.colMap_);
1794 Teuchos::Array<LocalOrdinal> badColInds;
1795 bool allInColMap = true;
1796 for (size_t k = 0; k < numEntriesToAdd; ++k) {
1797 if (! colMap.isNodeLocalElement (indices[k])) {
1798 allInColMap = false;
1799 badColInds.push_back (indices[k]);
1801 }
1802 if (! allInColMap) {
1803 std::ostringstream os;
1804 os << "You attempted to insert entries in owned row " << lclRow
1805 << ", at the following column indices: " << toString (indices)
1806 << "." << endl;
1807 os << "Of those, the following indices are not in the column Map on "
1808 "this process: " << toString (badColInds) << "." << endl << "Since "
1809 "the matrix has a column Map already, it is invalid to insert "
1810 "entries at those locations.";
1811 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1812 (true, std::invalid_argument, os.str ());
1813 }
1814 }
1815#endif // HAVE_TPETRA_DEBUG
1816
1817 RowInfo rowInfo = graph.getRowInfo (lclRow);
1818
1819 auto valsView = this->getValuesViewHostNonConst(rowInfo);
1820 if (CM == ADD) {
1821 auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1822 valsView[offset] += values[k]; };
1823 std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1824 graph.insertLocalIndicesImpl(lclRow, indices, cb);
1825 } else if (CM == INSERT) {
1826 auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1827 valsView[offset] = values[k]; };
1828 std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1829 graph.insertLocalIndicesImpl(lclRow, indices, cb);
1830 } else {
1831 std::ostringstream os;
1832 os << "You attempted to use insertLocalValues with CombineMode " << combineModeToString(CM)
1833 << "but this has not been implemented." << endl;
1834 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1835 (true, std::invalid_argument, os.str ());
1836 }
1837 }
1838
1839 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1840 void
1842 insertLocalValues (const LocalOrdinal localRow,
1843 const LocalOrdinal numEnt,
1844 const Scalar vals[],
1845 const LocalOrdinal cols[],
1846 const CombineMode CM)
1847 {
1848 Teuchos::ArrayView<const LocalOrdinal> colsT (cols, numEnt);
1849 Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
1850 this->insertLocalValues (localRow, colsT, valsT, CM);
1851 }
1852
// Implementation helper that combines input values into one row, given
// global column indices.  It names itself "insertGlobalValuesImpl" in
// tfecfFuncName below.
// NOTE(review): this listing skips the numbered lines that would carry
// the class qualifier, the function name, and the declaration of 'graph'
// (used throughout the body) -- confirm against the real source file.
//
// rowInfo      In/out: refreshed from the graph if this call triggers
//              allocation.
// gblColInds   Global column indices; numInputEnt of them.
// vals         Values; vals[k] pairs with gblColInds[k].
// numInputEnt  Number of input entries.
1853 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1854 void
1857 RowInfo& rowInfo,
1858 const GlobalOrdinal gblColInds[],
1859 const impl_scalar_type vals[],
1860 const size_t numInputEnt)
1861 {
1862#ifdef HAVE_TPETRA_DEBUG
1863 const char tfecfFuncName[] = "insertGlobalValuesImpl: ";
1864 const size_t origNumEnt = graph.getNumEntriesInLocalRow (rowInfo.localRow);
1865 const size_t curNumEnt = rowInfo.numEntries;
1866#endif // HAVE_TPETRA_DEBUG
1867
1868 if (! graph.indicesAreAllocated ()) {
1869 // We only allocate values at most once per process, so it's OK
1870 // to check TPETRA_VERBOSE here.
1872 const bool verbose = Behavior::verbose("CrsMatrix");
1873 this->allocateValues (GlobalIndices, GraphNotYetAllocated, verbose);
1874 // mfh 23 Jul 2017: allocateValues invalidates existing
1875 // getRowInfo results. Once we get rid of lazy graph
1876 // allocation, we'll be able to move the getRowInfo call outside
1877 // of this method.
1878 rowInfo = graph.getRowInfo (rowInfo.localRow);
1879 }
1880
// The callback handed to the graph adds vals[k] to the value slot at
// the offset the graph reports for input entry k.
1881 auto valsView = this->getValuesViewHostNonConst(rowInfo);
1882 auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset){
1883 valsView[offset] += vals[k];
1884 };
1885 std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1886#ifdef HAVE_TPETRA_DEBUG
1887 //numInserted is only used inside the debug code below.
1888 auto numInserted =
1889#endif
1890 graph.insertGlobalIndicesImpl(rowInfo, gblColInds, numInputEnt, cb);
1891
1892#ifdef HAVE_TPETRA_DEBUG
// Debug-only consistency check: the entry count recorded before the
// insert plus the number of newly inserted indices must equal the
// graph's current entry count for this row.  On mismatch, build a
// detailed report and throw std::logic_error.
1893 size_t newNumEnt = curNumEnt + numInserted;
1894 const size_t chkNewNumEnt =
1895 graph.getNumEntriesInLocalRow (rowInfo.localRow);
1896 if (chkNewNumEnt != newNumEnt) {
1897 std::ostringstream os;
1898 os << std::endl << "newNumEnt = " << newNumEnt
1899 << " != graph.getNumEntriesInLocalRow(" << rowInfo.localRow
1900 << ") = " << chkNewNumEnt << "." << std::endl
1901 << "\torigNumEnt: " << origNumEnt << std::endl
1902 << "\tnumInputEnt: " << numInputEnt << std::endl
1903 << "\tgblColInds: [";
1904 for (size_t k = 0; k < numInputEnt; ++k) {
1905 os << gblColInds[k];
1906 if (k + size_t (1) < numInputEnt) {
1907 os << ",";
1908 }
1909 }
1910 os << "]" << std::endl
1911 << "\tvals: [";
1912 for (size_t k = 0; k < numInputEnt; ++k) {
1913 os << vals[k];
1914 if (k + size_t (1) < numInputEnt) {
1915 os << ",";
1916 }
1917 }
1918 os << "]" << std::endl;
1919
// If row views are available, also print the row's current contents,
// using whichever index space (global or local) the matrix is in.
1920 if (this->supportsRowViews ()) {
1921 values_host_view_type vals2;
1922 if (this->isGloballyIndexed ()) {
1923 global_inds_host_view_type gblColInds2;
1924 const GlobalOrdinal gblRow =
1925 graph.rowMap_->getGlobalElement (rowInfo.localRow);
1926 if (gblRow ==
1927 Tpetra::Details::OrdinalTraits<GlobalOrdinal>::invalid ()) {
1928 os << "Local row index " << rowInfo.localRow << " is invalid!"
1929 << std::endl;
1930 }
1931 else {
// Guard the view fetch so that a failure here is appended to the
// report instead of replacing the error being reported.
1932 bool getViewThrew = false;
1933 try {
1934 this->getGlobalRowView (gblRow, gblColInds2, vals2);
1935 }
1936 catch (std::exception& e) {
1937 getViewThrew = true;
1938 os << "getGlobalRowView threw exception:" << std::endl
1939 << e.what () << std::endl;
1940 }
1941 if (! getViewThrew) {
1942 os << "\tNew global column indices: ";
1943 for (size_t jjj = 0; jjj < gblColInds2.extent(0); jjj++)
1944 os << gblColInds2[jjj] << " ";
1945 os << std::endl;
1946 os << "\tNew values: ";
1947 for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1948 os << vals2[jjj] << " ";
1949 os << std::endl;
1950 }
1951 }
1952 }
1953 else if (this->isLocallyIndexed ()) {
1954 local_inds_host_view_type lclColInds2;
1955 this->getLocalRowView (rowInfo.localRow, lclColInds2, vals2);
1956 os << "\tNew local column indices: ";
1957 for (size_t jjj = 0; jjj < lclColInds2.extent(0); jjj++)
1958 os << lclColInds2[jjj] << " ";
1959 os << std::endl;
1960 os << "\tNew values: ";
1961 for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1962 os << vals2[jjj] << " ";
1963 os << std::endl;
1964 }
1965 }
1966
1967 os << "Please report this bug to the Tpetra developers.";
1968 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1969 (true, std::logic_error, os.str ());
1970 }
1971#endif // HAVE_TPETRA_DEBUG
1972 }
1973
// Insert entries into the row with global index gblRow, addressing
// columns by global index.
//
// gblRow   Global row index.  If it is not in the row Map on the calling
//          process, the input is handed to insertNonownedGlobalValues.
// indices  Global column indices.
// values   Values; the length match with indices is only checked in a
//          debug build.
//
// For an owned row this throws std::runtime_error if the graph is
// static, and std::invalid_argument if the graph has a column Map and
// any input column index is not in it.
1974 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1975 void
1977 insertGlobalValues (const GlobalOrdinal gblRow,
1978 const Teuchos::ArrayView<const GlobalOrdinal>& indices,
1979 const Teuchos::ArrayView<const Scalar>& values)
1980 {
1981 using Teuchos::toString;
1982 using std::endl;
1983 typedef impl_scalar_type IST;
1984 typedef LocalOrdinal LO;
1985 typedef GlobalOrdinal GO;
1986 typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
1987 typedef typename Teuchos::ArrayView<const GO>::size_type size_type;
1988 const char tfecfFuncName[] = "insertGlobalValues: ";
1989
1990#ifdef HAVE_TPETRA_DEBUG
1991 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1992 (values.size () != indices.size (), std::runtime_error,
1993 "values.size() = " << values.size () << " != indices.size() = "
1994 << indices.size () << ".");
1995#endif // HAVE_TPETRA_DEBUG
1996
1997 // getRowMap() is not thread safe, because it increments RCP's
1998 // reference count. getCrsGraphRef() is thread safe.
1999 const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2000 const LO lclRow = rowMap.getLocalElement (gblRow);
2001
2002 if (lclRow == OTLO::invalid ()) {
2003 // Input row is _not_ owned by the calling process.
2004 //
2005 // See a note (now deleted) from mfh 14 Dec 2012: If input row
2006 // is not in the row Map, it doesn't matter whether or not the
2007 // graph is static; the data just get stashed for later use by
2008 // globalAssemble().
2009 this->insertNonownedGlobalValues (gblRow, indices, values);
2010 }
2011 else { // Input row _is_ owned by the calling process
2012 if (this->isStaticGraph ()) {
2013 // Uh oh! Not allowed to insert into owned rows in that case.
2014 const int myRank = rowMap.getComm ()->getRank ();
2015 const int numProcs = rowMap.getComm ()->getSize ();
2016 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2017 (true, std::runtime_error,
2018 "The matrix was constructed with a constant (\"static\") graph, "
2019 "yet the given global row index " << gblRow << " is in the row "
2020 "Map on the calling process (with rank " << myRank << ", of " <<
2021 numProcs << " process(es)). In this case, you may not insert "
2022 "new entries into rows owned by the calling process.");
2023 }
2024
// The implementation routine takes raw pointers of impl_scalar_type,
// so view the caller's Scalar array through that type.
2025 crs_graph_type& graph = * (this->myGraph_);
2026 const IST* const inputVals =
2027 reinterpret_cast<const IST*> (values.getRawPtr ());
2028 const GO* const inputGblColInds = indices.getRawPtr ();
2029 const size_t numInputEnt = indices.size ();
2030 RowInfo rowInfo = graph.getRowInfo (lclRow);
2031
2032 // If the matrix has a column Map, check at this point whether
2033 // the column indices belong to the column Map.
2034 //
2035 // FIXME (mfh 16 May 2013) We may want to consider deferring the
2036 // test to the CrsGraph method, since it may have to do this
2037 // anyway.
2038 if (! graph.colMap_.is_null ()) {
2039 const map_type& colMap = * (graph.colMap_);
2040 // In a debug build, keep track of the nonowned ("bad") column
2041 // indices, so that we can display them in the exception
2042 // message. In a release build, just ditch the loop early if
2043 // we encounter a nonowned column index.
2044#ifdef HAVE_TPETRA_DEBUG
2045 Teuchos::Array<GO> badColInds;
2046#endif // HAVE_TPETRA_DEBUG
2047 const size_type numEntriesToInsert = indices.size ();
2048 bool allInColMap = true;
2049 for (size_type k = 0; k < numEntriesToInsert; ++k) {
2050 if (! colMap.isNodeGlobalElement (indices[k])) {
2051 allInColMap = false;
2052#ifdef HAVE_TPETRA_DEBUG
2053 badColInds.push_back (indices[k]);
2054#else
2055 break;
2056#endif // HAVE_TPETRA_DEBUG
2057 }
2058 }
2059 if (! allInColMap) {
2060 std::ostringstream os;
2061 os << "You attempted to insert entries in owned row " << gblRow
2062 << ", at the following column indices: " << toString (indices)
2063 << "." << endl;
2064#ifdef HAVE_TPETRA_DEBUG
2065 os << "Of those, the following indices are not in the column Map "
2066 "on this process: " << toString (badColInds) << "." << endl
2067 << "Since the matrix has a column Map already, it is invalid "
2068 "to insert entries at those locations.";
2069#else
2070 os << "At least one of those indices is not in the column Map "
2071 "on this process." << endl << "It is invalid to insert into "
2072 "columns not in the column Map on the process that owns the "
2073 "row.";
2074#endif // HAVE_TPETRA_DEBUG
2075 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2076 (true, std::invalid_argument, os.str ());
2077 }
2078 }
2079
// All checks passed; hand the whole input off in one call.
2080 this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2081 inputVals, numInputEnt);
2082 }
2083 }
2084
2085
2086 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2087 void
2089 insertGlobalValues (const GlobalOrdinal globalRow,
2090 const LocalOrdinal numEnt,
2091 const Scalar vals[],
2092 const GlobalOrdinal inds[])
2094 Teuchos::ArrayView<const GlobalOrdinal> indsT (inds, numEnt);
2095 Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
2096 this->insertGlobalValues (globalRow, indsT, valsT);
2097 }
2098
2099
// Like insertGlobalValues, but for an owned row with a column Map this
// skips input column indices that are not in the column Map on the
// calling process instead of treating them as an error.  It names itself
// "insertGlobalValuesFiltered" in tfecfFuncName below.
// NOTE(review): this listing skips some numbered lines (the class
// qualifier and function-name lines, and a few lone braces) -- confirm
// against the real source file.
//
// gblRow   Global row index.  If it is not in the row Map on the calling
//          process, the input is handed to insertNonownedGlobalValues.
// indices  Global column indices.
// values   Values; the length match with indices is checked only when
//          debug is true.
// debug    Enables the size check and the loop-invariant checks.
//
// Throws std::runtime_error if the row is owned and the graph is static.
2100 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2101 void
2104 const GlobalOrdinal gblRow,
2105 const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2106 const Teuchos::ArrayView<const Scalar>& values,
2107 const bool debug)
2108 {
2109 typedef impl_scalar_type IST;
2110 typedef LocalOrdinal LO;
2111 typedef GlobalOrdinal GO;
2112 typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
2113 const char tfecfFuncName[] = "insertGlobalValuesFiltered: ";
2114
2115 if (debug) {
2116 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2117 (values.size () != indices.size (), std::runtime_error,
2118 "values.size() = " << values.size () << " != indices.size() = "
2119 << indices.size () << ".");
2120 }
2121
2122 // getRowMap() is not thread safe, because it increments RCP's
2123 // reference count. getCrsGraphRef() is thread safe.
2124 const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2125 const LO lclRow = rowMap.getLocalElement (gblRow);
2126 if (lclRow == OTLO::invalid ()) {
2127 // Input row is _not_ owned by the calling process.
2128 //
2129 // See a note (now deleted) from mfh 14 Dec 2012: If input row
2130 // is not in the row Map, it doesn't matter whether or not the
2131 // graph is static; the data just get stashed for later use by
2132 // globalAssemble().
2133 this->insertNonownedGlobalValues (gblRow, indices, values);
2134 }
2135 else { // Input row _is_ owned by the calling process
2136 if (this->isStaticGraph ()) {
2137 // Uh oh! Not allowed to insert into owned rows in that case.
2138 const int myRank = rowMap.getComm ()->getRank ();
2139 const int numProcs = rowMap.getComm ()->getSize ();
2140 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2141 (true, std::runtime_error,
2142 "The matrix was constructed with a constant (\"static\") graph, "
2143 "yet the given global row index " << gblRow << " is in the row "
2144 "Map on the calling process (with rank " << myRank << ", of " <<
2145 numProcs << " process(es)). In this case, you may not insert "
2146 "new entries into rows owned by the calling process.");
2147 }
2148
2149 crs_graph_type& graph = * (this->myGraph_);
2150 const IST* const inputVals =
2151 reinterpret_cast<const IST*> (values.getRawPtr ());
2152 const GO* const inputGblColInds = indices.getRawPtr ();
2153 const size_t numInputEnt = indices.size ();
2154 RowInfo rowInfo = graph.getRowInfo (lclRow);
2155
2156 if (!graph.colMap_.is_null() && graph.isLocallyIndexed()) {
2157 // This branch is similar in function to the following branch, but for
2158 // the special case that the target graph is locally indexed.
2159 // In this case, we cannot simply filter
2160 // out global indices that don't exist on the receiving process and
2161 // insert the remaining (global) indices, but we must convert them (the
2162 // remaining global indices) to local and call `insertLocalValues`.
2163 const map_type& colMap = * (graph.colMap_);
2164 size_t curOffset = 0;
2165 while (curOffset < numInputEnt) {
2166 // Find a sequence of input indices that are in the column Map on the
2167 // calling process. Doing a sequence at a time, instead of one at a
2168 // time, amortizes some overhead.
2169 Teuchos::Array<LO> lclIndices;
2170 size_t endOffset = curOffset;
2171 for ( ; endOffset < numInputEnt; ++endOffset) {
2172 auto lclIndex = colMap.getLocalElement(inputGblColInds[endOffset]);
2173 if (lclIndex != OTLO::invalid())
2174 lclIndices.push_back(lclIndex);
2175 else
2176 break;
2178 // curOffset, endOffset: half-exclusive range of indices in the column
2179 // Map on the calling process. If endOffset == curOffset, the range is
2180 // empty.
2181 const LO numIndInSeq = (endOffset - curOffset);
2182 if (numIndInSeq != 0) {
2183 this->insertLocalValues(lclRow, lclIndices(), values(curOffset, numIndInSeq));
2184 }
2185 // Invariant before the increment line: Either endOffset ==
2186 // numInputEnt, or inputGblColInds[endOffset] is not in the column Map
2187 // on the calling process.
2188 if (debug) {
2189 const bool invariant = endOffset == numInputEnt ||
2190 colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2191 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2192 (! invariant, std::logic_error, std::endl << "Invariant failed!");
2193 }
// The "+ 1" steps over the index at endOffset, which (per the invariant
// above) is not in the column Map; that is what filters it out.
2194 curOffset = endOffset + 1;
2195 }
2196 }
2197 else if (! graph.colMap_.is_null ()) { // We have a column Map.
2198 const map_type& colMap = * (graph.colMap_);
2199 size_t curOffset = 0;
2200 while (curOffset < numInputEnt) {
2201 // Find a sequence of input indices that are in the column
2202 // Map on the calling process. Doing a sequence at a time,
2203 // instead of one at a time, amortizes some overhead.
2204 size_t endOffset = curOffset;
2205 for ( ; endOffset < numInputEnt &&
2206 colMap.getLocalElement (inputGblColInds[endOffset]) != OTLO::invalid ();
2207 ++endOffset)
2209 // curOffset, endOffset: half-exclusive range of indices in
2210 // the column Map on the calling process. If endOffset ==
2211 // curOffset, the range is empty.
2212 const LO numIndInSeq = (endOffset - curOffset);
2213 if (numIndInSeq != 0) {
2214 rowInfo = graph.getRowInfo(lclRow); // KDD 5/19 Need fresh RowInfo in each loop iteration
2215 this->insertGlobalValuesImpl (graph, rowInfo,
2216 inputGblColInds + curOffset,
2217 inputVals + curOffset,
2218 numIndInSeq);
2219 }
2220 // Invariant before the increment line: Either endOffset ==
2221 // numInputEnt, or inputGblColInds[endOffset] is not in the
2222 // column Map on the calling process.
2223 if (debug) {
2224 const bool invariant = endOffset == numInputEnt ||
2225 colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2226 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2227 (! invariant, std::logic_error, std::endl << "Invariant failed!");
2228 }
// As in the branch above: skip the one index that ended this run.
2229 curOffset = endOffset + 1;
2230 }
2231 }
2232 else { // we don't have a column Map.
// Without a column Map there is nothing to filter against, so pass the
// whole input through in one call.
2233 this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2234 inputVals, numInputEnt);
2235 }
2236 }
2238
2239 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2240 void
2243 const GlobalOrdinal gblRow,
2244 const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2245 const Teuchos::ArrayView<const Scalar>& values,
2246 const char* const prefix,
2247 const bool debug,
2248 const bool verbose)
2249 {
2251 using std::endl;
2252
2253 try {
2254 insertGlobalValuesFiltered(gblRow, indices, values, debug);
2255 }
2256 catch(std::exception& e) {
2257 std::ostringstream os;
2258 if (verbose) {
2259 const size_t maxNumToPrint =
2261 os << *prefix << ": insertGlobalValuesFiltered threw an "
2262 "exception: " << e.what() << endl
2263 << "Global row index: " << gblRow << endl;
2264 verbosePrintArray(os, indices, "Global column indices",
2265 maxNumToPrint);
2266 os << endl;
2267 verbosePrintArray(os, values, "Values", maxNumToPrint);
2268 os << endl;
2269 }
2270 else {
2271 os << ": insertGlobalValuesFiltered threw an exception: "
2272 << e.what();
2273 }
2274 TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, os.str());
2276 }
2277
2278 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2279 LocalOrdinal
2282 const crs_graph_type& graph,
2283 const RowInfo& rowInfo,
2284 const LocalOrdinal inds[],
2285 const impl_scalar_type newVals[],
2286 const LocalOrdinal numElts)
2287 {
2288 typedef LocalOrdinal LO;
2289 typedef GlobalOrdinal GO;
2290 const bool sorted = graph.isSorted ();
2291
2292 size_t hint = 0; // Guess for the current index k into rowVals
2293 LO numValid = 0; // number of valid local column indices
2294
2295 if (graph.isLocallyIndexed ()) {
2296 // Get a view of the column indices in the row. This amortizes
2297 // the cost of getting the view over all the entries of inds.
2298 auto colInds = graph.getLocalIndsViewHost (rowInfo);
2299
2300 for (LO j = 0; j < numElts; ++j) {
2301 const LO lclColInd = inds[j];
2302 const size_t offset =
2303 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2304 lclColInd, hint, sorted);
2305 if (offset != rowInfo.numEntries) {
2306 rowVals[offset] = newVals[j];
2307 hint = offset + 1;
2308 ++numValid;
2309 }
2310 }
2311 }
2312 else if (graph.isGloballyIndexed ()) {
2313 if (graph.colMap_.is_null ()) {
2314 return Teuchos::OrdinalTraits<LO>::invalid ();
2315 }
2316 const map_type colMap = * (graph.colMap_);
2317
2318 // Get a view of the column indices in the row. This amortizes
2319 // the cost of getting the view over all the entries of inds.
2320 auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2321
2322 for (LO j = 0; j < numElts; ++j) {
2323 const GO gblColInd = colMap.getGlobalElement (inds[j]);
2324 if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
2325 const size_t offset =
2326 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2327 gblColInd, hint, sorted);
2328 if (offset != rowInfo.numEntries) {
2329 rowVals[offset] = newVals[j];
2330 hint = offset + 1;
2331 ++numValid;
2332 }
2333 }
2334 }
2335 }
2336 // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
2337 // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
2338 // to be neither locally nor globally indexed on a process.
2339 // This means that the graph or matrix has no entries on that
2340 // process. Epetra also works like this. It's related to lazy
2341 // allocation (on first insertion, not at graph / matrix
2342 // construction). Lazy allocation will go away because it is
2343 // not thread scalable.
2344
2345 return numValid;
2346 }
2347
2348 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2349 LocalOrdinal
2351 replaceLocalValues (const LocalOrdinal localRow,
2352 const Teuchos::ArrayView<const LocalOrdinal>& lclCols,
2353 const Teuchos::ArrayView<const Scalar>& vals)
2354 {
2355 typedef LocalOrdinal LO;
2356
2357 const LO numInputEnt = static_cast<LO> (lclCols.size ());
2358 if (static_cast<LO> (vals.size ()) != numInputEnt) {
2359 return Teuchos::OrdinalTraits<LO>::invalid ();
2361 const LO* const inputInds = lclCols.getRawPtr ();
2362 const Scalar* const inputVals = vals.getRawPtr ();
2363 return this->replaceLocalValues (localRow, numInputEnt,
2364 inputVals, inputInds);
2366
2367 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2368 typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2369 local_ordinal_type
2372 const local_ordinal_type localRow,
2373 const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2374 const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2375 {
2376 using LO = local_ordinal_type;
2377 const LO numInputEnt = inputInds.extent(0);
2378 if (numInputEnt != static_cast<LO>(inputVals.extent(0))) {
2379 return Teuchos::OrdinalTraits<LO>::invalid();
2380 }
2381 const Scalar* const inVals =
2382 reinterpret_cast<const Scalar*>(inputVals.data());
2383 return this->replaceLocalValues(localRow, numInputEnt,
2384 inVals, inputInds.data());
2385 }
2386
2387 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2388 LocalOrdinal
2390 replaceLocalValues (const LocalOrdinal localRow,
2391 const LocalOrdinal numEnt,
2392 const Scalar inputVals[],
2393 const LocalOrdinal inputCols[])
2394 {
2395 typedef impl_scalar_type IST;
2396 typedef LocalOrdinal LO;
2397
2398 if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2399 // Fill must be active and the "nonconst" graph must exist.
2400 return Teuchos::OrdinalTraits<LO>::invalid ();
2401 }
2402 const crs_graph_type& graph = * (this->staticGraph_);
2403 const RowInfo rowInfo = graph.getRowInfo (localRow);
2404
2405 if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2406 // The calling process does not own this row, so it is not
2407 // allowed to modify its values.
2408 return static_cast<LO> (0);
2409 }
2410 auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2411 const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2412 return this->replaceLocalValuesImpl (curRowVals.data (), graph, rowInfo,
2413 inputCols, inVals, numEnt);
2415
2416 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2417 LocalOrdinal
2420 const crs_graph_type& graph,
2421 const RowInfo& rowInfo,
2422 const GlobalOrdinal inds[],
2423 const impl_scalar_type newVals[],
2424 const LocalOrdinal numElts)
2425 {
2426 Teuchos::ArrayView<const GlobalOrdinal> indsT(inds, numElts);
2427 auto fun =
2428 [&](size_t const k, size_t const /*start*/, size_t const offset) {
2429 rowVals[offset] = newVals[k];
2430 };
2431 std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
2432 return graph.findGlobalIndices(rowInfo, indsT, cb);
2433 }
2434
2435 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2436 LocalOrdinal
2438 replaceGlobalValues (const GlobalOrdinal globalRow,
2439 const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2440 const Teuchos::ArrayView<const Scalar>& inputVals)
2441 {
2442 typedef LocalOrdinal LO;
2443
2444 const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2445 if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2446 return Teuchos::OrdinalTraits<LO>::invalid ();
2447 }
2448 return this->replaceGlobalValues (globalRow, numInputEnt,
2449 inputVals.getRawPtr (),
2450 inputGblColInds.getRawPtr ());
2451 }
2452
2453 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2454 LocalOrdinal
2456 replaceGlobalValues (const GlobalOrdinal globalRow,
2457 const LocalOrdinal numEnt,
2458 const Scalar inputVals[],
2459 const GlobalOrdinal inputGblColInds[])
2460 {
2461 typedef impl_scalar_type IST;
2462 typedef LocalOrdinal LO;
2463
2464 if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2465 // Fill must be active and the "nonconst" graph must exist.
2466 return Teuchos::OrdinalTraits<LO>::invalid ();
2467 }
2468 const crs_graph_type& graph = * (this->staticGraph_);
2469
2470 const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (globalRow);
2471 if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2472 // The input local row is invalid on the calling process,
2473 // which means that the calling process summed 0 entries.
2474 return static_cast<LO> (0);
2475 }
2476
2477 auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2478 const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2479 return this->replaceGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2480 inputGblColInds, inVals, numEnt);
2481 }
2482
2483 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2484 typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2485 local_ordinal_type
2488 const global_ordinal_type globalRow,
2489 const Kokkos::View<const global_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2490 const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2491 {
2492 // We use static_assert here to check the template parameters,
2493 // rather than std::enable_if (e.g., on the return value, to
2494 // enable compilation only if the template parameters match the
2495 // desired attributes). This turns obscure link errors into
2496 // clear compilation errors. It also makes the return value a
2497 // lot easier to see.
2498 using LO = local_ordinal_type;
2499 const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
2500 if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
2501 return Teuchos::OrdinalTraits<LO>::invalid();
2502 }
2503 const Scalar* const inVals =
2504 reinterpret_cast<const Scalar*>(inputVals.data());
2505 return this->replaceGlobalValues(globalRow, numInputEnt, inVals,
2506 inputInds.data());
2508
2509 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2510 LocalOrdinal
2513 const crs_graph_type& graph,
2514 const RowInfo& rowInfo,
2515 const GlobalOrdinal inds[],
2516 const impl_scalar_type newVals[],
2517 const LocalOrdinal numElts,
2518 const bool atomic)
2520 typedef LocalOrdinal LO;
2521 typedef GlobalOrdinal GO;
2522
2523 const bool sorted = graph.isSorted ();
2524
2525 size_t hint = 0; // guess at the index's relative offset in the row
2526 LO numValid = 0; // number of valid input column indices
2527
2528 if (graph.isLocallyIndexed ()) {
2529 // NOTE (mfh 04 Nov 2015) Dereferencing an RCP or reading its
2530 // pointer does NOT change its reference count. Thus, this
2531 // code is still thread safe.
2532 if (graph.colMap_.is_null ()) {
2533 // NO input column indices are valid in this case, since if
2534 // the column Map is null on the calling process, then the
2535 // calling process owns no graph entries.
2536 return numValid;
2537 }
2538 const map_type& colMap = * (graph.colMap_);
2539
2540 // Get a view of the column indices in the row. This amortizes
2541 // the cost of getting the view over all the entries of inds.
2542 auto colInds = graph.getLocalIndsViewHost (rowInfo);
2543 const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2544
2545 for (LO j = 0; j < numElts; ++j) {
2546 const LO lclColInd = colMap.getLocalElement (inds[j]);
2547 if (lclColInd != LINV) {
2548 const size_t offset =
2549 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2550 lclColInd, hint, sorted);
2551 if (offset != rowInfo.numEntries) {
2552 if (atomic) {
2553 Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2554 }
2555 else {
2556 rowVals[offset] += newVals[j];
2557 }
2558 hint = offset + 1;
2559 numValid++;
2560 }
2561 }
2562 }
2563 }
2564 else if (graph.isGloballyIndexed ()) {
2565 // Get a view of the column indices in the row. This amortizes
2566 // the cost of getting the view over all the entries of inds.
2567 auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2568
2569 for (LO j = 0; j < numElts; ++j) {
2570 const GO gblColInd = inds[j];
2571 const size_t offset =
2572 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2573 gblColInd, hint, sorted);
2574 if (offset != rowInfo.numEntries) {
2575 if (atomic) {
2576 Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2577 }
2578 else {
2579 rowVals[offset] += newVals[j];
2580 }
2581 hint = offset + 1;
2582 numValid++;
2583 }
2584 }
2585 }
2586 // If the graph is neither locally nor globally indexed on the
2587 // calling process, that means the calling process has no graph
2588 // entries. Thus, none of the input column indices are valid.
2589
2590 return numValid;
2591 }
2592
2593 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2594 LocalOrdinal
2596 sumIntoGlobalValues (const GlobalOrdinal gblRow,
2597 const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2598 const Teuchos::ArrayView<const Scalar>& inputVals,
2599 const bool atomic)
2600 {
2601 typedef LocalOrdinal LO;
2602
2603 const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2604 if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2605 return Teuchos::OrdinalTraits<LO>::invalid ();
2606 }
2607 return this->sumIntoGlobalValues (gblRow, numInputEnt,
2608 inputVals.getRawPtr (),
2609 inputGblColInds.getRawPtr (),
2610 atomic);
2611 }
2612
2613 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2614 LocalOrdinal
2616 sumIntoGlobalValues (const GlobalOrdinal gblRow,
2617 const LocalOrdinal numInputEnt,
2618 const Scalar inputVals[],
2619 const GlobalOrdinal inputGblColInds[],
2620 const bool atomic)
2621 {
2622 typedef impl_scalar_type IST;
2623 typedef LocalOrdinal LO;
2624 typedef GlobalOrdinal GO;
2626 if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2627 // Fill must be active and the "nonconst" graph must exist.
2628 return Teuchos::OrdinalTraits<LO>::invalid ();
2629 }
2630 const crs_graph_type& graph = * (this->staticGraph_);
2631
2632 const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2633 if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2634 // mfh 23 Mar 2017, 26 Jul 2017: This branch may not be not
2635 // thread safe in a debug build, in part because it uses
2636 // Teuchos::ArrayView, and in part because of the data structure
2637 // used to stash outgoing entries.
2638 using Teuchos::ArrayView;
2639 ArrayView<const GO> inputGblColInds_av(
2640 numInputEnt == 0 ? nullptr : inputGblColInds,
2641 numInputEnt);
2642 ArrayView<const Scalar> inputVals_av(
2643 numInputEnt == 0 ? nullptr :
2644 inputVals, numInputEnt);
2645 // gblRow is not in the row Map on the calling process, so stash
2646 // the given entries away in a separate data structure.
2647 // globalAssemble() (called during fillComplete()) will exchange
2648 // that data and sum it in using sumIntoGlobalValues().
2649 this->insertNonownedGlobalValues (gblRow, inputGblColInds_av,
2650 inputVals_av);
2651 // FIXME (mfh 08 Jul 2014) It's not clear what to return here,
2652 // since we won't know whether the given indices were valid
2653 // until globalAssemble (called in fillComplete) is called.
2654 // That's why insertNonownedGlobalValues doesn't return
2655 // anything. Just for consistency, I'll return the number of
2656 // entries that the user gave us.
2657 return numInputEnt;
2658 }
2659 else { // input row is in the row Map on the calling process
2660 auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2661 const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2662 return this->sumIntoGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2663 inputGblColInds, inVals,
2664 numInputEnt, atomic);
2665 }
2666 }
2667
2668 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2669 LocalOrdinal
2670 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2671 transformLocalValues (const LocalOrdinal lclRow,
2672 const LocalOrdinal numInputEnt,
2673 const impl_scalar_type inputVals[],
2674 const LocalOrdinal inputCols[],
2675 std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2676 const bool atomic)
2677 {
2678 using Tpetra::Details::OrdinalTraits;
2679 typedef LocalOrdinal LO;
2681 if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2682 // Fill must be active and the "nonconst" graph must exist.
2683 return Teuchos::OrdinalTraits<LO>::invalid ();
2684 }
2685 const crs_graph_type& graph = * (this->staticGraph_);
2686 const RowInfo rowInfo = graph.getRowInfo (lclRow);
2687
2688 if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2689 // The calling process does not own this row, so it is not
2690 // allowed to modify its values.
2691 return static_cast<LO> (0);
2692 }
2693 auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2694 return this->transformLocalValues (curRowVals.data (), graph,
2695 rowInfo, inputCols, inputVals,
2696 numInputEnt, f, atomic);
2697 }
2698
2699 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2700 LocalOrdinal
2702 transformGlobalValues (const GlobalOrdinal gblRow,
2703 const LocalOrdinal numInputEnt,
2704 const impl_scalar_type inputVals[],
2705 const GlobalOrdinal inputCols[],
2706 std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2707 const bool atomic)
2708 {
2709 using Tpetra::Details::OrdinalTraits;
2710 typedef LocalOrdinal LO;
2711
2712 if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2713 // Fill must be active and the "nonconst" graph must exist.
2714 return OrdinalTraits<LO>::invalid ();
2715 }
2716 const crs_graph_type& graph = * (this->staticGraph_);
2717 const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2718
2719 if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2720 // The calling process does not own this row, so it is not
2721 // allowed to modify its values.
2722 return static_cast<LO> (0);
2723 }
2724 auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2725 return this->transformGlobalValues (curRowVals.data (), graph,
2726 rowInfo, inputCols, inputVals,
2727 numInputEnt, f, atomic);
2728 }
2729
2730 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2731 LocalOrdinal
2734 const crs_graph_type& graph,
2735 const RowInfo& rowInfo,
2736 const LocalOrdinal inds[],
2737 const impl_scalar_type newVals[],
2738 const LocalOrdinal numElts,
2739 std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2740 const bool atomic)
2741 {
2742 typedef impl_scalar_type ST;
2743 typedef LocalOrdinal LO;
2744 typedef GlobalOrdinal GO;
2745
2746 //if (newVals.extent (0) != inds.extent (0)) {
2747 // The sizes of the input arrays must match.
2748 //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2749 //}
2750 //const LO numElts = static_cast<LO> (inds.extent (0));
2751 const bool sorted = graph.isSorted ();
2752
2753 LO numValid = 0; // number of valid input column indices
2754 size_t hint = 0; // Guess for the current index k into rowVals
2755
2756 if (graph.isLocallyIndexed ()) {
2757 // Get a view of the column indices in the row. This amortizes
2758 // the cost of getting the view over all the entries of inds.
2759 auto colInds = graph.getLocalIndsViewHost (rowInfo);
2760
2761 for (LO j = 0; j < numElts; ++j) {
2762 const LO lclColInd = inds[j];
2763 const size_t offset =
2764 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2765 lclColInd, hint, sorted);
2766 if (offset != rowInfo.numEntries) {
2767 if (atomic) {
2768 // NOTE (mfh 30 Nov 2015) The commented-out code is
2769 // wrong because another thread may have changed
2770 // rowVals[offset] between those two lines of code.
2771 //
2772 //const ST newVal = f (rowVals[offset], newVals[j]);
2773 //Kokkos::atomic_assign (&rowVals[offset], newVal);
2774
2775 volatile ST* const dest = &rowVals[offset];
2776 (void) atomic_binary_function_update (dest, newVals[j], f);
2777 }
2778 else {
2779 // use binary function f
2780 rowVals[offset] = f (rowVals[offset], newVals[j]);
2781 }
2782 hint = offset + 1;
2783 ++numValid;
2785 }
2786 }
2787 else if (graph.isGloballyIndexed ()) {
2788 // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2789 // pointer does NOT change its reference count. Thus, this
2790 // code is still thread safe.
2791 if (graph.colMap_.is_null ()) {
2792 // NO input column indices are valid in this case. Either
2793 // the column Map hasn't been set yet (so local indices
2794 // don't exist yet), or the calling process owns no graph
2795 // entries.
2796 return numValid;
2797 }
2798 const map_type& colMap = * (graph.colMap_);
2799 // Get a view of the column indices in the row. This amortizes
2800 // the cost of getting the view over all the entries of inds.
2801 auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2802
2803 const GO GINV = Teuchos::OrdinalTraits<GO>::invalid ();
2804 for (LO j = 0; j < numElts; ++j) {
2805 const GO gblColInd = colMap.getGlobalElement (inds[j]);
2806 if (gblColInd != GINV) {
2807 const size_t offset =
2808 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2809 gblColInd, hint, sorted);
2810 if (offset != rowInfo.numEntries) {
2811 if (atomic) {
2812 // NOTE (mfh 30 Nov 2015) The commented-out code is
2813 // wrong because another thread may have changed
2814 // rowVals[offset] between those two lines of code.
2815 //
2816 //const ST newVal = f (rowVals[offset], newVals[j]);
2817 //Kokkos::atomic_assign (&rowVals[offset], newVal);
2818
2819 volatile ST* const dest = &rowVals[offset];
2820 (void) atomic_binary_function_update (dest, newVals[j], f);
2821 }
2822 else {
2823 // use binary function f
2824 rowVals[offset] = f (rowVals[offset], newVals[j]);
2825 }
2826 hint = offset + 1;
2827 numValid++;
2828 }
2829 }
2830 }
2831 }
2832 // If the graph is neither locally nor globally indexed on the
2833 // calling process, that means the calling process has no graph
2834 // entries. Thus, none of the input column indices are valid.
2835
2836 return numValid;
2837 }
2838
2839 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2840 LocalOrdinal
2843 const crs_graph_type& graph,
2844 const RowInfo& rowInfo,
2845 const GlobalOrdinal inds[],
2846 const impl_scalar_type newVals[],
2847 const LocalOrdinal numElts,
2848 std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2849 const bool atomic)
2850 {
2851 typedef impl_scalar_type ST;
2852 typedef LocalOrdinal LO;
2853 typedef GlobalOrdinal GO;
2854
2855 //if (newVals.extent (0) != inds.extent (0)) {
2856 // The sizes of the input arrays must match.
2857 //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2858 //}
2859 //const LO numElts = static_cast<LO> (inds.extent (0));
2860 const bool sorted = graph.isSorted ();
2861
2862 LO numValid = 0; // number of valid input column indices
2863 size_t hint = 0; // Guess for the current index k into rowVals
2864
2865 if (graph.isGloballyIndexed ()) {
2866 // Get a view of the column indices in the row. This amortizes
2867 // the cost of getting the view over all the entries of inds.
2868 auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2870 for (LO j = 0; j < numElts; ++j) {
2871 const GO gblColInd = inds[j];
2872 const size_t offset =
2873 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2874 gblColInd, hint, sorted);
2875 if (offset != rowInfo.numEntries) {
2876 if (atomic) {
2877 // NOTE (mfh 30 Nov 2015) The commented-out code is
2878 // wrong because another thread may have changed
2879 // rowVals[offset] between those two lines of code.
2880 //
2881 //const ST newVal = f (rowVals[offset], newVals[j]);
2882 //Kokkos::atomic_assign (&rowVals[offset], newVal);
2883
2884 volatile ST* const dest = &rowVals[offset];
2885 (void) atomic_binary_function_update (dest, newVals[j], f);
2887 else {
2888 // use binary function f
2889 rowVals[offset] = f (rowVals[offset], newVals[j]);
2890 }
2891 hint = offset + 1;
2892 ++numValid;
2893 }
2895 }
2896 else if (graph.isLocallyIndexed ()) {
2897 // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2898 // pointer does NOT change its reference count. Thus, this
2899 // code is still thread safe.
2900 if (graph.colMap_.is_null ()) {
2901 // NO input column indices are valid in this case. Either the
2902 // column Map hasn't been set yet (so local indices don't
2903 // exist yet), or the calling process owns no graph entries.
2904 return numValid;
2905 }
2906 const map_type& colMap = * (graph.colMap_);
2907 // Get a view of the column indices in the row. This amortizes
2908 // the cost of getting the view over all the entries of inds.
2909 auto colInds = graph.getLocalIndsViewHost (rowInfo);
2911 const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2912 for (LO j = 0; j < numElts; ++j) {
2913 const LO lclColInd = colMap.getLocalElement (inds[j]);
2914 if (lclColInd != LINV) {
2915 const size_t offset =
2916 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2917 lclColInd, hint, sorted);
2918 if (offset != rowInfo.numEntries) {
2919 if (atomic) {
2920 // NOTE (mfh 30 Nov 2015) The commented-out code is
2921 // wrong because another thread may have changed
2922 // rowVals[offset] between those two lines of code.
2923 //
2924 //const ST newVal = f (rowVals[offset], newVals[j]);
2925 //Kokkos::atomic_assign (&rowVals[offset], newVal);
2926
2927 volatile ST* const dest = &rowVals[offset];
2928 (void) atomic_binary_function_update (dest, newVals[j], f);
2929 }
2930 else {
2931 // use binary function f
2932 rowVals[offset] = f (rowVals[offset], newVals[j]);
2933 }
2934 hint = offset + 1;
2935 numValid++;
2936 }
2937 }
2938 }
2940 // If the graph is neither locally nor globally indexed on the
2941 // calling process, that means the calling process has no graph
2942 // entries. Thus, none of the input column indices are valid.
2943
2944 return numValid;
2945 }
2946
2947 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2948 LocalOrdinal
2951 const crs_graph_type& graph,
2952 const RowInfo& rowInfo,
2953 const LocalOrdinal inds[],
2954 const impl_scalar_type newVals[],
2955 const LocalOrdinal numElts,
2956 const bool atomic)
2957 {
2958 typedef LocalOrdinal LO;
2959 typedef GlobalOrdinal GO;
2961 const bool sorted = graph.isSorted ();
2962
2963 size_t hint = 0; // Guess for the current index k into rowVals
2964 LO numValid = 0; // number of valid local column indices
2965
2966 if (graph.isLocallyIndexed ()) {
2967 // Get a view of the column indices in the row. This amortizes
2968 // the cost of getting the view over all the entries of inds.
2969 auto colInds = graph.getLocalIndsViewHost (rowInfo);
2970
2971 for (LO j = 0; j < numElts; ++j) {
2972 const LO lclColInd = inds[j];
2973 const size_t offset =
2974 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2975 lclColInd, hint, sorted);
2976 if (offset != rowInfo.numEntries) {
2977 if (atomic) {
2978 Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2979 }
2980 else {
2981 rowVals[offset] += newVals[j];
2982 }
2983 hint = offset + 1;
2984 ++numValid;
2985 }
2986 }
2987 }
2988 else if (graph.isGloballyIndexed ()) {
2989 if (graph.colMap_.is_null ()) {
2990 return Teuchos::OrdinalTraits<LO>::invalid ();
2991 }
2992 const map_type colMap = * (graph.colMap_);
2994 // Get a view of the column indices in the row. This amortizes
2995 // the cost of getting the view over all the entries of inds.
2996 auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2997
2998 for (LO j = 0; j < numElts; ++j) {
2999 const GO gblColInd = colMap.getGlobalElement (inds[j]);
3000 if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
3001 const size_t offset =
3002 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3003 gblColInd, hint, sorted);
3004 if (offset != rowInfo.numEntries) {
3005 if (atomic) {
3006 Kokkos::atomic_add (&rowVals[offset], newVals[j]);
3007 }
3008 else {
3009 rowVals[offset] += newVals[j];
3010 }
3011 hint = offset + 1;
3012 ++numValid;
3013 }
3014 }
3015 }
3016 }
3017 // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
3018 // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
3019 // to be neither locally nor globally indexed on a process.
3020 // This means that the graph or matrix has no entries on that
3021 // process. Epetra also works like this. It's related to lazy
3022 // allocation (on first insertion, not at graph / matrix
3023 // construction). Lazy allocation will go away because it is
3024 // not thread scalable.
3025
3026 return numValid;
3027 }
3028
3029 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3030 LocalOrdinal
3032 sumIntoLocalValues (const LocalOrdinal localRow,
3033 const Teuchos::ArrayView<const LocalOrdinal>& indices,
3034 const Teuchos::ArrayView<const Scalar>& values,
3035 const bool atomic)
3036 {
3037 using LO = local_ordinal_type;
3038 const LO numInputEnt = static_cast<LO>(indices.size());
3039 if (static_cast<LO>(values.size()) != numInputEnt) {
3040 return Teuchos::OrdinalTraits<LO>::invalid();
3041 }
3042 const LO* const inputInds = indices.getRawPtr();
3043 const scalar_type* const inputVals = values.getRawPtr();
3044 return this->sumIntoLocalValues(localRow, numInputEnt,
3045 inputVals, inputInds, atomic);
3046 }
3047
3048 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3050 local_ordinal_type
3053 const local_ordinal_type localRow,
3054 const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
3055 const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals,
3056 const bool atomic)
3057 {
3058 using LO = local_ordinal_type;
3059 const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
3060 if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
3061 return Teuchos::OrdinalTraits<LO>::invalid();
3062 }
3063 const scalar_type* inVals =
3064 reinterpret_cast<const scalar_type*>(inputVals.data());
3065 return this->sumIntoLocalValues(localRow, numInputEnt, inVals,
3066 inputInds.data(), atomic);
3067 }
3068
3069 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3070 LocalOrdinal
3072 sumIntoLocalValues (const LocalOrdinal localRow,
3073 const LocalOrdinal numEnt,
3074 const Scalar vals[],
3075 const LocalOrdinal cols[],
3076 const bool atomic)
3077 {
3078 typedef impl_scalar_type IST;
3079 typedef LocalOrdinal LO;
3080
3081 if (! this->isFillActive () || this->staticGraph_.is_null ()) {
3082 // Fill must be active and the "nonconst" graph must exist.
3083 return Teuchos::OrdinalTraits<LO>::invalid ();
3084 }
3085 const crs_graph_type& graph = * (this->staticGraph_);
3086 const RowInfo rowInfo = graph.getRowInfo (localRow);
3087
3088 if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
3089 // The calling process does not own this row, so it is not
3090 // allowed to modify its values.
3091 return static_cast<LO> (0);
3092 }
3093 auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
3094 const IST* const inputVals = reinterpret_cast<const IST*> (vals);
3095 return this->sumIntoLocalValuesImpl (curRowVals.data (), graph, rowInfo,
3096 cols, inputVals, numEnt, atomic);
3097 }
3098
3099 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3101 values_dualv_type::t_host::const_type
3103 getValuesViewHost (const RowInfo& rowinfo) const
3104 {
3105 if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3106 return typename values_dualv_type::t_host::const_type ();
3107 else
3108 return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3109 rowinfo.allocSize,
3110 Access::ReadOnly);
3111 }
3112
3113 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3115 values_dualv_type::t_host
3117 getValuesViewHostNonConst (const RowInfo& rowinfo)
3118 {
3119 if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3120 return typename values_dualv_type::t_host ();
3121 else
3122 return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3123 rowinfo.allocSize,
3124 Access::ReadWrite);
3125 }
3126
3127 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3129 values_dualv_type::t_dev::const_type
3131 getValuesViewDevice (const RowInfo& rowinfo) const
3132 {
3133 if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3134 return typename values_dualv_type::t_dev::const_type ();
3135 else
3136 return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3137 rowinfo.allocSize,
3138 Access::ReadOnly);
3139 }
3140
3141 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3143 values_dualv_type::t_dev
3146 {
3147 if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3148 return typename values_dualv_type::t_dev ();
3149 else
3150 return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3151 rowinfo.allocSize,
3152 Access::ReadWrite);
3153 }
3154
3155
3156 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3157 void
3160 nonconst_local_inds_host_view_type &indices,
3161 nonconst_values_host_view_type &values,
3162 size_t& numEntries) const
3163 {
3164 using Teuchos::ArrayView;
3165 using Teuchos::av_reinterpret_cast;
3166 const char tfecfFuncName[] = "getLocalRowCopy: ";
3167
3168 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3169 (! this->hasColMap (), std::runtime_error,
3170 "The matrix does not have a column Map yet. This means we don't have "
3171 "local indices for columns yet, so it doesn't make sense to call this "
3172 "method. If the matrix doesn't have a column Map yet, you should call "
3173 "fillComplete on it first.");
3174
3175 const RowInfo rowinfo = staticGraph_->getRowInfo (localRow);
3176 const size_t theNumEntries = rowinfo.numEntries;
3177 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3178 (static_cast<size_t> (indices.size ()) < theNumEntries ||
3179 static_cast<size_t> (values.size ()) < theNumEntries,
3180 std::runtime_error, "Row with local index " << localRow << " has " <<
3181 theNumEntries << " entry/ies, but indices.size() = " <<
3182 indices.size () << " and values.size() = " << values.size () << ".");
3183 numEntries = theNumEntries; // first side effect
3184
3185 if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3186 if (staticGraph_->isLocallyIndexed ()) {
3187 auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3188 auto curVals = getValuesViewHost(rowinfo);
3189
3190 for (size_t j = 0; j < theNumEntries; ++j) {
3191 values[j] = curVals[j];
3192 indices[j] = curLclInds(j);
3193 }
3194 }
3195 else if (staticGraph_->isGloballyIndexed ()) {
3196 // Don't call getColMap(), because it touches RCP's reference count.
3197 const map_type& colMap = * (staticGraph_->colMap_);
3198 auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3199 auto curVals = getValuesViewHost(rowinfo);
3200
3201 for (size_t j = 0; j < theNumEntries; ++j) {
3202 values[j] = curVals[j];
3203 indices[j] = colMap.getLocalElement (curGblInds(j));
3204 }
3205 }
3206 }
3207 }
3208
3209
3210template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3211void
3214 nonconst_global_inds_host_view_type &indices,
3215 nonconst_values_host_view_type &values,
3216 size_t& numEntries) const
3217 {
3218 using Teuchos::ArrayView;
3219 using Teuchos::av_reinterpret_cast;
3220 const char tfecfFuncName[] = "getGlobalRowCopy: ";
3221
3222 const RowInfo rowinfo =
3223 staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3224 const size_t theNumEntries = rowinfo.numEntries;
3225 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3226 static_cast<size_t> (indices.size ()) < theNumEntries ||
3227 static_cast<size_t> (values.size ()) < theNumEntries,
3228 std::runtime_error, "Row with global index " << globalRow << " has "
3229 << theNumEntries << " entry/ies, but indices.size() = " <<
3230 indices.size () << " and values.size() = " << values.size () << ".");
3231 numEntries = theNumEntries; // first side effect
3232
3233 if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3234 if (staticGraph_->isLocallyIndexed ()) {
3235 const map_type& colMap = * (staticGraph_->colMap_);
3236 auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3237 auto curVals = getValuesViewHost(rowinfo);
3238
3239 for (size_t j = 0; j < theNumEntries; ++j) {
3240 values[j] = curVals[j];
3241 indices[j] = colMap.getGlobalElement (curLclInds(j));
3242 }
3243 }
3244 else if (staticGraph_->isGloballyIndexed ()) {
3245 auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3246 auto curVals = getValuesViewHost(rowinfo);
3247
3248 for (size_t j = 0; j < theNumEntries; ++j) {
3249 values[j] = curVals[j];
3250 indices[j] = curGblInds(j);
3251 }
3252 }
3253 }
3254 }
3255
3256
3257 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3258 void
3260 getLocalRowView(LocalOrdinal localRow,
3261 local_inds_host_view_type &indices,
3262 values_host_view_type &values) const
3263 {
3264 const char tfecfFuncName[] = "getLocalRowView: ";
3265
3266 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3267 isGloballyIndexed (), std::runtime_error, "The matrix currently stores "
3268 "its indices as global indices, so you cannot get a view with local "
3269 "column indices. If the matrix has a column Map, you may call "
3270 "getLocalRowCopy() to get local column indices; otherwise, you may get "
3271 "a view with global column indices by calling getGlobalRowCopy().");
3272
3273 const RowInfo rowInfo = staticGraph_->getRowInfo (localRow);
3274 if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3275 rowInfo.numEntries > 0) {
3276 indices = staticGraph_->lclIndsUnpacked_wdv.getHostSubview(
3277 rowInfo.offset1D,
3278 rowInfo.numEntries,
3279 Access::ReadOnly);
3280 values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3281 rowInfo.numEntries,
3282 Access::ReadOnly);
3283 }
3284 else {
3285 // This does the right thing (reports an empty row) if the input
3286 // row is invalid.
3287 indices = local_inds_host_view_type();
3288 values = values_host_view_type();
3289 }
3290
3291#ifdef HAVE_TPETRA_DEBUG
3292 const char suffix[] = ". This should never happen. Please report this "
3293 "bug to the Tpetra developers.";
3294 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3295 (static_cast<size_t> (indices.size ()) !=
3296 static_cast<size_t> (values.size ()), std::logic_error,
3297 "At the end of this method, for local row " << localRow << ", "
3298 "indices.size() = " << indices.size () << " != values.size () = "
3299 << values.size () << suffix);
3300 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3301 (static_cast<size_t> (indices.size ()) !=
3302 static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3303 "At the end of this method, for local row " << localRow << ", "
3304 "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3305 << rowInfo.numEntries << suffix);
3306 const size_t expectedNumEntries = getNumEntriesInLocalRow (localRow);
3307 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3308 (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3309 "of this method, for local row " << localRow << ", rowInfo.numEntries = "
3310 << rowInfo.numEntries << " != getNumEntriesInLocalRow(localRow) = " <<
3311 expectedNumEntries << suffix);
3312#endif // HAVE_TPETRA_DEBUG
3313 }
3314
3315
3316 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3317 void
3319 getGlobalRowView (GlobalOrdinal globalRow,
3320 global_inds_host_view_type &indices,
3321 values_host_view_type &values) const
3322 {
3323 const char tfecfFuncName[] = "getGlobalRowView: ";
3324
3325 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3326 isLocallyIndexed (), std::runtime_error,
3327 "The matrix is locally indexed, so we cannot return a view of the row "
3328 "with global column indices. Use getGlobalRowCopy() instead.");
3329
3330 // This does the right thing (reports an empty row) if the input
3331 // row is invalid.
3332 const RowInfo rowInfo =
3333 staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3334 if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3335 rowInfo.numEntries > 0) {
3336 indices = staticGraph_->gblInds_wdv.getHostSubview(rowInfo.offset1D,
3337 rowInfo.numEntries,
3338 Access::ReadOnly);
3339 values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3340 rowInfo.numEntries,
3341 Access::ReadOnly);
3342 }
3343 else {
3344 indices = global_inds_host_view_type();
3345 values = values_host_view_type();
3346 }
3347
3348#ifdef HAVE_TPETRA_DEBUG
3349 const char suffix[] = ". This should never happen. Please report this "
3350 "bug to the Tpetra developers.";
3351 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3352 (static_cast<size_t> (indices.size ()) !=
3353 static_cast<size_t> (values.size ()), std::logic_error,
3354 "At the end of this method, for global row " << globalRow << ", "
3355 "indices.size() = " << indices.size () << " != values.size () = "
3356 << values.size () << suffix);
3357 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3358 (static_cast<size_t> (indices.size ()) !=
3359 static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3360 "At the end of this method, for global row " << globalRow << ", "
3361 "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3362 << rowInfo.numEntries << suffix);
3363 const size_t expectedNumEntries = getNumEntriesInGlobalRow (globalRow);
3364 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3365 (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3366 "of this method, for global row " << globalRow << ", rowInfo.numEntries "
3367 "= " << rowInfo.numEntries << " != getNumEntriesInGlobalRow(globalRow) ="
3368 " " << expectedNumEntries << suffix);
3369#endif // HAVE_TPETRA_DEBUG
3370 }
3371
3372
3373 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3374 void
3376 scale (const Scalar& alpha)
3377 {
3378 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3379
3380 const size_t nlrs = staticGraph_->getLocalNumRows ();
3381 const size_t numEntries = staticGraph_->getLocalNumEntries ();
3382 if (! staticGraph_->indicesAreAllocated () ||
3383 nlrs == 0 || numEntries == 0) {
3384 // do nothing
3385 }
3386 else {
3387
3388 auto vals = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
3389 KokkosBlas::scal(vals, theAlpha, vals);
3390
3391 }
3392 }
3393
3394 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3395 void
3397 setAllToScalar (const Scalar& alpha)
3398 {
3399 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3400
3401 // replace all values in the matrix
3402 // it is easiest to replace all allocated values, instead of replacing only the ones with valid entries
3403 // however, if there are no valid entries, we can short-circuit
3404 // furthermore, if the values aren't allocated, we can short-circuit (no entry have been inserted so far)
3405 const size_t numEntries = staticGraph_->getLocalNumEntries();
3406 if (! staticGraph_->indicesAreAllocated () || numEntries == 0) {
3407 // do nothing
3408 }
3409 else {
3410 // DEEP_COPY REVIEW - VALUE-TO-DEVICE
3411 Kokkos::deep_copy (execution_space(), valuesUnpacked_wdv.getDeviceView(Access::OverwriteAll),
3412 theAlpha);
3413 // CAG: This fence was found to be required on Cuda with UVM=on.
3414 Kokkos::fence("CrsMatrix::setAllToScalar");
3415 }
3416 }
3417
3418 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3419 void
3421 setAllValues (const typename local_graph_device_type::row_map_type& rowPointers,
3422 const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
3423 const typename local_matrix_device_type::values_type& values)
3424 {
3425 using ProfilingRegion=Details::ProfilingRegion;
3426 ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues");
3427 const char tfecfFuncName[] = "setAllValues: ";
3428 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3429 (columnIndices.size () != values.size (), std::invalid_argument,
3430 "columnIndices.size() = " << columnIndices.size () << " != values.size()"
3431 " = " << values.size () << ".");
3432 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3433 (myGraph_.is_null (), std::runtime_error, "myGraph_ must not be null.");
3434
3435 try {
3436 myGraph_->setAllIndices (rowPointers, columnIndices);
3437 }
3438 catch (std::exception &e) {
3439 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3440 (true, std::runtime_error, "myGraph_->setAllIndices() threw an "
3441 "exception: " << e.what ());
3442 }
3443
3444 // Make sure that myGraph_ now has a local graph. It may not be
3445 // fillComplete yet, so it's important to check. We don't care
3446 // whether setAllIndices() did a shallow copy or a deep copy, so a
3447 // good way to check is to compare dimensions.
3448 auto lclGraph = myGraph_->getLocalGraphDevice ();
3449 const size_t numEnt = lclGraph.entries.extent (0);
3450 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3451 (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
3452 numEnt != static_cast<size_t> (columnIndices.extent (0)),
3453 std::logic_error, "myGraph_->setAllIndices() did not correctly create "
3454 "local graph. Please report this bug to the Tpetra developers.");
3455
3456 valuesPacked_wdv = values_wdv_type(values);
3457 valuesUnpacked_wdv = valuesPacked_wdv;
3458
3459 // Storage MUST be packed, since the interface doesn't give any
3460 // way to indicate any extra space at the end of each row.
3461 this->storageStatus_ = Details::STORAGE_1D_PACKED;
3462
3463 checkInternalState ();
3464 }
3465
3466 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3467 void
3469 setAllValues ( const local_matrix_device_type& localDeviceMatrix)
3470 {
3471 using ProfilingRegion=Details::ProfilingRegion;
3472 ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues from KokkosSparse::CrsMatrix");
3473
3474 auto graph = localDeviceMatrix.graph;
3475 //FIXME how to check whether graph is allocated
3476
3477 auto rows = graph.row_map;
3478 auto columns = graph.entries;
3479 auto values = localDeviceMatrix.values;
3480
3481 setAllValues(rows,columns,values);
3482 }
3483
3484 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3485 void
3487 setAllValues (const Teuchos::ArrayRCP<size_t>& ptr,
3488 const Teuchos::ArrayRCP<LocalOrdinal>& ind,
3489 const Teuchos::ArrayRCP<Scalar>& val)
3490 {
3491 using Kokkos::Compat::getKokkosViewDeepCopy;
3492 using Teuchos::ArrayRCP;
3493 using Teuchos::av_reinterpret_cast;
3494 typedef device_type DT;
3495 typedef impl_scalar_type IST;
3496 typedef typename local_graph_device_type::row_map_type row_map_type;
3497 //typedef typename row_map_type::non_const_value_type row_offset_type;
3498 const char tfecfFuncName[] = "setAllValues(ArrayRCP<size_t>, ArrayRCP<LO>, ArrayRCP<Scalar>): ";
3499
3500 // The row offset type may depend on the execution space. It may
3501 // not necessarily be size_t. If it's not, we need to make a deep
3502 // copy. We need to make a deep copy anyway so that Kokkos can
3503 // own the memory. Regardless, ptrIn gets the copy.
3504 typename row_map_type::non_const_type ptrNative ("ptr", ptr.size ());
3505 Kokkos::View<const size_t*,
3506 typename row_map_type::array_layout,
3507 Kokkos::HostSpace,
3508 Kokkos::MemoryUnmanaged> ptrSizeT (ptr.getRawPtr (), ptr.size ());
3509 ::Tpetra::Details::copyOffsets (ptrNative, ptrSizeT);
3510
3511 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3512 (ptrNative.extent (0) != ptrSizeT.extent (0),
3513 std::logic_error, "ptrNative.extent(0) = " <<
3514 ptrNative.extent (0) << " != ptrSizeT.extent(0) = "
3515 << ptrSizeT.extent (0) << ". Please report this bug to the "
3516 "Tpetra developers.");
3517
3518 auto indIn = getKokkosViewDeepCopy<DT> (ind ());
3519 auto valIn = getKokkosViewDeepCopy<DT> (av_reinterpret_cast<IST> (val ()));
3520 this->setAllValues (ptrNative, indIn, valIn);
3521 }
3522
3523 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3524 void
3526 getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
3527 {
3528 const char tfecfFuncName[] = "getLocalDiagOffsets: ";
3529 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3530 (staticGraph_.is_null (), std::runtime_error, "The matrix has no graph.");
3531
3532 // mfh 11 May 2016: We plan to deprecate the ArrayRCP version of
3533 // this method in CrsGraph too, so don't call it (otherwise build
3534 // warnings will show up and annoy users). Instead, copy results
3535 // in and out, if the memory space requires it.
3536
3537 const size_t lclNumRows = staticGraph_->getLocalNumRows ();
3538 if (static_cast<size_t> (offsets.size ()) < lclNumRows) {
3539 offsets.resize (lclNumRows);
3540 }
3541
3542 // The input ArrayRCP must always be a host pointer. Thus, if
3543 // device_type::memory_space is Kokkos::HostSpace, it's OK for us
3544 // to write to that allocation directly as a Kokkos::View.
3545 if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
3546 // It is always syntactically correct to assign a raw host
3547 // pointer to a device View, so this code will compile correctly
3548 // even if this branch never runs.
3549 typedef Kokkos::View<size_t*, device_type,
3550 Kokkos::MemoryUnmanaged> output_type;
3551 output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3552 staticGraph_->getLocalDiagOffsets (offsetsOut);
3553 }
3554 else {
3555 Kokkos::View<size_t*, device_type> offsetsTmp ("diagOffsets", lclNumRows);
3556 staticGraph_->getLocalDiagOffsets (offsetsTmp);
3557 typedef Kokkos::View<size_t*, Kokkos::HostSpace,
3558 Kokkos::MemoryUnmanaged> output_type;
3559 output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3560 // DEEP_COPY REVIEW - DEVICE-TO-HOST
3561 Kokkos::deep_copy (execution_space(), offsetsOut, offsetsTmp);
3562 }
3563 }
3564
// getLocalDiagCopy, one-argument form (name taken from tfecfFuncName
// below).  Passes the output object `diag` either to
// getDiagCopyWithoutOffsets (matrix is fill complete) or to
// getLocalDiagCopyWithoutOffsetsNotFillComplete (otherwise).
//
// Throws std::runtime_error if the matrix has no graph or no column
// Map.  Returns early, without touching `diag`, on processes whose row
// Map or row Map communicator is null.
//
// NOTE(review): the embedded listing numbers jump from 3566 to 3569,
// so the qualified function name and parameter list are not visible in
// this excerpt.  `diag` is known here only through its uses (getMap(),
// getLocalViewDevice()); confirm its declared type against the header.
3565 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3566 void
3569 {
3570 using Teuchos::ArrayRCP;
3571 using Teuchos::ArrayView;
3572 using Teuchos::av_reinterpret_cast;
3573 const char tfecfFuncName[] = "getLocalDiagCopy (1-arg): ";
3574 typedef local_ordinal_type LO;
3575
3576
3577 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3578 staticGraph_.is_null (), std::runtime_error,
3579 "This method requires that the matrix have a graph.");
3580 auto rowMapPtr = this->getRowMap ();
3581 if (rowMapPtr.is_null () || rowMapPtr->getComm ().is_null ()) {
3582 // Processes on which the row Map or its communicator is null
3583 // don't participate. Users shouldn't even call this method on
3584 // those processes.
3585 return;
3586 }
3587 auto colMapPtr = this->getColMap ();
3588 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3589 (! this->hasColMap () || colMapPtr.is_null (), std::runtime_error,
3590 "This method requires that the matrix have a column Map.");
3591 const map_type& rowMap = * rowMapPtr;
3592 const map_type& colMap = * colMapPtr;
3593 const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3594
3595#ifdef HAVE_TPETRA_DEBUG
3596 // isCompatible() requires an all-reduce, and thus this check
3597 // should only be done in debug mode.
3598 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3599 ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3600 "The input Vector's Map must be compatible with the CrsMatrix's row "
3601 "Map. You may check this by using Map's isCompatible method: "
3602 "diag.getMap ()->isCompatible (A.getRowMap ());");
3603#endif // HAVE_TPETRA_DEBUG
3604
3605 if (this->isFillComplete ()) {
// Fill-complete path: work on diag's local device view, opened with
// OverwriteAll access.
3606 const auto D_lcl = diag.getLocalViewDevice(Access::OverwriteAll);
3607 // 1-D subview of the first (and only) column of D_lcl.
3608 const auto D_lcl_1d =
3609 Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3610
3611 const auto lclRowMap = rowMap.getLocalMap ();
3612 const auto lclColMap = colMap.getLocalMap ();
// NOTE(review): listing lines 3613 and 3616 are missing from this
// excerpt, so the call below is shown without its final argument(s)
// and closing parenthesis.  Its return value is deliberately discarded.
3614 (void) getDiagCopyWithoutOffsets (D_lcl_1d, lclRowMap,
3615 lclColMap,
3617 }
3618 else {
// NOTE(review): listing line 3619 is missing from this excerpt.
// Not-fill-complete path: the helper receives diag and the matrix
// itself; its return value is deliberately discarded.
3620 (void) getLocalDiagCopyWithoutOffsetsNotFillComplete (diag, *this);
3621 }
3622 }
3623
// getLocalDiagCopy overload taking precomputed offsets as an unmanaged
// device View (name taken from tfecfFuncName below).  Passes a 1-D
// subview of diag's local device view, together with `offsets`, to
// KokkosSparse::getDiagCopy.
//
// In debug builds, throws std::runtime_error if diag's Map is not
// compatible with the matrix's row Map.
//
// NOTE(review): the embedded listing numbers jump from 3625 to 3628,
// so the qualified function name and the first parameter (`diag`) are
// not visible in this excerpt; confirm them against the header.
3624 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3625 void
3628 const Kokkos::View<const size_t*, device_type,
3629 Kokkos::MemoryUnmanaged>& offsets) const
3630 {
3631 typedef LocalOrdinal LO;
3632
3633#ifdef HAVE_TPETRA_DEBUG
3634 const char tfecfFuncName[] = "getLocalDiagCopy: ";
3635 const map_type& rowMap = * (this->getRowMap ());
3636 // isCompatible() requires an all-reduce, and thus this check
3637 // should only be done in debug mode.
3638 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3639 ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3640 "The input Vector's Map must be compatible with (in the sense of Map::"
3641 "isCompatible) the CrsMatrix's row Map.");
3642#endif // HAVE_TPETRA_DEBUG
3643
// NOTE(review): the comment below describes filling the Vector on the
// host, but the visible code obtains a device view and calls
// KokkosSparse::getDiagCopy; the comment may be out of date -- confirm.
3644 // For now, we fill the Vector on the host and sync to device.
3645 // Later, we may write a parallel kernel that works entirely on
3646 // device.
3647 //
3648 // NOTE (mfh 21 Jan 2016): The host kernel here assumes UVM. Once
3649 // we write a device kernel, it will not need to assume UVM.
3650
3651 auto D_lcl = diag.getLocalViewDevice (Access::OverwriteAll);
3652 const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3653 // Get 1-D subview of the first (and only) column of D_lcl.
3654 auto D_lcl_1d =
3655 Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3656
// NOTE(review): listing line 3658 is missing from this excerpt, so the
// call below is shown without its final argument(s) and closing
// parenthesis.
3657 KokkosSparse::getDiagCopy (D_lcl_1d, offsets,
3659 }
3660
// getLocalDiagCopy overload taking precomputed offsets as a host
// ArrayView (name taken from tfecfFuncName and the kernel label
// below).  For each local row, writes into diag's host view either
// zero (when the row's offset equals the invalid marker) or the packed
// value found at (row start + offset).
//
// In debug builds, throws std::runtime_error if diag's Map is not
// compatible with the matrix's row Map.
//
// NOTE(review): the embedded listing numbers jump from 3662 to 3665,
// so the qualified function name and the first parameter (`diag`) are
// not visible in this excerpt; confirm them against the header.
3661 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3662 void
3665 const Teuchos::ArrayView<const size_t>& offsets) const
3666 {
3667 using LO = LocalOrdinal;
3668 using host_execution_space = Kokkos::DefaultHostExecutionSpace;
3669 using IST = impl_scalar_type;
3670
3671#ifdef HAVE_TPETRA_DEBUG
3672 const char tfecfFuncName[] = "getLocalDiagCopy: ";
3673 const map_type& rowMap = * (this->getRowMap ());
3674 // isCompatible() requires an all-reduce, and thus this check
3675 // should only be done in debug mode.
3676 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3677 ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3678 "The input Vector's Map must be compatible with (in the sense of Map::"
3679 "isCompatible) the CrsMatrix's row Map.");
3680#endif // HAVE_TPETRA_DEBUG
3681
3682 // See #1510. In case diag has already been marked modified on
3683 // device, we need to clear that flag, since the code below works
3684 // on host.
3685 //diag.clear_sync_state ();
3686
3687 // For now, we fill the Vector on the host and sync to device.
3688 // Later, we may write a parallel kernel that works entirely on
3689 // device.
// NOTE(review): the explicit sync calls in this method are commented
// out; the host view is requested with OverwriteAll access instead.
3690 auto lclVecHost = diag.getLocalViewHost(Access::OverwriteAll);
3691 // 1-D subview of the first (and only) column of lclVecHost.
3692 auto lclVecHost1d = Kokkos::subview (lclVecHost, Kokkos::ALL (), 0);
3693
// Wrap the caller's offsets array in an unmanaged host View (no copy)
// so that the kernel below can capture it by value.
3694 using host_offsets_view_type =
3695 Kokkos::View<const size_t*, Kokkos::HostSpace,
3696 Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
3697 host_offsets_view_type h_offsets (offsets.getRawPtr (), offsets.size ());
3698 // Find the diagonal entries and put them in lclVecHost1d.
3699 using range_type = Kokkos::RangePolicy<host_execution_space, LO>;
3700 const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
// INV is the offset value that marks a row as having no diagonal entry
// (see the test inside the kernel).
3701 const size_t INV = Tpetra::Details::OrdinalTraits<size_t>::invalid ();
3702
3703 auto rowPtrsPackedHost = staticGraph_->getRowPtrsPackedHost();
3704 auto valuesPackedHost = valuesPacked_wdv.getHostView(Access::ReadOnly);
3705 Kokkos::parallel_for
3706 ("Tpetra::CrsMatrix::getLocalDiagCopy",
3707 range_type (0, myNumRows),
3708 [&, INV, h_offsets] (const LO lclRow) { // Value capture is a workaround for cuda + gcc-7.2 compiler bug w/c++14
3709 lclVecHost1d(lclRow) = STS::zero (); // default value if no diag entry
3710 if (h_offsets[lclRow] != INV) {
// The offset is relative to the start of the row in packed storage.
3711 auto curRowOffset = rowPtrsPackedHost (lclRow);
3712 lclVecHost1d(lclRow) =
3713 static_cast<IST> (valuesPackedHost(curRowOffset+h_offsets[lclRow]));
3714 }
3715 });
3716 //diag.sync_device ();
3717 }
3718
3719
// leftScale (name taken from tfecfFuncName below): applies the vector
// x to the local matrix through leftScaleLocalCrsMatrix.
//
// x's Map must be the same as either the matrix's range Map or its row
// Map; otherwise std::invalid_argument is thrown.  The matrix must be
// fill complete; otherwise std::runtime_error is thrown.
//
// NOTE(review): this excerpt is missing listing lines 3722-3723 (the
// qualified function name and parameter list), 3725 and 3732
// (presumably the declarations that bring ProfilingRegion and vec_type
// into scope -- confirm), 3763, and 3773 (presumably the function's
// closing brace).
3720 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3721 void
3724 {
3726 using Teuchos::ArrayRCP;
3727 using Teuchos::ArrayView;
3728 using Teuchos::null;
3729 using Teuchos::RCP;
3730 using Teuchos::rcp;
3731 using Teuchos::rcpFromRef;
3733 const char tfecfFuncName[] = "leftScale: ";
3734
3735 ProfilingRegion region ("Tpetra::CrsMatrix::leftScale");
3736
// xp ends up pointing either at x itself (non-owning) or at a
// temporary vector built on the row Map.
3737 RCP<const vec_type> xp;
3738 if (this->getRangeMap ()->isSameAs (* (x.getMap ()))) {
3739 // Take from Epetra: If we have a non-trivial exporter, we must
3740 // import elements that are permuted or are on other processors.
3741 auto exporter = this->getCrsGraphRef ().getExporter ();
3742 if (exporter.get () != nullptr) {
3743 RCP<vec_type> tempVec (new vec_type (this->getRowMap ()));
3744 tempVec->doImport (x, *exporter, REPLACE); // reverse mode
3745 xp = tempVec;
3746 }
3747 else {
3748 xp = rcpFromRef (x);
3749 }
3750 }
3751 else if (this->getRowMap ()->isSameAs (* (x.getMap ()))) {
3752 xp = rcpFromRef (x);
3753 }
3754 else {
3755 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3756 (true, std::invalid_argument, "x's Map must be the same as "
3757 "either the row Map or the range Map of the CrsMatrix.");
3758 }
3759
3760 if (this->isFillComplete()) {
3761 auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3762 auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
// NOTE(review): the meaning of the two trailing 'false' arguments is
// not visible in this excerpt; see leftScaleLocalCrsMatrix.
3764 leftScaleLocalCrsMatrix (getLocalMatrixDevice (),
3765 x_lcl_1d, false, false);
3766 }
3767 else {
3768 // 6/2020 Disallow leftScale of non-fillComplete matrices #7446
3769 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3770 (true, std::runtime_error, "CrsMatrix::leftScale requires matrix to be"
3771 " fillComplete");
3772 }
3774
// rightScale (name taken from tfecfFuncName below): applies the vector
// x to the local matrix through rightScaleLocalCrsMatrix.
//
// x's Map must be the same as either the matrix's domain Map or its
// column Map; otherwise std::runtime_error is thrown.  The matrix must
// be fill complete; otherwise std::runtime_error is thrown.
//
// NOTE(review): this excerpt is missing listing lines 3777-3778 (the
// qualified function name and parameter list), 3780 and 3787
// (presumably the declarations that bring ProfilingRegion and vec_type
// into scope -- confirm), 3804 (presumably the closing brace of the
// inner 'else'), and 3817.
3775 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3776 void
3779 {
3781 using Teuchos::ArrayRCP;
3782 using Teuchos::ArrayView;
3783 using Teuchos::null;
3784 using Teuchos::RCP;
3785 using Teuchos::rcp;
3786 using Teuchos::rcpFromRef;
3788 const char tfecfFuncName[] = "rightScale: ";
3789
3790 ProfilingRegion region ("Tpetra::CrsMatrix::rightScale");
3791
// xp ends up pointing either at x itself (non-owning) or at a
// temporary vector built on the column Map.
3792 RCP<const vec_type> xp;
3793 if (this->getDomainMap ()->isSameAs (* (x.getMap ()))) {
3794 // Take from Epetra: If we have a non-trivial importer, we must
3795 // import elements that are permuted or are on other processors.
3796 auto importer = this->getCrsGraphRef ().getImporter ();
3797 if (importer.get () != nullptr) {
3798 RCP<vec_type> tempVec (new vec_type (this->getColMap ()));
3799 tempVec->doImport (x, *importer, REPLACE);
3800 xp = tempVec;
3801 }
3802 else {
3803 xp = rcpFromRef (x);
3805 }
3806 else if (this->getColMap ()->isSameAs (* (x.getMap ()))) {
3807 xp = rcpFromRef (x);
3808 } else {
3809 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3810 (true, std::runtime_error, "x's Map must be the same as "
3811 "either the domain Map or the column Map of the CrsMatrix.");
3812 }
3813
3814 if (this->isFillComplete()) {
3815 auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3816 auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
// NOTE(review): the meaning of the two trailing 'false' arguments is
// not visible in this excerpt; see rightScaleLocalCrsMatrix.
3818 rightScaleLocalCrsMatrix (getLocalMatrixDevice (),
3819 x_lcl_1d, false, false);
3820 }
3821 else {
3822 // 6/2020 Disallow rightScale of non-fillComplete matrices #7446
3823 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3824 (true, std::runtime_error, "CrsMatrix::rightScale requires matrix to be"
3825 " fillComplete");
3826 }
3827 }
3828
3829 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3832 getFrobeniusNorm () const
3833 {
3834 using Teuchos::ArrayView;
3835 using Teuchos::outArg;
3836 using Teuchos::REDUCE_SUM;
3837 using Teuchos::reduceAll;
3838
3839 // FIXME (mfh 05 Aug 2014) Write a thread-parallel kernel for the
3840 // local part of this computation. It could make sense to put
3841 // this operation in the Kokkos::CrsMatrix.
3842
3843 // check the cache first
3844 mag_type mySum = STM::zero ();
3845 if (getLocalNumEntries() > 0) {
3846 if (isStorageOptimized ()) {
3847 // "Optimized" storage is packed storage. That means we can
3848 // iterate in one pass through the 1-D values array.
3849 const size_t numEntries = getLocalNumEntries ();
3850 auto values = valuesPacked_wdv.getHostView(Access::ReadOnly);
3851 for (size_t k = 0; k < numEntries; ++k) {
3852 auto val = values[k];
3853 // Note (etp 06 Jan 2015) We need abs() here for composite types
3854 // (in general, if mag_type is on the left-hand-side, we need
3855 // abs() on the right-hand-side)
3856 const mag_type val_abs = STS::abs (val);
3857 mySum += val_abs * val_abs;
3858 }
3859 }
3860 else {
3861 const LocalOrdinal numRows =
3862 static_cast<LocalOrdinal> (this->getLocalNumRows ());
3863 for (LocalOrdinal r = 0; r < numRows; ++r) {
3864 const RowInfo rowInfo = myGraph_->getRowInfo (r);
3865 const size_t numEntries = rowInfo.numEntries;
3866 auto A_r = this->getValuesViewHost(rowInfo);
3867 for (size_t k = 0; k < numEntries; ++k) {
3868 const impl_scalar_type val = A_r[k];
3869 const mag_type val_abs = STS::abs (val);
3870 mySum += val_abs * val_abs;
3871 }
3872 }
3873 }
3874 }
3875 mag_type totalSum = STM::zero ();
3876 reduceAll<int, mag_type> (* (getComm ()), REDUCE_SUM,
3877 mySum, outArg (totalSum));
3878 return STM::sqrt (totalSum);
3879 }
3880
3881 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3882 void
3884 replaceColMap (const Teuchos::RCP<const map_type>& newColMap)
3885 {
3886 const char tfecfFuncName[] = "replaceColMap: ";
3887 // FIXME (mfh 06 Aug 2014) What if the graph is locally indexed?
3888 // Then replacing the column Map might mean that we need to
3889 // reindex the column indices.
3890 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3891 myGraph_.is_null (), std::runtime_error,
3892 "This method does not work if the matrix has a const graph. The whole "
3893 "idea of a const graph is that you are not allowed to change it, but "
3894 "this method necessarily must modify the graph, since the graph owns "
3895 "the matrix's column Map.");
3896 myGraph_->replaceColMap (newColMap);
3897 }
// Reindex the column indices of the matrix's graph against a new column
// Map, and optionally re-sort each row of the matrix.
//
// graph:       graph to reindex; if null, the matrix's own graph
//              (*myGraph_) is used instead.
// newColMap:   forwarded to the graph's reindexColumns().
// newImport:   forwarded to the graph's reindexColumns().
// sortEachRow: if true, and the graph is locally indexed and not sorted
//              after reindexing, sort every row's column indices and
//              values together and mark the graph's indices as sorted.
//
// Throws std::invalid_argument if graph is null and myGraph_ is null.
//
// NOTE(review): this excerpt is missing listing line 3901 (presumably
// the class qualifier of this member definition) and line 3914, whose
// content is unknown.
3899 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3900 void
3902 reindexColumns (crs_graph_type* const graph,
3903 const Teuchos::RCP<const map_type>& newColMap,
3904 const Teuchos::RCP<const import_type>& newImport,
3905 const bool sortEachRow)
3906 {
3907 const char tfecfFuncName[] = "reindexColumns: ";
3908 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3909 graph == nullptr && myGraph_.is_null (), std::invalid_argument,
3910 "The input graph is null, but the matrix does not own its graph.");
3911
3912 crs_graph_type& theGraph = (graph == nullptr) ? *myGraph_ : *graph;
3913 const bool sortGraph = false; // we'll sort graph & matrix together below
3915 theGraph.reindexColumns (newColMap, newImport, sortGraph);
3916
3917 if (sortEachRow && theGraph.isLocallyIndexed () && ! theGraph.isSorted ()) {
3918 const LocalOrdinal lclNumRows =
3919 static_cast<LocalOrdinal> (theGraph.getLocalNumRows ());
3920
3921 for (LocalOrdinal row = 0; row < lclNumRows; ++row) {
3922
3923 const RowInfo rowInfo = theGraph.getRowInfo (row);
3924 auto lclColInds = theGraph.getLocalIndsViewHostNonConst (rowInfo);
3925 auto vals = this->getValuesViewHostNonConst (rowInfo);
3926
// Sort only the row's first rowInfo.numEntries entries, passing the
// values array alongside the column indices so the two are sorted
// together.
3927 sort2 (lclColInds.data (),
3928 lclColInds.data () + rowInfo.numEntries,
3929 vals.data ());
3930 }
// Record on the graph that every row is now sorted.
3931 theGraph.indicesAreSorted_ = true;
3932 }
3933 }
3934
3935 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3936 void
3938 replaceDomainMap (const Teuchos::RCP<const map_type>& newDomainMap)
3939 {
3940 const char tfecfFuncName[] = "replaceDomainMap: ";
3941 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3942 myGraph_.is_null (), std::runtime_error,
3943 "This method does not work if the matrix has a const graph. The whole "
3944 "idea of a const graph is that you are not allowed to change it, but this"
3945 " method necessarily must modify the graph, since the graph owns the "
3946 "matrix's domain Map and Import objects.");
3947 myGraph_->replaceDomainMap (newDomainMap);
3948 }
3949
3950 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3951 void
3953 replaceDomainMapAndImporter (const Teuchos::RCP<const map_type>& newDomainMap,
3954 Teuchos::RCP<const import_type>& newImporter)
3955 {
3956 const char tfecfFuncName[] = "replaceDomainMapAndImporter: ";
3957 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3958 myGraph_.is_null (), std::runtime_error,
3959 "This method does not work if the matrix has a const graph. The whole "
3960 "idea of a const graph is that you are not allowed to change it, but this"
3961 " method necessarily must modify the graph, since the graph owns the "
3962 "matrix's domain Map and Import objects.");
3963 myGraph_->replaceDomainMapAndImporter (newDomainMap, newImporter);
3964 }
3965
3966 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3967 void
3969 replaceRangeMap (const Teuchos::RCP<const map_type>& newRangeMap)
3970 {
3971 const char tfecfFuncName[] = "replaceRangeMap: ";
3972 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3973 myGraph_.is_null (), std::runtime_error,
3974 "This method does not work if the matrix has a const graph. The whole "
3975 "idea of a const graph is that you are not allowed to change it, but this"
3976 " method necessarily must modify the graph, since the graph owns the "
3977 "matrix's domain Map and Import objects.");
3978 myGraph_->replaceRangeMap (newRangeMap);
3979 }
3981 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3982 void
3984 replaceRangeMapAndExporter (const Teuchos::RCP<const map_type>& newRangeMap,
3985 Teuchos::RCP<const export_type>& newExporter)
3986 {
3987 const char tfecfFuncName[] = "replaceRangeMapAndExporter: ";
3988 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3989 myGraph_.is_null (), std::runtime_error,
3990 "This method does not work if the matrix has a const graph. The whole "
3991 "idea of a const graph is that you are not allowed to change it, but this"
3992 " method necessarily must modify the graph, since the graph owns the "
3993 "matrix's domain Map and Import objects.");
3994 myGraph_->replaceRangeMapAndExporter (newRangeMap, newExporter);
3995 }
3996
3997 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3998 void
3999 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4000 insertNonownedGlobalValues (const GlobalOrdinal globalRow,
4001 const Teuchos::ArrayView<const GlobalOrdinal>& indices,
4002 const Teuchos::ArrayView<const Scalar>& values)
4003 {
4004 using Teuchos::Array;
4005 typedef GlobalOrdinal GO;
4006 typedef typename Array<GO>::size_type size_type;
4007
4008 const size_type numToInsert = indices.size ();
4009 // Add the new data to the list of nonlocals.
4010 // This creates the arrays if they don't exist yet.
4011 std::pair<Array<GO>, Array<Scalar> >& curRow = nonlocals_[globalRow];
4012 Array<GO>& curRowInds = curRow.first;
4013 Array<Scalar>& curRowVals = curRow.second;
4014 const size_type newCapacity = curRowInds.size () + numToInsert;
4015 curRowInds.reserve (newCapacity);
4016 curRowVals.reserve (newCapacity);
4017 for (size_type k = 0; k < numToInsert; ++k) {
4018 curRowInds.push_back (indices[k]);
4019 curRowVals.push_back (values[k]);
4020 }
4021 }
4022
4023 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4024 void
4027 {
4028 using Details::Behavior;
4030 using Teuchos::Comm;
4031 using Teuchos::outArg;
4032 using Teuchos::RCP;
4033 using Teuchos::rcp;
4034 using Teuchos::REDUCE_MAX;
4035 using Teuchos::REDUCE_MIN;
4036 using Teuchos::reduceAll;
4037 using std::endl;
4038 typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crs_matrix_type;
4039 //typedef LocalOrdinal LO;
4040 typedef GlobalOrdinal GO;
4041 typedef typename Teuchos::Array<GO>::size_type size_type;
4042 const char tfecfFuncName[] = "globalAssemble: "; // for exception macro
4043 ProfilingRegion regionGlobalAssemble ("Tpetra::CrsMatrix::globalAssemble");
4044
4045 const bool verbose = Behavior::verbose("CrsMatrix");
4046 std::unique_ptr<std::string> prefix;
4047 if (verbose) {
4048 prefix = this->createPrefix("CrsMatrix", "globalAssemble");
4049 std::ostringstream os;
4050 os << *prefix << "nonlocals_.size()=" << nonlocals_.size()
4051 << endl;
4052 std::cerr << os.str();
4053 }
4054 RCP<const Comm<int> > comm = getComm ();
4055
4056 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4057 (! isFillActive (), std::runtime_error, "Fill must be active before "
4058 "you may call this method.");
4059
4060 const size_t myNumNonlocalRows = nonlocals_.size ();
4061
4062 // If no processes have nonlocal rows, then we don't have to do
4063 // anything. Checking this is probably cheaper than constructing
4064 // the Map of nonlocal rows (see below) and noticing that it has
4065 // zero global entries.
4066 {
4067 const int iHaveNonlocalRows = (myNumNonlocalRows == 0) ? 0 : 1;
4068 int someoneHasNonlocalRows = 0;
4069 reduceAll<int, int> (*comm, REDUCE_MAX, iHaveNonlocalRows,
4070 outArg (someoneHasNonlocalRows));
4071 if (someoneHasNonlocalRows == 0) {
4072 return; // no process has nonlocal rows, so nothing to do
4073 }
4074 }
4075
4076 // 1. Create a list of the "nonlocal" rows on each process. this
4077 // requires iterating over nonlocals_, so while we do this,
4078 // deduplicate the entries and get a count for each nonlocal
4079 // row on this process.
4080 // 2. Construct a new row Map corresponding to those rows. This
4081 // Map is likely overlapping. We know that the Map is not
4082 // empty on all processes, because the above all-reduce and
4083 // return exclude that case.
4084
4085 RCP<const map_type> nonlocalRowMap;
4086 Teuchos::Array<size_t> numEntPerNonlocalRow (myNumNonlocalRows);
4087 {
4088 Teuchos::Array<GO> myNonlocalGblRows (myNumNonlocalRows);
4089 size_type curPos = 0;
4090 for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4091 ++mapIter, ++curPos) {
4092 myNonlocalGblRows[curPos] = mapIter->first;
4093 // Get the values and column indices by reference, since we
4094 // intend to change them in place (that's what "erase" does).
4095 Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4096 Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4097
4098 // Sort both arrays jointly, using the column indices as keys,
4099 // then merge them jointly. "Merge" here adds values
4100 // corresponding to the same column indices. The first 2 args
4101 // of merge2 are output arguments that work just like the
4102 // return value of std::unique.
4103 sort2 (gblCols.begin (), gblCols.end (), vals.begin ());
4104 typename Teuchos::Array<GO>::iterator gblCols_newEnd;
4105 typename Teuchos::Array<Scalar>::iterator vals_newEnd;
4106 merge2 (gblCols_newEnd, vals_newEnd,
4107 gblCols.begin (), gblCols.end (),
4108 vals.begin (), vals.end ());
4109 gblCols.erase (gblCols_newEnd, gblCols.end ());
4110 vals.erase (vals_newEnd, vals.end ());
4111 numEntPerNonlocalRow[curPos] = gblCols.size ();
4112 }
4113
4114 // Currently, Map requires that its indexBase be the global min
4115 // of all its global indices. Map won't compute this for us, so
4116 // we must do it. If our process has no nonlocal rows, set the
4117 // "min" to the max possible GO value. This ensures that if
4118 // some process has at least one nonlocal row, then it will pick
4119 // that up as the min. We know that at least one process has a
4120 // nonlocal row, since the all-reduce and return at the top of
4121 // this method excluded that case.
4122 GO myMinNonlocalGblRow = std::numeric_limits<GO>::max ();
4123 {
4124 auto iter = std::min_element (myNonlocalGblRows.begin (),
4125 myNonlocalGblRows.end ());
4126 if (iter != myNonlocalGblRows.end ()) {
4127 myMinNonlocalGblRow = *iter;
4128 }
4129 }
4130 GO gblMinNonlocalGblRow = 0;
4131 reduceAll<int, GO> (*comm, REDUCE_MIN, myMinNonlocalGblRow,
4132 outArg (gblMinNonlocalGblRow));
4133 const GO indexBase = gblMinNonlocalGblRow;
4134 const global_size_t INV = Teuchos::OrdinalTraits<global_size_t>::invalid ();
4135 nonlocalRowMap = rcp (new map_type (INV, myNonlocalGblRows (), indexBase, comm));
4136 }
4137
4138 // 3. Use the values and column indices for each nonlocal row, as
4139 // stored in nonlocals_, to construct a CrsMatrix corresponding
4140 // to nonlocal rows. We have
4141 // exact counts of the number of entries in each nonlocal row.
4142
4143 if (verbose) {
4144 std::ostringstream os;
4145 os << *prefix << "Create nonlocal matrix" << endl;
4146 std::cerr << os.str();
4147 }
4148 RCP<crs_matrix_type> nonlocalMatrix =
4149 rcp (new crs_matrix_type (nonlocalRowMap, numEntPerNonlocalRow ()));
4150 {
4151 size_type curPos = 0;
4152 for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4153 ++mapIter, ++curPos) {
4154 const GO gblRow = mapIter->first;
4155 // Get values & column indices by ref, just to avoid copy.
4156 Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4157 Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4158 //const LO numEnt = static_cast<LO> (numEntPerNonlocalRow[curPos]);
4159 nonlocalMatrix->insertGlobalValues (gblRow, gblCols (), vals ());
4160 }
4161 }
4162 // There's no need to fill-complete the nonlocals matrix.
4163 // We just use it as a temporary container for the Export.
4164
4165 // 4. If the original row Map is one to one, then we can Export
4166 // directly from nonlocalMatrix into this. Otherwise, we have
4167 // to create a temporary matrix with a one-to-one row Map,
4168 // Export into that, then Import from the temporary matrix into
4169 // *this.
4170
4171 auto origRowMap = this->getRowMap ();
4172 const bool origRowMapIsOneToOne = origRowMap->isOneToOne ();
4173
4174 int isLocallyComplete = 1; // true by default
4175
4176 if (origRowMapIsOneToOne) {
4177 if (verbose) {
4178 std::ostringstream os;
4179 os << *prefix << "Original row Map is 1-to-1" << endl;
4180 std::cerr << os.str();
4181 }
4182 export_type exportToOrig (nonlocalRowMap, origRowMap);
4183 if (! exportToOrig.isLocallyComplete ()) {
4184 isLocallyComplete = 0;
4185 }
4186 if (verbose) {
4187 std::ostringstream os;
4188 os << *prefix << "doExport from nonlocalMatrix" << endl;
4189 std::cerr << os.str();
4190 }
4191 this->doExport (*nonlocalMatrix, exportToOrig, Tpetra::ADD);
4192 // We're done at this point!
4193 }
4194 else {
4195 if (verbose) {
4196 std::ostringstream os;
4197 os << *prefix << "Original row Map is NOT 1-to-1" << endl;
4198 std::cerr << os.str();
4199 }
4200 // If you ask a Map whether it is one to one, it does some
4201 // communication and stashes intermediate results for later use
4202 // by createOneToOne. Thus, calling createOneToOne doesn't cost
4203 // much more than the original cost of calling isOneToOne.
4204 auto oneToOneRowMap = Tpetra::createOneToOne (origRowMap);
4205 export_type exportToOneToOne (nonlocalRowMap, oneToOneRowMap);
4206 if (! exportToOneToOne.isLocallyComplete ()) {
4207 isLocallyComplete = 0;
4208 }
4209
4210 // Create a temporary matrix with the one-to-one row Map.
4211 //
4212 // TODO (mfh 09 Sep 2016, 12 Sep 2016) Estimate # entries in
4213 // each row, to avoid reallocation during the Export operation.
4214 if (verbose) {
4215 std::ostringstream os;
4216 os << *prefix << "Create & doExport into 1-to-1 matrix"
4217 << endl;
4218 std::cerr << os.str();
4219 }
4220 crs_matrix_type oneToOneMatrix (oneToOneRowMap, 0);
4221 // Export from matrix of nonlocals into the temp one-to-one matrix.
4222 oneToOneMatrix.doExport(*nonlocalMatrix, exportToOneToOne,
4223 Tpetra::ADD);
4224
4225 // We don't need the matrix of nonlocals anymore, so get rid of
4226 // it, to keep the memory high-water mark down.
4227 if (verbose) {
4228 std::ostringstream os;
4229 os << *prefix << "Free nonlocalMatrix" << endl;
4230 std::cerr << os.str();
4231 }
4232 nonlocalMatrix = Teuchos::null;
4233
4234 // Import from the one-to-one matrix to the original matrix.
4235 if (verbose) {
4236 std::ostringstream os;
4237 os << *prefix << "doImport from 1-to-1 matrix" << endl;
4238 std::cerr << os.str();
4239 }
4240 import_type importToOrig (oneToOneRowMap, origRowMap);
4241 this->doImport (oneToOneMatrix, importToOrig, Tpetra::ADD);
4242 }
4243
4244 // It's safe now to clear out nonlocals_, since we've already
4245 // committed side effects to *this. The standard idiom for
4246 // clearing a Container like std::map, is to swap it with an empty
4247 // Container and let the swapped Container fall out of scope.
4248 if (verbose) {
4249 std::ostringstream os;
4250 os << *prefix << "Free nonlocals_ (std::map)" << endl;
4251 std::cerr << os.str();
4252 }
4253 decltype (nonlocals_) newNonlocals;
4254 std::swap (nonlocals_, newNonlocals);
4255
4256 // FIXME (mfh 12 Sep 2016) I don't like this all-reduce, and I
4257 // don't like throwing an exception here. A local return value
4258 // would likely be more useful to users. However, if users find
4259 // themselves exercising nonlocal inserts often, then they are
4260 // probably novice users who need the help. See GitHub Issues
4261 // #603 and #601 (esp. the latter) for discussion.
4262
4263 int isGloballyComplete = 0; // output argument of reduceAll
4264 reduceAll<int, int> (*comm, REDUCE_MIN, isLocallyComplete,
4265 outArg (isGloballyComplete));
4266 TEUCHOS_TEST_FOR_EXCEPTION
4267 (isGloballyComplete != 1, std::runtime_error, "On at least one process, "
4268 "you called insertGlobalValues with a global row index which is not in "
4269 "the matrix's row Map on any process in its communicator.");
4270 }
4271
// Reopen the matrix for modification.
//
// If the matrix owns its graph (isStaticGraph() is false), `params` is
// forwarded to the graph's own resumeFill. When KOKKOSKERNELS_VERSION is
// at least 40299, applyHelper is reset. Finally fillComplete_ is cleared.
//
// \param params  Parameter list; only forwarded to myGraph_->resumeFill.
4272 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4273 void
4275 resumeFill (const Teuchos::RCP<Teuchos::ParameterList>& params)
4276 {
4277 if (! isStaticGraph ()) { // Don't resume fill of a nonowned graph.
4278 myGraph_->resumeFill (params);
4279 }
4280#if KOKKOSKERNELS_VERSION >= 40299
4281 // Delete the apply helper (if it exists)
4282 applyHelper.reset();
4283#endif
4284 fillComplete_ = false;
4285 }
4286
// Report whether global constants are available.
//
// This is a pure forwarder: the answer comes from the matrix's graph
// (getCrsGraphRef().haveGlobalConstants()).
4287 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4288 bool
4290 haveGlobalConstants() const {
4291 return getCrsGraphRef ().haveGlobalConstants ();
4292 }
4293
// Convenience overload of fillComplete that chooses the domain and range
// Maps itself and then calls fillComplete(domainMap, rangeMap, params).
//
// - Static graph that is already fill complete: use the graph's domain
//   and range Maps.
// - Otherwise: use the graph's row Map as both the domain and range Map.
//
// Throws std::logic_error if getCrsGraph() returns null.
//
// \param params  Parameter list, passed through unchanged.
4294 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4295 void
4297 fillComplete (const Teuchos::RCP<Teuchos::ParameterList>& params)
4298 {
4299 const char tfecfFuncName[] = "fillComplete(params): ";
4300
4301 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4302 (this->getCrsGraph ().is_null (), std::logic_error,
4303 "getCrsGraph() returns null. This should not happen at this point. "
4304 "Please report this bug to the Tpetra developers.");
4305
4306 const crs_graph_type& graph = this->getCrsGraphRef ();
4307 if (this->isStaticGraph () && graph.isFillComplete ()) {
4308 // If this matrix's graph is fill complete and the user did not
4309 // supply a domain or range Map, use the graph's domain and
4310 // range Maps.
4311 this->fillComplete (graph.getDomainMap (), graph.getRangeMap (), params);
4312 }
4313 else { // assume that user's row Map is the domain and range Map
4314 Teuchos::RCP<const map_type> rangeMap = graph.getRowMap ();
4315 Teuchos::RCP<const map_type> domainMap = rangeMap;
4316 this->fillComplete (domainMap, rangeMap, params);
4317 }
4318 }
4319
// Finish the fill phase of the matrix, using the given domain and range
// Maps.
//
// Order of operations, as visible in the body:
//   1. Require that fill is active and not already complete; otherwise
//      throw std::runtime_error.
//   2. Read "No Nonlocal Changes" and the sort-ghost-GIDs option (two
//      accepted spellings) from `params`, if it is non-null.
//   3. Allocate values if the graph's indices are not yet allocated, then
//      call globalAssemble() unless the caller asserted that there were no
//      nonlocal inserts or there is only one process.  With one process,
//      leftover entries in nonlocals_ are an error.
//   4. Static graph: (HAVE_TPETRA_DEBUG only) check that the given Maps
//      match the graph's Maps, then fillLocalMatrix.
//      Owned graph: set the graph's domain/range Maps, build the column
//      Map if there is none, make indices local, sort and merge, build
//      Import/Export, fillLocalGraphAndMatrix, compute global or local
//      constants (per "compute global constants"), and mark the graph
//      fill complete.
//   5. Set fillComplete_ and run checkInternalState().
//
// \param domainMap  Domain Map to use.
// \param rangeMap   Range Map to use.
// \param params     Optional parameter list (may be null).
//
// NOTE(review): the profiling-region labels below spell "fillCompete".
// They are runtime strings, so they are left unchanged here.
4320 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4321 void
4323 fillComplete (const Teuchos::RCP<const map_type>& domainMap,
4324 const Teuchos::RCP<const map_type>& rangeMap,
4325 const Teuchos::RCP<Teuchos::ParameterList>& params)
4326 {
4327 using Details::Behavior;
4329 using Teuchos::ArrayRCP;
4330 using Teuchos::RCP;
4331 using Teuchos::rcp;
4332 using std::endl;
4333 const char tfecfFuncName[] = "fillComplete: ";
4334 ProfilingRegion regionFillComplete
4335 ("Tpetra::CrsMatrix::fillComplete");
4336 const bool verbose = Behavior::verbose("CrsMatrix");
4337 std::unique_ptr<std::string> prefix;
4338 if (verbose) {
4339 prefix = this->createPrefix("CrsMatrix", "fillComplete(dom,ran,p)");
4340 std::ostringstream os;
4341 os << *prefix << endl;
4342 std::cerr << os.str ();
4343 }
4345 "Tpetra::CrsMatrix::fillCompete",
4346 "fillCompete");
4347
4348 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4349 (! this->isFillActive () || this->isFillComplete (), std::runtime_error,
4350 "Matrix fill state must be active (isFillActive() "
4351 "must be true) before you may call fillComplete().");
4352 const int numProcs = this->getComm ()->getSize ();
4353
4354 //
4355 // Read parameters from the input ParameterList.
4356 //
4357 {
4358 Details::ProfilingRegion region_fc("Tpetra::CrsMatrix::fillCompete", "ParameterList");
4359
4360 // If true, the caller promises that no process did nonlocal
4361 // changes since the last call to fillComplete.
4362 bool assertNoNonlocalInserts = false;
4363 // If true, makeColMap sorts remote GIDs (within each remote
4364 // process' group).
4365 bool sortGhosts = true;
4366
4367 if (! params.is_null ()) {
4368 assertNoNonlocalInserts = params->get ("No Nonlocal Changes",
4369 assertNoNonlocalInserts);
4370 if (params->isParameter ("sort column map ghost gids")) {
4371 sortGhosts = params->get ("sort column map ghost gids", sortGhosts);
4372 }
4373 else if (params->isParameter ("Sort column Map ghost GIDs")) {
4374 sortGhosts = params->get ("Sort column Map ghost GIDs", sortGhosts);
4375 }
4376 }
4377 // We also don't need to do global assembly if there is only one
4378 // process in the communicator.
4379 const bool needGlobalAssemble = ! assertNoNonlocalInserts && numProcs > 1;
4380 // This parameter only matters if this matrix owns its graph.
4381 if (! this->myGraph_.is_null ()) {
4382 this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts;
4383 }
4384
4385 if (! this->getCrsGraphRef ().indicesAreAllocated ()) {
4386 if (this->hasColMap ()) { // use local indices
4387 allocateValues(LocalIndices, GraphNotYetAllocated, verbose);
4388 }
4389 else { // no column Map, so use global indices
4390 allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
4391 }
4392 }
4393 // Global assemble, if we need to. This call only costs a single
4394 // all-reduce if we didn't need global assembly after all.
4395 if (needGlobalAssemble) {
4396 this->globalAssemble ();
4397 }
4398 else {
4399 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4400 (numProcs == 1 && nonlocals_.size() > 0,
4401 std::runtime_error, "Cannot have nonlocal entries on a serial run. "
4402 "An invalid entry (i.e., with row index not in the row Map) must have "
4403 "been submitted to the CrsMatrix.");
4404 }
4405 }
4406 if (this->isStaticGraph ()) {
4407 Details::ProfilingRegion region_isg("Tpetra::CrsMatrix::fillCompete", "isStaticGraph");
4408 // FIXME (mfh 14 Nov 2016) In order to fix #843, I enable the
4409 // checks below only in debug mode. It would be nicer to do a
4410 // local check, then propagate the error state in a deferred
4411 // way, whenever communication happens. That would reduce the
4412 // cost of checking, to the point where it may make sense to
4413 // enable it even in release mode.
4414#ifdef HAVE_TPETRA_DEBUG
4415 // FIXME (mfh 18 Jun 2014) This check for correctness of the
4416 // input Maps incurs a penalty of two all-reduces for the
4417 // otherwise optimal const graph case.
4418 //
4419 // We could turn these (max) 2 all-reduces into (max) 1, by
4420 // fusing them. We could do this by adding a "locallySameAs"
4421 // method to Map, which would return one of four states:
4422 //
4423 // a. Certainly globally the same
4424 // b. Certainly globally not the same
4425 // c. Locally the same
4426 // d. Locally not the same
4427 //
4428 // The first two states don't require further communication.
4429 // The latter two states require an all-reduce to communicate
4430 // globally, but we only need one all-reduce, since we only need
4431 // to check whether at least one of the Maps is wrong.
4432 const bool domainMapsMatch =
4433 this->staticGraph_->getDomainMap ()->isSameAs (*domainMap);
4434 const bool rangeMapsMatch =
4435 this->staticGraph_->getRangeMap ()->isSameAs (*rangeMap);
4436
4437 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4438 (! domainMapsMatch, std::runtime_error,
4439 "The CrsMatrix's domain Map does not match the graph's domain Map. "
4440 "The graph cannot be changed because it was given to the CrsMatrix "
4441 "constructor as const. You can fix this by passing in the graph's "
4442 "domain Map and range Map to the matrix's fillComplete call.");
4443
4444 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4445 (! rangeMapsMatch, std::runtime_error,
4446 "The CrsMatrix's range Map does not match the graph's range Map. "
4447 "The graph cannot be changed because it was given to the CrsMatrix "
4448 "constructor as const. You can fix this by passing in the graph's "
4449 "domain Map and range Map to the matrix's fillComplete call.");
4450#endif // HAVE_TPETRA_DEBUG
4451
4452 // The matrix does _not_ own the graph, and the graph's
4453 // structure is already fixed, so just fill the local matrix.
4454 this->fillLocalMatrix (params);
4455 }
4456 else {
4457 Details::ProfilingRegion region_insg("Tpetra::CrsMatrix::fillCompete", "isNotStaticGraph");
4458 // Set the graph's domain and range Maps. This will clear the
4459 // Import if the domain Map has changed (is a different
4460 // pointer), and the Export if the range Map has changed (is a
4461 // different pointer).
4462 this->myGraph_->setDomainRangeMaps (domainMap, rangeMap);
4463
4464 // Make the graph's column Map, if necessary.
4465 Teuchos::Array<int> remotePIDs (0);
4466 const bool mustBuildColMap = ! this->hasColMap ();
4467 if (mustBuildColMap) {
4468 this->myGraph_->makeColMap (remotePIDs);
4469 }
4470
4471 // Make indices local, if necessary. The method won't do
4472 // anything if the graph is already locally indexed.
4473 const std::pair<size_t, std::string> makeIndicesLocalResult =
4474 this->myGraph_->makeIndicesLocal(verbose);
4475 // TODO (mfh 20 Jul 2017) Instead of throwing here, pass along
4476 // the error state to makeImportExport
4477 // which may do all-reduces and thus may
4478 // have the opportunity to communicate that error state.
4479 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4480 (makeIndicesLocalResult.first != 0, std::runtime_error,
4481 makeIndicesLocalResult.second);
4482
4483 const bool sorted = this->myGraph_->isSorted ();
4484 const bool merged = this->myGraph_->isMerged ();
4485 this->sortAndMergeIndicesAndValues (sorted, merged);
4486
4487 // Make Import and Export objects, if they haven't been made
4488 // already. If we made a column Map above, reuse information
4489 // from that process to avoid communication in the Import setup.
4490 this->myGraph_->makeImportExport (remotePIDs, mustBuildColMap);
4491
4492 // The matrix _does_ own the graph, so fill the local graph at
4493 // the same time as the local matrix.
4494 this->fillLocalGraphAndMatrix (params);
4495
4496 const bool callGraphComputeGlobalConstants = params.get () == nullptr ||
4497 params->get ("compute global constants", true);
4498 if (callGraphComputeGlobalConstants) {
4499 this->myGraph_->computeGlobalConstants ();
4500 }
4501 else {
4502 this->myGraph_->computeLocalConstants ();
4503 }
4504 this->myGraph_->fillComplete_ = true;
4505 this->myGraph_->checkInternalState ();
4506 }
4507
4508 // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4509
4510 this->fillComplete_ = true; // Now we're fill complete!
4511 {
4512 Details::ProfilingRegion region_cis(
4513 "Tpetra::CrsMatrix::fillCompete", "checkInternalState"
4514 );
4515 this->checkInternalState ();
4516 }
4517 } //fillComplete(domainMap, rangeMap, params)
4518
// Expert-mode fill complete: the caller supplies the domain Map, range
// Map, Import and Export, and no global assembly is performed.
//
// Steps, as visible in the body:
//   1. Require that fill is active and not already complete
//      (std::runtime_error), and that myGraph_ is non-null
//      (std::logic_error).
//   2. Forward all arguments to myGraph_->expertStaticFillComplete.
//   3. Call fillLocalGraphAndMatrix(params).
//   4. Set fillComplete_ = true.
//   5. With HAVE_TPETRA_DEBUG, verify the resulting fill state.
//
// With HAVE_TPETRA_MMM_TIMINGS, each phase is wrapped in a TimeMonitor
// whose name is built from the optional "Timer Label" parameter.
//
// \param domainMap  Domain Map, forwarded to the graph.
// \param rangeMap   Range Map, forwarded to the graph.
// \param importer   Import object, forwarded to the graph.
// \param exporter   Export object, forwarded to the graph.
// \param params     Optional parameter list (may be null).
4519 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4520 void
4522 expertStaticFillComplete (const Teuchos::RCP<const map_type> & domainMap,
4523 const Teuchos::RCP<const map_type> & rangeMap,
4524 const Teuchos::RCP<const import_type>& importer,
4525 const Teuchos::RCP<const export_type>& exporter,
4526 const Teuchos::RCP<Teuchos::ParameterList> &params)
4527 {
4528#ifdef HAVE_TPETRA_MMM_TIMINGS
4529 std::string label;
4530 if(!params.is_null())
4531 label = params->get("Timer Label",label);
4532 std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
4533 using Teuchos::TimeMonitor;
4534
4535 Teuchos::TimeMonitor all(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-all")));
4536#endif
4537
4538 const char tfecfFuncName[] = "expertStaticFillComplete: ";
4539 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( ! isFillActive() || isFillComplete(),
4540 std::runtime_error, "Matrix fill state must be active (isFillActive() "
4541 "must be true) before calling fillComplete().");
4542 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4543 myGraph_.is_null (), std::logic_error, "myGraph_ is null. This is not allowed.");
4544
4545 {
4546#ifdef HAVE_TPETRA_MMM_TIMINGS
4547 Teuchos::TimeMonitor graph(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-Graph")));
4548#endif
4549 // We will presume globalAssemble is not needed, so we do the ESFC on the graph
4550 myGraph_->expertStaticFillComplete (domainMap, rangeMap, importer, exporter,params);
4551 }
4552
4553 {
4554#ifdef HAVE_TPETRA_MMM_TIMINGS
4555 TimeMonitor fLGAM(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-fLGAM")));
4556#endif
4557 // Fill the local graph and matrix
4558 fillLocalGraphAndMatrix (params);
4559 }
4560 // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4561
4562 // Now we're fill complete!
4563 fillComplete_ = true;
4564
4565 // Sanity checks at the end.
4566#ifdef HAVE_TPETRA_DEBUG
// The messages carry no leading ": " because tfecfFuncName already ends
// with one.  Each message names the condition that was actually tested.
4567 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
4568 "We're at the end of fillComplete(), but isFillActive() is true. "
4569 "Please report this bug to the Tpetra developers.");
4570 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isFillComplete(), std::logic_error,
4571 "We're at the end of fillComplete(), but isFillComplete() is false. "
4572 "Please report this bug to the Tpetra developers.");
4573#endif // HAVE_TPETRA_DEBUG
4574 {
4575#ifdef HAVE_TPETRA_MMM_TIMINGS
4576 Teuchos::TimeMonitor cIS(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-M-cIS")));
4577#endif
4578
4580 }
4581 }
4582
// Merge runs of equal column indices in one row, in place.
//
// Walks cols[0..rowLen) and vals[0..rowLen) together.  Whenever an index
// equals the most recently kept index, its value is added to that kept
// entry; otherwise the index/value pair is kept as a new entry.  Only
// consecutive equal indices are merged.
//
// \param rowLen  Number of entries in the row.
// \param cols    Column indices of the row; compacted in place.
// \param vals    Values of the row; compacted in place alongside cols.
// \return        Number of entries left after merging (0 for an empty row).
4583 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4585 mergeRowIndicesAndValues (size_t rowLen, LocalOrdinal* cols, impl_scalar_type* vals)
4586 {
4587 impl_scalar_type* rowValueIter = vals;
4588 // beg,end define a half-exclusive interval over which to iterate.
4589 LocalOrdinal* beg = cols;
4590 LocalOrdinal* end = cols + rowLen;
4591 LocalOrdinal* newend = beg;
4592 if (beg != end) {
4593 LocalOrdinal* cur = beg + 1;
4594 impl_scalar_type* vcur = rowValueIter + 1;
4595 impl_scalar_type* vend = rowValueIter;
4597 while (cur != end) {
4598 if (*cur != *newend) {
4599 // new entry; save it
4600 ++newend;
4601 ++vend;
4602 (*newend) = (*cur);
4603 (*vend) = (*vcur);
4604 }
4605 else {
4606 // old entry; merge it by adding its value to the kept entry
4608 (*vend) += *vcur;
4609 }
4610 ++cur;
4611 ++vcur;
4612 }
4613 ++newend; // one past the last entry, per typical [beg,end) semantics
4614 }
4615 return newend - beg;
4616 }
4617
// Sort and/or merge the column indices and values of every local row.
//
// Nothing is done when both `sorted` and `merged` are true.  Otherwise
// the matrix must own its graph and its storage must not yet be
// optimized.  Each row of the unpacked host storage is then processed in
// a host parallel_reduce: sort2 orders the row when `sorted` is false,
// and mergeRowIndicesAndValues compacts it (updating the row's entry
// count) when `merged` is false.  Afterwards the graph's
// indicesAreSorted_ / noRedundancies_ flags are set accordingly.
//
// \param sorted  True if the rows are already sorted.
// \param merged  True if the rows are already merged.
//
// Throws std::runtime_error for a static graph, and std::logic_error if
// myGraph_ is null or storage is already optimized.
//
// NOTE(review): totalNumDups receives the reduction result but is not
// read afterwards within this function.
4618 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4619 void
4621 sortAndMergeIndicesAndValues (const bool sorted, const bool merged)
4622 {
4624 typedef LocalOrdinal LO;
4625 typedef typename Kokkos::View<LO*, device_type>::HostMirror::execution_space
4626 host_execution_space;
4627 typedef Kokkos::RangePolicy<host_execution_space, LO> range_type;
4628 const char tfecfFuncName[] = "sortAndMergeIndicesAndValues: ";
4629 ProfilingRegion regionSAM ("Tpetra::CrsMatrix::sortAndMergeIndicesAndValues");
4630
4631 if (! sorted || ! merged) {
4632 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4633 (this->isStaticGraph (), std::runtime_error, "Cannot sort or merge with "
4634 "\"static\" (const) graph, since the matrix does not own the graph.");
4635 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4636 (this->myGraph_.is_null (), std::logic_error, "myGraph_ is null, but "
4637 "this matrix claims ! isStaticGraph(). "
4638 "Please report this bug to the Tpetra developers.");
4639 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4640 (this->isStorageOptimized (), std::logic_error, "It is invalid to call "
4641 "this method if the graph's storage has already been optimized. "
4642 "Please report this bug to the Tpetra developers.");
4643
4644 crs_graph_type& graph = * (this->myGraph_);
4645 const LO lclNumRows = static_cast<LO> (this->getLocalNumRows ());
4646 size_t totalNumDups = 0;
4647 {
4648 //Accessing host unpacked (4-array CRS) local matrix.
4649 auto rowBegins_ = graph.getRowPtrsUnpackedHost();
4650 auto rowLengths_ = graph.k_numRowEntries_;
4651 auto vals_ = this->valuesUnpacked_wdv.getHostView(Access::ReadWrite);
4652 auto cols_ = graph.lclIndsUnpacked_wdv.getHostView(Access::ReadWrite);
4653 Kokkos::parallel_reduce ("sortAndMergeIndicesAndValues", range_type (0, lclNumRows),
4654 [=] (const LO lclRow, size_t& numDups) {
4655 size_t rowBegin = rowBegins_(lclRow);
4656 size_t rowLen = rowLengths_(lclRow);
4657 LO* cols = cols_.data() + rowBegin;
4658 impl_scalar_type* vals = vals_.data() + rowBegin;
4659 if (! sorted) {
4660 sort2 (cols, cols + rowLen, vals);
4661 }
4662 if (! merged) {
4663 size_t newRowLength = mergeRowIndicesAndValues (rowLen, cols, vals);
4664 rowLengths_(lclRow) = newRowLength;
4665 numDups += rowLen - newRowLength;
4666 }
4667 }, totalNumDups);
4668 }
4669 if (! sorted) {
4670 graph.indicesAreSorted_ = true; // we just sorted every row
4671 }
4672 if (! merged) {
4673 graph.noRedundancies_ = true; // we just merged every row
4674 }
4675 }
4676 }
4677
// Non-transposed sparse matrix times multivector:
//   Y_in = beta*Y_in + alpha*A*X_in.
//
// NOTE(review): the listing lines that carry the function name and the
// X_in / Y_in parameters are absent here; presumably this is
// CrsMatrix::applyNonTranspose -- confirm against the real source.
//
// Outline, as visible in the body:
//   - alpha == 0: only scale or zero Y_in, then return.
//   - Build X_colMap: an Import of X_in into the cached column Map MV when
//     the graph has an Importer; otherwise X_in itself, or a
//     constant-stride copy of it.
//   - With an Exporter: multiply into the row Map MV with beta = 0, scale
//     or zero Y_in, then Export with ADD_ASSIGN.
//   - Without an Exporter: multiply directly into Y_in, or through the
//     cached row Map MV if Y_in is not constant stride or is the same
//     object as the input.
//   - If Y_in is replicated across more than one process, beta is zeroed
//     on every rank but 0 and the result is summed with Y_in.reduce().
//
// \param alpha  Scaling factor applied to A*X_in.
// \param beta   Scaling factor applied to Y_in (taken by value; modified
//               locally in the replicated case).
4678 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4679 void
4683 Scalar alpha,
4684 Scalar beta) const
4685 {
4687 using Teuchos::RCP;
4688 using Teuchos::rcp;
4689 using Teuchos::rcp_const_cast;
4690 using Teuchos::rcpFromRef;
4691 const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4692 const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one ();
4693
4694 // mfh 05 Jun 2014: Special case for alpha == 0. I added this to
4695 // fix an Ifpack2 test (RILUKSingleProcessUnitTests), which was
4696 // failing only for the Kokkos refactor version of Tpetra. It's a
4697 // good idea regardless to have the bypass.
4698 if (alpha == ZERO) {
4699 if (beta == ZERO) {
4700 Y_in.putScalar (ZERO);
4701 } else if (beta != ONE) {
4702 Y_in.scale (beta);
4703 }
4704 return;
4705 }
4706
4707 // It's possible that X is a view of Y or vice versa. We don't
4708 // allow this (apply() requires that X and Y not alias one
4709 // another), but it's helpful to detect and work around this case.
4710 // We don't try to detect the more subtle cases (e.g., one is a
4711 // subview of the other, but their initial pointers differ). We
4712 // only need to do this if this matrix's Import is trivial;
4713 // otherwise, we don't actually apply the operator from X into Y.
4714
4715 RCP<const import_type> importer = this->getGraph ()->getImporter ();
4716 RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4717
4718 // If beta == 0, then the output MV will be overwritten; none of
4719 // its entries should be read. (Sparse BLAS semantics say that we
4720 // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
4721 // This matters if we need to do an Export operation; see below.
4722 const bool Y_is_overwritten = (beta == ZERO);
4723
4724 // We treat the case of a replicated MV output specially.
4725 const bool Y_is_replicated =
4726 (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
4727
4728 // This is part of the special case for replicated MV output.
4729 // We'll let each process do its thing, but do an all-reduce at
4730 // the end to sum up the results. Setting beta=0 on all processes
4731 // but Proc 0 makes the math work out for the all-reduce. (This
4732 // assumes that the replicated data is correctly replicated, so
4733 // that the data are the same on all processes.)
4734 if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4735 beta = ZERO;
4736 }
4737
4738 // Temporary MV for Import operation. After the block of code
4739 // below, this will be an (Imported if necessary) column Map MV
4740 // ready to give to localApply(...).
4741 RCP<const MV> X_colMap;
4742 if (importer.is_null ()) {
4743 if (! X_in.isConstantStride ()) {
4744 // Not all sparse mat-vec kernels can handle an input MV with
4745 // nonconstant stride correctly, so we have to copy it in that
4746 // case into a constant stride MV. To make a constant stride
4747 // copy of X_in, we force creation of the column (== domain)
4748 // Map MV (if it hasn't already been created, else fetch the
4749 // cached copy). This avoids creating a new MV each time.
4750 RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in, true);
4751 Tpetra::deep_copy (*X_colMapNonConst, X_in);
4752 X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4753 }
4754 else {
4755 // The domain and column Maps are the same, so do the local
4756 // multiply using the domain Map input MV X_in.
4757 X_colMap = rcpFromRef (X_in);
4758 }
4759 }
4760 else { // need to Import source (multi)vector
4761 ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply: Import");
4762
4763 // We're doing an Import anyway, which will copy the relevant
4764 // elements of the domain Map MV X_in into a separate column Map
4765 // MV. Thus, we don't have to worry whether X_in is constant
4766 // stride.
4767 RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in);
4768
4769 // Import from the domain Map MV to the column Map MV.
4770 X_colMapNonConst->doImport (X_in, *importer, INSERT);
4771 X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4772 }
4773
4774 // Temporary MV for doExport (if needed), or for copying a
4775 // nonconstant stride output MV into a constant stride MV. This
4776 // is null if we don't need the temporary MV, that is, if the
4777 // Export is trivial (null).
4778 RCP<MV> Y_rowMap = getRowMapMultiVector (Y_in);
4779
4780 // If we have a nontrivial Export object, we must perform an
4781 // Export. In that case, the local multiply result will go into
4782 // the row Map multivector. We don't have to make a
4783 // constant-stride version of Y_in in this case, because we had to
4784 // make a constant stride Y_rowMap MV and do an Export anyway.
4785 if (! exporter.is_null ()) {
4786 this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, ZERO);
4787 {
4788 ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply: Export");
4789
4790 // If we're overwriting the output MV Y_in completely (beta ==
4791 // 0), then make sure that it is filled with zeros before we
4792 // do the Export. Otherwise, the ADD combine mode will use
4793 // data in Y_in, which is supposed to be zero.
4794 if (Y_is_overwritten) {
4795 Y_in.putScalar (ZERO);
4796 }
4797 else {
4798 // Scale output MV by beta, so that doExport sums in the
4799 // mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
4800 Y_in.scale (beta);
4801 }
4802 // Do the Export operation.
4803 Y_in.doExport (*Y_rowMap, *exporter, ADD_ASSIGN);
4804 }
4805 }
4806 else { // Don't do an Export: row Map and range Map are the same.
4807 //
4808 // If Y_in does not have constant stride, or if the column Map
4809 // MV aliases Y_in, then we can't let the kernel write directly
4810 // to Y_in. Instead, we have to use the cached row (== range)
4811 // Map MV as temporary storage.
4812 //
4813 // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4814 // the user passed in the same MultiVector for both X and Y. It
4815 // won't detect whether one MultiVector views the other. We
4816 // should also check the MultiVectors' raw data pointers.
4817 if (! Y_in.isConstantStride () || X_colMap.getRawPtr () == &Y_in) {
4818 // Force creating the MV if it hasn't been created already.
4819 // This will reuse a previously created cached MV.
4820 Y_rowMap = getRowMapMultiVector (Y_in, true);
4821
4822 // If beta == 0, we don't need to copy Y_in into Y_rowMap,
4823 // since we're overwriting it anyway.
4824 if (beta != ZERO) {
4825 Tpetra::deep_copy (*Y_rowMap, Y_in);
4826 }
4827 this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, beta);
4828 Tpetra::deep_copy (Y_in, *Y_rowMap);
4829 }
4830 else {
4831 this->localApply (*X_colMap, Y_in, Teuchos::NO_TRANS, alpha, beta);
4832 }
4833 }
4834
4835 // If the range Map is a locally replicated Map, sum up
4836 // contributions from each process. We set beta = 0 on all
4837 // processes but Proc 0 initially, so this will handle the scaling
4838 // factor beta correctly.
4839 if (Y_is_replicated) {
4840 ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply: Reduce Y");
4841 Y_in.reduce ();
4842 }
4843 }
4844
// Sparse matrix times multivector with the given transpose `mode`.
//
// NOTE(review): the listing lines that carry the function name and the
// X_in / Y_in parameters are absent here; presumably this is
// CrsMatrix::applyTranspose -- confirm against the real source.
//
// Outline, as visible in the body:
//   - alpha == 0: only scale or zero Y_in, then return.
//   - beta == 0: Y_in is zeroed up front so stale Inf/NaN values cannot
//     survive.
//   - The roles of Import and Export are swapped relative to the
//     non-transposed case: with an Exporter, X_in is first imported into
//     exportMV_; with an Importer, the local product is computed into
//     importMV_ and then exported into Y_in with ADD_ASSIGN.
//   - importMV_ / exportMV_ are cached and recreated when the number of
//     vectors changes.
//   - Without an Importer, the product goes directly into Y_in, or
//     through a deep copy if Y_in is not constant stride or is the same
//     object as X.
//   - If Y_in is replicated across more than one process, beta is zeroed
//     on every rank but 0 and the result is summed with Y_in.reduce().
//
// \param mode   Transpose mode, passed to localApply.
// \param alpha  Scaling factor applied to the matrix product.
// \param beta   Scaling factor applied to Y_in (taken by value; modified
//               locally in the replicated case).
4845 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4846 void
4850 const Teuchos::ETransp mode,
4851 Scalar alpha,
4852 Scalar beta) const
4853 {
4855 using Teuchos::null;
4856 using Teuchos::RCP;
4857 using Teuchos::rcp;
4858 using Teuchos::rcp_const_cast;
4859 using Teuchos::rcpFromRef;
4860 const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4861
4862 // Take shortcuts for alpha == 0.
4863 if (alpha == ZERO) {
4864 // Follow the Sparse BLAS convention by ignoring both the matrix
4865 // and X_in, in this case.
4866 if (beta == ZERO) {
4867 // Follow the Sparse BLAS convention by overwriting any Inf or
4868 // NaN values in Y_in, in this case.
4869 Y_in.putScalar (ZERO);
4870 }
4871 else {
4872 Y_in.scale (beta);
4873 }
4874 return;
4875 }
4876 else if (beta == ZERO) {
4877 //Thyra was implicitly assuming that Y gets set to zero / or is overwritten
4878 //when beta==0. This was not the case with transpose in a multithreaded
4879 //environment where a multiplication with subsequent atomic_adds is used
4880 //since 0 is effectively not special cased. Doing the explicit set to zero here
4881 //This catches cases where Y is nan or inf.
4882 Y_in.putScalar (ZERO);
4883 }
4884
4885 const size_t numVectors = X_in.getNumVectors ();
4886
4887 // We don't allow X_in and Y_in to alias one another. It's hard
4888 // to check this, because advanced users could create views from
4889 // raw pointers. However, if X_in and Y_in reference the same
4890 // object, we will do the user a favor by copying X into new
4891 // storage (with a warning). We only need to do this if we have
4892 // trivial importers; otherwise, we don't actually apply the
4893 // operator from X into Y.
4894 RCP<const import_type> importer = this->getGraph ()->getImporter ();
4895 RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4896 // access X indirectly, in case we need to create temporary storage
4897 RCP<const MV> X;
4898
4899 // some parameters for below
4900 const bool Y_is_replicated = (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
4901 const bool Y_is_overwritten = (beta == ZERO);
4902 if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4903 beta = ZERO;
4904 }
4905
4906 // The kernels do not allow input or output with nonconstant stride.
4907 if (! X_in.isConstantStride () && importer.is_null ()) {
4908 X = rcp (new MV (X_in, Teuchos::Copy)); // Constant-stride copy of X_in
4909 } else {
4910 X = rcpFromRef (X_in); // Reference to X_in
4911 }
4912
4913 // Set up temporary multivectors for Import and/or Export.
4914 if (importer != Teuchos::null) {
4915 if (importMV_ != Teuchos::null && importMV_->getNumVectors() != numVectors) {
4916 importMV_ = null;
4917 }
4918 if (importMV_ == null) {
4919 importMV_ = rcp (new MV (this->getColMap (), numVectors));
4920 }
4921 }
4922 if (exporter != Teuchos::null) {
4923 if (exportMV_ != Teuchos::null && exportMV_->getNumVectors() != numVectors) {
4924 exportMV_ = null;
4925 }
4926 if (exportMV_ == null) {
4927 exportMV_ = rcp (new MV (this->getRowMap (), numVectors));
4928 }
4929 }
4930
4931 // If we have a non-trivial exporter, we must import elements that
4932 // are permuted or are on other processors.
4933 if (! exporter.is_null ()) {
4934 ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply (transpose): Import");
4935 exportMV_->doImport (X_in, *exporter, INSERT);
4936 X = exportMV_; // multiply out of exportMV_
4937 }
4938
4939 // If we have a non-trivial importer, we must export elements that
4940 // are permuted or belong to other processors. We will compute
4941 // solution into the to-be-exported MV; get a view.
4942 if (importer != Teuchos::null) {
4943 ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply (transpose): Export");
4944
4945 // FIXME (mfh 18 Apr 2015) Temporary fix suggested by Clark
4946 // Dohrmann on Fri 17 Apr 2015. At some point, we need to go
4947 // back and figure out why this helps. importMV_ SHOULD be
4948 // completely overwritten in the localApply(...) call
4949 // below, because beta == ZERO there.
4950 importMV_->putScalar (ZERO);
4951 // Do the local computation.
4952 this->localApply (*X, *importMV_, mode, alpha, ZERO);
4953
4954 if (Y_is_overwritten) {
4955 Y_in.putScalar (ZERO);
4956 } else {
4957 Y_in.scale (beta);
4958 }
4959 Y_in.doExport (*importMV_, *importer, ADD_ASSIGN);
4960 }
4961 // otherwise, multiply into Y
4962 else {
4963 // can't multiply in-situ; can't multiply into non-strided multivector
4964 //
4965 // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4966 // the user passed in the same MultiVector for both X and Y. It
4967 // won't detect whether one MultiVector views the other. We
4968 // should also check the MultiVectors' raw data pointers.
4969 if (! Y_in.isConstantStride () || X.getRawPtr () == &Y_in) {
4970 // Make a deep copy of Y_in, into which to write the multiply result.
4971 MV Y (Y_in, Teuchos::Copy);
4972 this->localApply (*X, Y, mode, alpha, beta);
4973 Tpetra::deep_copy (Y_in, Y);
4974 } else {
4975 this->localApply (*X, Y_in, mode, alpha, beta);
4976 }
4977 }
4978
4979 // If the range Map is a locally replicated map, sum the
4980 // contributions from each process. (That's why we set beta=0
4981 // above for all processes but Proc 0.)
4982 if (Y_is_replicated) {
4983 ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply (transpose): Reduce Y");
4984 Y_in.reduce ();
4985 }
4986 }
4987
// Apply this process's local matrix to X and store the result in Y,
// forwarding mode, alpha, and beta to the local sparse kernel.
//
// With KOKKOSKERNELS_VERSION >= 40299 this calls KokkosSparse::spmv through
// a lazily created applyHelper; otherwise it calls apply() or
// applyImbalancedRows() on the local multiply operator.  When
// Details::Behavior::debug() is true, the inputs are validated first and
// std::runtime_error is thrown on any mismatch (dimensions, fill-complete
// state, constant stride, aliasing of X and Y).
4988 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4989 void
4993 const Teuchos::ETransp mode,
4994 const Scalar& alpha,
4995 const Scalar& beta) const
4996 {
4998 using Teuchos::NO_TRANS;
4999 ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localApply");
5000
5001 auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
5002 auto Y_lcl = Y.getLocalViewDevice(Access::ReadWrite);
5003
5004 const bool debug = ::Tpetra::Details::Behavior::debug ();
5005 if (debug) {
5006 const char tfecfFuncName[] = "localApply: ";
5007 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5008 (X.getNumVectors () != Y.getNumVectors (), std::runtime_error,
5009 "X.getNumVectors() = " << X.getNumVectors () << " != "
5010 "Y.getNumVectors() = " << Y.getNumVectors () << ".");
5011 const bool transpose = (mode != Teuchos::NO_TRANS);
5012 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5013 (! transpose && X.getLocalLength () !=
5014 getColMap ()->getLocalNumElements (), std::runtime_error,
5015 "NO_TRANS case: X has the wrong number of local rows. "
5016 "X.getLocalLength() = " << X.getLocalLength () << " != "
5017 "getColMap()->getLocalNumElements() = " <<
5018 getColMap ()->getLocalNumElements () << ".");
5019 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5020 (! transpose && Y.getLocalLength () !=
5021 getRowMap ()->getLocalNumElements (), std::runtime_error,
5022 "NO_TRANS case: Y has the wrong number of local rows. "
5023 "Y.getLocalLength() = " << Y.getLocalLength () << " != "
5024 "getRowMap()->getLocalNumElements() = " <<
5025 getRowMap ()->getLocalNumElements () << ".");
5026 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5027 (transpose && X.getLocalLength () !=
5028 getRowMap ()->getLocalNumElements (), std::runtime_error,
5029 "TRANS or CONJ_TRANS case: X has the wrong number of local "
5030 "rows. X.getLocalLength() = " << X.getLocalLength ()
5031 << " != getRowMap()->getLocalNumElements() = "
5032 << getRowMap ()->getLocalNumElements () << ".");
// This check is on Y (the output), so the message names Y, not X.
5033 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5034 (transpose && Y.getLocalLength () !=
5035 getColMap ()->getLocalNumElements (), std::runtime_error,
5036 "TRANS or CONJ_TRANS case: Y has the wrong number of local "
5037 "rows. Y.getLocalLength() = " << Y.getLocalLength ()
5038 << " != getColMap()->getLocalNumElements() = "
5039 << getColMap ()->getLocalNumElements () << ".");
5040 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5041 (! isFillComplete (), std::runtime_error, "The matrix is not "
5042 "fill complete. You must call fillComplete() (possibly with "
5043 "domain and range Map arguments) without an intervening "
5044 "resumeFill() call before you may call this method.");
5045 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5046 (! X.isConstantStride () || ! Y.isConstantStride (),
5047 std::runtime_error, "X and Y must be constant stride.");
5048 // If the two pointers are null, then they don't alias one
5049 // another, even though they are equal.
5050 // Kokkos does not guarantee that zero row-extent vectors
5051 // point to different places, so we have to check that too.
5052 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5053 (X_lcl.data () == Y_lcl.data () && X_lcl.data () != nullptr
5054 && X_lcl.extent(0) != 0,
5055 std::runtime_error, "X and Y may not alias one another.");
5056 }
5057
5058#if KOKKOSKERNELS_VERSION >= 40299
5059 auto A_lcl = getLocalMatrixDevice();
5060
5061 if(!applyHelper.get()) {
5062 // The apply helper does not exist, so create it.
5063 // Decide now whether to use the imbalanced row path, or the default.
5064 bool useMergePath = false;
5065#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
5066 //TODO: when https://github.com/kokkos/kokkos-kernels/issues/2166 is fixed,
5067 //we can use SPMV_MERGE_PATH for the native spmv as well.
5068 //Take out this ifdef to enable that.
5069 //
5070 //Until then, only use SPMV_MERGE_PATH when calling cuSPARSE.
5071 if constexpr(std::is_same_v<execution_space, Kokkos::Cuda>) {
5072 LocalOrdinal nrows = getLocalNumRows();
5073 LocalOrdinal maxRowImbalance = 0;
// Imbalance = longest local row minus the (integer) mean row length.
5074 if(nrows != 0)
5075 maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
5076
5077 if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
5078 useMergePath = true;
5079 }
5080#endif
5081 applyHelper = std::make_shared<ApplyHelper>(A_lcl.nnz(), A_lcl.graph.row_map,
5082 useMergePath ? KokkosSparse::SPMV_MERGE_PATH : KokkosSparse::SPMV_DEFAULT);
5083 }
5084
5085 // Translate mode (Teuchos enum) to KokkosKernels (1-character string)
5086 const char* modeKK = nullptr;
5087 switch(mode)
5088 {
5089 case Teuchos::NO_TRANS:
5090 modeKK = KokkosSparse::NoTranspose; break;
5091 case Teuchos::TRANS:
5092 modeKK = KokkosSparse::Transpose; break;
5093 case Teuchos::CONJ_TRANS:
5094 modeKK = KokkosSparse::ConjugateTranspose; break;
5095 default:
5096 throw std::invalid_argument("Tpetra::CrsMatrix::localApply: invalid mode");
5097 }
5098
// The helper decides whether to run spmv on a copy of the matrix obtained
// from getIntRowptrMatrix() or on A_lcl directly; each path has its own handle.
5099 if(applyHelper->shouldUseIntRowptrs())
5100 {
5101 auto A_lcl_int_rowptrs = applyHelper->getIntRowptrMatrix(A_lcl);
5102 KokkosSparse::spmv(
5103 &applyHelper->handle_int, modeKK,
5104 impl_scalar_type(alpha), A_lcl_int_rowptrs, X_lcl, impl_scalar_type(beta), Y_lcl);
5105 }
5106 else
5107 {
5108 KokkosSparse::spmv(
5109 &applyHelper->handle, modeKK,
5110 impl_scalar_type(alpha), A_lcl, X_lcl, impl_scalar_type(beta), Y_lcl);
5111 }
5112#else
// Older KokkosKernels: pick the imbalanced-rows kernel on every call, using
// the same imbalance measure as above.
5113 LocalOrdinal nrows = getLocalNumRows();
5114 LocalOrdinal maxRowImbalance = 0;
5115 if(nrows != 0)
5116 maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
5117
5118 auto matrix_lcl = getLocalMultiplyOperator();
5119 if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
5120 matrix_lcl->applyImbalancedRows (X_lcl, Y_lcl, mode, alpha, beta);
5121 else
5122 matrix_lcl->apply (X_lcl, Y_lcl, mode, alpha, beta);
5123#endif
5124 }
5125
// Public entry point for the matrix apply: dispatches on `mode`.
// Teuchos::NO_TRANS goes to applyNonTranspose (which takes no mode argument);
// every other mode goes to applyTranspose with mode forwarded.
// Throws std::runtime_error if the matrix is not fill complete.
5126 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5127 void
5131 Teuchos::ETransp mode,
5132 Scalar alpha,
5133 Scalar beta) const
5134 {
5136 const char fnName[] = "Tpetra::CrsMatrix::apply";
5137
5138 TEUCHOS_TEST_FOR_EXCEPTION
5139 (! isFillComplete (), std::runtime_error,
5140 fnName << ": Cannot call apply() until fillComplete() "
5141 "has been called.");
5142
5143 if (mode == Teuchos::NO_TRANS) {
// The non-transpose profiling region reuses fnName as its label.
5144 ProfilingRegion regionNonTranspose (fnName);
5145 this->applyNonTranspose (X, Y, alpha, beta);
5146 }
5147 else {
5148 ProfilingRegion regionTranspose ("Tpetra::CrsMatrix::apply (transpose)");
5149 this->applyTranspose (X, Y, mode, alpha, beta);
5150 }
5151 }
5152
5153
// Return a new, fill-complete CrsMatrix whose scalar type is T.
// The new matrix is constructed from this matrix's graph (getCrsGraph()),
// its values are filled from this matrix's local device values via
// copyConvert, and it is fill-completed with this matrix's domain and range
// Maps.  Throws std::runtime_error if this matrix is not fill complete.
5154 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5155 template<class T>
5156 Teuchos::RCP<CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> >
5158 convert () const
5159 {
5160 using Teuchos::RCP;
5161 typedef CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> output_matrix_type;
5162 const char tfecfFuncName[] = "convert: ";
5163
5164 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5165 (! this->isFillComplete (), std::runtime_error, "This matrix (the source "
5166 "of the conversion) is not fill complete. You must first call "
5167 "fillComplete() (possibly with the domain and range Map) without an "
5168 "intervening call to resumeFill(), before you may call this method.");
5169
5170 RCP<output_matrix_type> newMatrix
5171 (new output_matrix_type (this->getCrsGraph ()));
5172 // Copy old values into new values. impl_scalar_type and T may
5173 // differ, so we can't use Kokkos::deep_copy.
// copyConvert(destination, source): first argument is the new matrix's values.
5175 copyConvert (newMatrix->getLocalMatrixDevice ().values,
5176 this->getLocalMatrixDevice ().values);
5177 // Since newmat has a static (const) graph, the graph already has
5178 // a column Map, and Import and Export objects already exist (if
5179 // applicable). Thus, calling fillComplete is cheap.
5180 newMatrix->fillComplete (this->getDomainMap (), this->getRangeMap ());
5181
5182 return newMatrix;
5183 }
5184
5185
// Debug-only consistency check of the matrix's internal state.
// Does nothing unless Details::Behavior::debug("CrsGraph") is true; otherwise
// throws std::logic_error if any of the invariants tested below is violated.
// NOTE(review): the debug switch is keyed on "CrsGraph" rather than a
// matrix-specific name -- presumably intentional; confirm.
5186 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5187 void
5189 checkInternalState () const
5190 {
5191 const bool debug = ::Tpetra::Details::Behavior::debug ("CrsGraph");
5192 if (debug) {
5193 const char tfecfFuncName[] = "checkInternalState: ";
5194 const char err[] = "Internal state is not consistent. "
5195 "Please report this bug to the Tpetra developers.";
5196
5197 // This version of the graph (RCP<const crs_graph_type>) must
5198 // always be nonnull.
5199 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5200 (staticGraph_.is_null (), std::logic_error, err);
5201 // myGraph == null means that the matrix has a const ("static")
5202 // graph. Otherwise, the matrix has a dynamic graph (it owns its
5203 // graph).
// When myGraph_ is set, it must refer to the same graph as staticGraph_.
5204 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5205 (! myGraph_.is_null () && myGraph_ != staticGraph_,
5206 std::logic_error, err);
5207 // if matrix is fill complete, then graph must be fill complete
5208 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5209 (isFillComplete () && ! staticGraph_->isFillComplete (),
5210 std::logic_error, err << " Specifically, the matrix is fill complete, "
5211 "but its graph is NOT fill complete.");
5212 // if values are allocated and they are non-zero in number, then
5213 // one of the allocations should be present
// Concretely: allocated graph indices with a nonzero allocation size and at
// least one local row imply a nonempty valuesUnpacked_wdv.
5214 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5215 (staticGraph_->indicesAreAllocated () &&
5216 staticGraph_->getLocalAllocationSize() > 0 &&
5217 staticGraph_->getLocalNumRows() > 0 &&
5218 valuesUnpacked_wdv.extent (0) == 0,
5219 std::logic_error, err);
5220 }
5221 }
5222
// Build a one-line, brace-delimited summary of this matrix: the object label
// (only if nonempty), whether the matrix is fill complete, and its global
// dimensions.  The global entry count is included only when fill complete.
5223 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5224 std::string
5226 description () const
5227 {
5228 std::ostringstream os;
5229
5230 os << "Tpetra::CrsMatrix (Kokkos refactor): {";
5231 if (this->getObjectLabel () != "") {
5232 os << "Label: \"" << this->getObjectLabel () << "\", ";
5233 }
5234 if (isFillComplete ()) {
5235 os << "isFillComplete: true"
5236 << ", global dimensions: [" << getGlobalNumRows () << ", "
5237 << getGlobalNumCols () << "]"
5238 << ", global number of entries: " << getGlobalNumEntries ()
5239 << "}";
5240 }
5241 else {
5242 os << "isFillComplete: false"
5243 << ", global dimensions: [" << getGlobalNumRows () << ", "
5244 << getGlobalNumCols () << "]}";
5245 }
5246 return os.str ();
5247 }
5248
// Print a description of this matrix to `out`, with the amount of detail
// selected by verbLevel.  VERB_DEFAULT is treated as VERB_LOW and VERB_NONE
// prints nothing.  Header information is written only by rank 0; at
// VERB_MEDIUM and above each rank prints its own section in rank order, and
// every rank calls comm->barrier() inside those per-rank loops.
5249 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5250 void
5252 describe (Teuchos::FancyOStream &out,
5253 const Teuchos::EVerbosityLevel verbLevel) const
5254 {
5255 using std::endl;
5256 using std::setw;
5257 using Teuchos::ArrayView;
5258 using Teuchos::Comm;
5259 using Teuchos::RCP;
5260 using Teuchos::TypeNameTraits;
5261 using Teuchos::VERB_DEFAULT;
5262 using Teuchos::VERB_NONE;
5263 using Teuchos::VERB_LOW;
5264 using Teuchos::VERB_MEDIUM;
5265 using Teuchos::VERB_HIGH;
5266 using Teuchos::VERB_EXTREME;
5267
5268 const Teuchos::EVerbosityLevel vl = (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
5269
5270 if (vl == VERB_NONE) {
5271 return; // Don't print anything at all
5272 }
5273
5274 // By convention, describe() always begins with a tab.
5275 Teuchos::OSTab tab0 (out);
5276
5277 RCP<const Comm<int> > comm = this->getComm();
5278 const int myRank = comm->getRank();
5279 const int numProcs = comm->getSize();
// Column width for the per-row table: number of decimal digits in the global
// row count, but at least 11, plus 2 characters of padding.
5280 size_t width = 1;
5281 for (size_t dec=10; dec<getGlobalNumRows(); dec *= 10) {
5282 ++width;
5283 }
5284 width = std::max<size_t> (width, static_cast<size_t> (11)) + 2;
5285
5286 // none: print nothing
5287 // low: print O(1) info from node 0
5288 // medium: print O(P) info, num entries per process
5289 // high: print O(N) info, num entries per row
5290 // extreme: print O(NNZ) info: print indices and values
5291 //
5292 // for medium and higher, print constituent objects at specified verbLevel
5293 if (myRank == 0) {
5294 out << "Tpetra::CrsMatrix (Kokkos refactor):" << endl;
5295 }
5296 Teuchos::OSTab tab1 (out);
5297
5298 if (myRank == 0) {
5299 if (this->getObjectLabel () != "") {
5300 out << "Label: \"" << this->getObjectLabel () << "\", ";
5301 }
5302 {
5303 out << "Template parameters:" << endl;
5304 Teuchos::OSTab tab2 (out);
5305 out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
5306 << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
5307 << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
5308 << "Node: " << TypeNameTraits<Node>::name () << endl;
5309 }
5310 if (isFillComplete()) {
5311 out << "isFillComplete: true" << endl
5312 << "Global dimensions: [" << getGlobalNumRows () << ", "
5313 << getGlobalNumCols () << "]" << endl
5314 << "Global number of entries: " << getGlobalNumEntries () << endl
5315 << endl << "Global max number of entries in a row: "
5316 << getGlobalMaxNumRowEntries () << endl;
5317 }
5318 else {
5319 out << "isFillComplete: false" << endl
5320 << "Global dimensions: [" << getGlobalNumRows () << ", "
5321 << getGlobalNumCols () << "]" << endl;
5322 }
5323 }
5324
5325 if (vl < VERB_MEDIUM) {
5326 return; // all done!
5327 }
5328
5329 // Describe the row Map.
5330 if (myRank == 0) {
5331 out << endl << "Row Map:" << endl;
5332 }
5333 if (getRowMap ().is_null ()) {
5334 if (myRank == 0) {
5335 out << "null" << endl;
5336 }
5337 }
5338 else {
5339 if (myRank == 0) {
5340 out << endl;
5341 }
5342 getRowMap ()->describe (out, vl);
5343 }
5344
5345 // Describe the column Map.
// A Map that compares equal (as RCP) to one already printed is only named,
// not described again; the same rule is used for the domain and range Maps.
5346 if (myRank == 0) {
5347 out << "Column Map: ";
5348 }
5349 if (getColMap ().is_null ()) {
5350 if (myRank == 0) {
5351 out << "null" << endl;
5352 }
5353 } else if (getColMap () == getRowMap ()) {
5354 if (myRank == 0) {
5355 out << "same as row Map" << endl;
5356 }
5357 } else {
5358 if (myRank == 0) {
5359 out << endl;
5360 }
5361 getColMap ()->describe (out, vl);
5362 }
5363
5364 // Describe the domain Map.
5365 if (myRank == 0) {
5366 out << "Domain Map: ";
5367 }
5368 if (getDomainMap ().is_null ()) {
5369 if (myRank == 0) {
5370 out << "null" << endl;
5371 }
5372 } else if (getDomainMap () == getRowMap ()) {
5373 if (myRank == 0) {
5374 out << "same as row Map" << endl;
5375 }
5376 } else if (getDomainMap () == getColMap ()) {
5377 if (myRank == 0) {
5378 out << "same as column Map" << endl;
5379 }
5380 } else {
5381 if (myRank == 0) {
5382 out << endl;
5383 }
5384 getDomainMap ()->describe (out, vl);
5385 }
5386
5387 // Describe the range Map.
5388 if (myRank == 0) {
5389 out << "Range Map: ";
5390 }
5391 if (getRangeMap ().is_null ()) {
5392 if (myRank == 0) {
5393 out << "null" << endl;
5394 }
5395 } else if (getRangeMap () == getDomainMap ()) {
5396 if (myRank == 0) {
5397 out << "same as domain Map" << endl;
5398 }
5399 } else if (getRangeMap () == getRowMap ()) {
5400 if (myRank == 0) {
5401 out << "same as row Map" << endl;
5402 }
5403 } else {
5404 if (myRank == 0) {
5405 out << endl;
5406 }
5407 getRangeMap ()->describe (out, vl);
5408 }
5409
5410 // O(P) data
// Ranks take turns: only the rank equal to curRank prints in each iteration,
// while all ranks reach the barriers at the bottom of the loop body.
5411 for (int curRank = 0; curRank < numProcs; ++curRank) {
5412 if (myRank == curRank) {
5413 out << "Process rank: " << curRank << endl;
5414 Teuchos::OSTab tab2 (out);
5415 if (! staticGraph_->indicesAreAllocated ()) {
5416 out << "Graph indices not allocated" << endl;
5417 }
5418 else {
5419 out << "Number of allocated entries: "
5420 << staticGraph_->getLocalAllocationSize () << endl;
5421 }
5422 out << "Number of entries: " << getLocalNumEntries () << endl
5423 << "Max number of entries per row: " << getLocalMaxNumRowEntries ()
5424 << endl;
5425 }
5426 // Give output time to complete by executing some barriers.
5427 comm->barrier ();
5428 comm->barrier ();
5429 comm->barrier ();
5430 }
5431
5432 if (vl < VERB_HIGH) {
5433 return; // all done!
5434 }
5435
5436 // O(N) and O(NNZ) data
5437 for (int curRank = 0; curRank < numProcs; ++curRank) {
5438 if (myRank == curRank) {
5439 out << std::setw(width) << "Proc Rank"
5440 << std::setw(width) << "Global Row"
5441 << std::setw(width) << "Num Entries";
5442 if (vl == VERB_EXTREME) {
5443 out << std::setw(width) << "(Index,Value)";
5444 }
5445 out << endl;
5446 for (size_t r = 0; r < getLocalNumRows (); ++r) {
5447 const size_t nE = getNumEntriesInLocalRow(r);
5448 GlobalOrdinal gid = getRowMap()->getGlobalElement(r);
5449 out << std::setw(width) << myRank
5450 << std::setw(width) << gid
5451 << std::setw(width) << nE;
5452 if (vl == VERB_EXTREME) {
// Column indices are always printed as global indices: locally indexed rows
// are translated through the column Map.
5453 if (isGloballyIndexed()) {
5454 global_inds_host_view_type rowinds;
5455 values_host_view_type rowvals;
5456 getGlobalRowView (gid, rowinds, rowvals);
5457 for (size_t j = 0; j < nE; ++j) {
5458 out << " (" << rowinds[j]
5459 << ", " << rowvals[j]
5460 << ") ";
5461 }
5462 }
5463 else if (isLocallyIndexed()) {
5464 local_inds_host_view_type rowinds;
5465 values_host_view_type rowvals;
5466 getLocalRowView (r, rowinds, rowvals);
5467 for (size_t j=0; j < nE; ++j) {
5468 out << " (" << getColMap()->getGlobalElement(rowinds[j])
5469 << ", " << rowvals[j]
5470 << ") ";
5471 }
5472 } // globally or locally indexed
5473 } // vl == VERB_EXTREME
5474 out << endl;
5475 } // for each row r on this process
5476 } // if (myRank == curRank)
5477
5478 // Give output time to complete
5479 comm->barrier ();
5480 comm->barrier ();
5481 comm->barrier ();
5482 } // for each process p
5483 }
5484
// Report whether `source` is an acceptable source object for this matrix.
// Returns true exactly when `source` can be dynamic_cast to row_matrix_type;
// no sizes are actually compared.
5485 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5486 bool
5488 checkSizes (const SrcDistObject& source)
5489 {
5490 // It's not clear what kind of compatibility checks on sizes can
5491 // be performed here. Epetra_CrsGraph doesn't check any sizes for
5492 // compatibility.
5493
5494 // Currently, the source object must be a RowMatrix with the same
5495 // four template parameters as the target CrsMatrix. We might
5496 // relax this requirement later.
5497 const row_matrix_type* srcRowMat =
5498 dynamic_cast<const row_matrix_type*> (&source);
5499 return (srcRowMat != nullptr);
5500 }
5501
// Apply `padding` to this matrix's unpacked storage.
// Steps, in order: allocate storage if the graph's indices are not yet
// allocated; build device copies of the begin row pointers and a separate
// end-row-pointer array; call padCrsArrays on the graph's global or local
// index array together with valuesUnpacked_wdv; refresh k_numRowEntries_
// when it is in use; install the begin row pointers on myGraph_.
// When `verbose` is true, progress messages are written to std::cerr.
// Throws std::logic_error if the value and index arrays differ in length
// after padding.
5502 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5503 void
5506 const typename crs_graph_type::padding_type& padding,
5507 const bool verbose)
5508 {
5511 using std::endl;
5512 using LO = local_ordinal_type;
5513 using row_ptrs_type =
5514 typename local_graph_device_type::row_map_type::non_const_type;
5515 using range_policy =
5516 Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
5517 const char tfecfFuncName[] = "applyCrsPadding";
5518 const char suffix[] =
5519 ". Please report this bug to the Tpetra developers.";
5520 ProfilingRegion regionCAP("Tpetra::CrsMatrix::applyCrsPadding");
5521
5522 std::unique_ptr<std::string> prefix;
5523 if (verbose) {
5524 prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
5525 std::ostringstream os;
5526 os << *prefix << "padding: ";
5527 padding.print(os);
5528 os << endl;
5529 std::cerr << os.str();
5530 }
// myRank is only looked up in verbose mode; it is -1 otherwise, or when the
// Map or its communicator is null.
5531 const int myRank = ! verbose ? -1 : [&] () {
5532 auto map = this->getMap();
5533 if (map.is_null()) {
5534 return -1;
5535 }
5536 auto comm = map->getComm();
5537 if (comm.is_null()) {
5538 return -1;
5539 }
5540 return comm->getRank();
5541 } ();
5542
5543 // NOTE (mfh 29 Jan 2020) This allocates the values array.
// NOTE(review): the verbose message below says "allocateIndices", but the
// function actually called is allocateValues -- confirm the intended wording.
5544 if (! myGraph_->indicesAreAllocated()) {
5545 if (verbose) {
5546 std::ostringstream os;
5547 os << *prefix << "Call allocateIndices" << endl;
5548 std::cerr << os.str();
5549 }
5550 allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
5551 }
5552
5553 // FIXME (mfh 10 Feb 2020) We shouldn't actually reallocate
5554 // row_ptrs_beg or allocate row_ptrs_end unless the allocation
5555 // size needs to increase. That should be the job of
5556 // padCrsArrays.
5557
5558 // Making copies here because rowPtrsUnpacked_ has a const type. Otherwise, we
5559 // would use it directly.
5560
5561 if (verbose) {
5562 std::ostringstream os;
5563 os << *prefix << "Allocate row_ptrs_beg: "
5564 << myGraph_->getRowPtrsUnpackedHost().extent(0) << endl;
5565 std::cerr << os.str();
5566 }
5567 using Kokkos::view_alloc;
5568 using Kokkos::WithoutInitializing;
5569 row_ptrs_type row_ptr_beg(view_alloc("row_ptr_beg", WithoutInitializing),
5570 myGraph_->rowPtrsUnpacked_dev_.extent(0));
5571 // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
5572 Kokkos::deep_copy(execution_space(),row_ptr_beg, myGraph_->rowPtrsUnpacked_dev_);
5573
// N is the number of rows: one less than the row-pointer length, or 0 when
// the row-pointer array is empty.
5574 const size_t N = row_ptr_beg.extent(0) == 0 ? size_t(0) :
5575 size_t(row_ptr_beg.extent(0) - 1);
5576 if (verbose) {
5577 std::ostringstream os;
5578 os << *prefix << "Allocate row_ptrs_end: " << N << endl;
5579 std::cerr << os.str();
5580 }
5581 row_ptrs_type row_ptr_end(
5582 view_alloc("row_ptr_end", WithoutInitializing), N);
5583
5584 row_ptrs_type num_row_entries_d;
5585
// A nonempty k_numRowEntries_ selects the branch that derives each row's end
// from begin + entry count; otherwise each row ends where the next begins.
5586 const bool refill_num_row_entries =
5587 myGraph_->k_numRowEntries_.extent(0) != 0;
5588
5589 if (refill_num_row_entries) { // unpacked storage
5590 // We can't assume correct *this capture until C++17, and it's
5591 // likely more efficient just to capture what we need anyway.
5592 num_row_entries_d = create_mirror_view_and_copy(memory_space(),
5593 myGraph_->k_numRowEntries_);
5594 Kokkos::parallel_for
5595 ("Fill end row pointers", range_policy(0, N),
5596 KOKKOS_LAMBDA (const size_t i) {
5597 row_ptr_end(i) = row_ptr_beg(i) + num_row_entries_d(i);
5598 });
5599 }
5600 else {
5601 // FIXME (mfh 04 Feb 2020) Fix padCrsArrays so that if packed
5602 // storage, we don't need row_ptr_end to be separate allocation;
5603 // could just have it alias row_ptr_beg+1.
5604 Kokkos::parallel_for
5605 ("Fill end row pointers", range_policy(0, N),
5606 KOKKOS_LAMBDA (const size_t i) {
5607 row_ptr_end(i) = row_ptr_beg(i+1);
5608 });
5609 }
5610
// Pad whichever index array the graph currently uses, together with the
// values, then check that both arrays ended up with the same length.
5611 if (myGraph_->isGloballyIndexed()) {
5612 padCrsArrays(row_ptr_beg, row_ptr_end,
5613 myGraph_->gblInds_wdv,
5614 valuesUnpacked_wdv, padding, myRank, verbose);
5615 const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5616 const auto newColIndsLen = myGraph_->gblInds_wdv.extent(0);
5617 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5618 (newValuesLen != newColIndsLen, std::logic_error,
5619 ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5620 << " != myGraph_->gblInds_wdv.extent(0)=" << newColIndsLen
5621 << suffix);
5622 }
5623 else {
5624 padCrsArrays(row_ptr_beg, row_ptr_end,
5625 myGraph_->lclIndsUnpacked_wdv,
5626 valuesUnpacked_wdv, padding, myRank, verbose);
5627 const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5628 const auto newColIndsLen = myGraph_->lclIndsUnpacked_wdv.extent(0);
5629 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5630 (newValuesLen != newColIndsLen, std::logic_error,
5631 ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5632 << " != myGraph_->lclIndsUnpacked_wdv.extent(0)=" << newColIndsLen
5633 << suffix);
5634 }
5635
// Recompute the per-row entry counts from the begin/end pointers and copy
// them back into the graph's k_numRowEntries_.
5636 if (refill_num_row_entries) {
5637 Kokkos::parallel_for
5638 ("Fill num entries", range_policy(0, N),
5639 KOKKOS_LAMBDA (const size_t i) {
5640 num_row_entries_d(i) = row_ptr_end(i) - row_ptr_beg(i);
5641 });
5642 Kokkos::deep_copy(myGraph_->k_numRowEntries_, num_row_entries_d);
5643 }
5644
// NOTE(review): the size assertion below runs only when verbose is true.
5645 if (verbose) {
5646 std::ostringstream os;
5647 os << *prefix << "Assign myGraph_->rowPtrsUnpacked_; "
5648 << "old size: " << myGraph_->rowPtrsUnpacked_host_.extent(0)
5649 << ", new size: " << row_ptr_beg.extent(0) << endl;
5650 std::cerr << os.str();
5651 TEUCHOS_ASSERT( myGraph_->getRowPtrsUnpackedHost().extent(0) ==
5652 row_ptr_beg.extent(0) );
5653 }
5654 myGraph_->setRowPtrsUnpacked(row_ptr_beg);
5655 }
5656
// Copy rows from srcMat into this matrix by REPLACE-ing existing entries.
// The first numSameIDs local rows are copied under the same global row index;
// then, for each p < numPermutes, source local row permuteFromLIDs[p] is
// copied into target local row permuteToLIDs[p].  If the source is locally
// indexed, each row is fetched with getGlobalRowCopy into reusable buffers;
// otherwise a row view is used.  In debug mode (Behavior::debug("CrsGraph"))
// a row-length mismatch throws std::logic_error.  In verbose mode
// (Behavior::verbose("CrsGraph")) progress messages are written to std::cerr.
5657 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5658 void
5662 const size_t numSameIDs,
5663 const LocalOrdinal permuteToLIDs[],
5664 const LocalOrdinal permuteFromLIDs[],
5665 const size_t numPermutes)
5666 {
5668 using Teuchos::Array;
5669 using Teuchos::ArrayView;
5670 using std::endl;
5671 using LO = LocalOrdinal;
5672 using GO = GlobalOrdinal;
5673 const char tfecfFuncName[] = "copyAndPermuteStaticGraph";
5674 const char suffix[] =
5675 " Please report this bug to the Tpetra developers.";
5676 ProfilingRegion regionCAP
5677 ("Tpetra::CrsMatrix::copyAndPermuteStaticGraph");
5678
5679 const bool debug = Details::Behavior::debug("CrsGraph");
5680 const bool verbose = Details::Behavior::verbose("CrsGraph");
5681 std::unique_ptr<std::string> prefix;
5682 if (verbose) {
5683 prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5684 std::ostringstream os;
5685 os << *prefix << "Start" << endl;
      // Emit the message; previously it was built and then discarded.
      std::cerr << os.str();
5686 }
5687 const char* const prefix_raw =
5688 verbose ? prefix.get()->c_str() : nullptr;
5689
5690 const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5691 //
5692 // Copy the first numSame row from source to target (this matrix).
5693 // This involves copying rows corresponding to LIDs [0, numSame-1].
5694 //
5695 const map_type& srcRowMap = * (srcMat.getRowMap ());
// rowInds / rowVals are scratch buffers shared by both loops; they only grow.
5696 nonconst_global_inds_host_view_type rowInds;
5697 nonconst_values_host_view_type rowVals;
5698 const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5699 for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5700 // Global ID for the current row index in the source matrix.
5701 // The first numSameIDs GIDs in the two input lists are the
5702 // same, so sourceGID == targetGID in this case.
5703 const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5704 const GO targetGID = sourceGID;
5705
5706 ArrayView<const GO>rowIndsConstView;
5707 ArrayView<const Scalar> rowValsConstView;
5708
5709 if (sourceIsLocallyIndexed) {
5710 const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5711 if (rowLength > static_cast<size_t> (rowInds.size())) {
5712 Kokkos::resize(rowInds,rowLength);
5713 Kokkos::resize(rowVals,rowLength);
5714 }
5715 // Resizing invalidates an Array's views, so we must make new
5716 // ones, even if rowLength hasn't changed.
5717 nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5718 nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5719
5720 // The source matrix is locally indexed, so we have to get a
5721 // copy. Really it's the GIDs that have to be copied (because
5722 // they have to be converted from LIDs).
5723 size_t checkRowLength = 0;
5724 srcMat.getGlobalRowCopy (sourceGID, rowIndsView,
5725 rowValsView, checkRowLength);
5726 if (debug) {
5727 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5728 (rowLength != checkRowLength, std::logic_error, "For "
5729 "global row index " << sourceGID << ", the source "
5730 "matrix's getNumEntriesInGlobalRow returns a row length "
5731 "of " << rowLength << ", but getGlobalRowCopy reports "
5732 "a row length of " << checkRowLength << "." << suffix);
5733 }
5734
5735 // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5736 // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5737 // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5738 // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5739 rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5740 rowIndsView.data(), rowIndsView.extent(0),
5741 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5742 rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5743 reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5744 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5745 // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5746 // KDDKDD UVM TEMPORARY: KokkosView interface
5747 }
5748 else { // source matrix is globally indexed.
5749 global_inds_host_view_type rowIndsView;
5750 values_host_view_type rowValsView;
5751 srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5752 // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5753 // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5754 // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5755 // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5756 rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5757 rowIndsView.data(), rowIndsView.extent(0),
5758 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5759 rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5760 reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5761 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5762 // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5763 // KDDKDD UVM TEMPORARY: KokkosView interface
5764
5765 }
5766
5767 // Applying a permutation to a matrix with a static graph
5768 // means REPLACE-ing entries.
5769 combineGlobalValues(targetGID, rowIndsConstView,
5770 rowValsConstView, REPLACE,
5771 prefix_raw, debug, verbose);
5772 }
5773
5774 if (verbose) {
5775 std::ostringstream os;
5776 os << *prefix << "Do permutes" << endl;
      std::cerr << os.str();
5777 }
5778
5779 const map_type& tgtRowMap = * (this->getRowMap ());
5780 for (size_t p = 0; p < numPermutes; ++p) {
5781 const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
5782 const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
5783
5784 ArrayView<const GO> rowIndsConstView;
5785 ArrayView<const Scalar> rowValsConstView;
5786
5787 if (sourceIsLocallyIndexed) {
5788 const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5789 if (rowLength > static_cast<size_t> (rowInds.size ())) {
5790 Kokkos::resize(rowInds,rowLength);
5791 Kokkos::resize(rowVals,rowLength);
5792 }
5793 // Resizing invalidates an Array's views, so we must make new
5794 // ones, even if rowLength hasn't changed.
5795 nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5796 nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5797
5798 // The source matrix is locally indexed, so we have to get a
5799 // copy. Really it's the GIDs that have to be copied (because
5800 // they have to be converted from LIDs).
5801 size_t checkRowLength = 0;
5802 srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
5803 rowValsView, checkRowLength);
5804 if (debug) {
5805 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5806 (rowLength != checkRowLength, std::logic_error, "For "
5807 "source matrix global row index " << sourceGID << ", "
5808 "getNumEntriesInGlobalRow returns a row length of " <<
5809 rowLength << ", but getGlobalRowCopy a row length of "
5810 << checkRowLength << "." << suffix);
5811 }
5812
5813 // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5814 // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5815 // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5816 // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5817 rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5818 rowIndsView.data(), rowIndsView.extent(0),
5819 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5820 rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5821 reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5822 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5823 // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5824 // KDDKDD UVM TEMPORARY: KokkosView interface
5825 }
5826 else {
5827 global_inds_host_view_type rowIndsView;
5828 values_host_view_type rowValsView;
5829 srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5830 // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5831 // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5832 // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5833 // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5834 rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5835 rowIndsView.data(), rowIndsView.extent(0),
5836 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5837 rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5838 reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5839 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5840 // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5841 // KDDKDD UVM TEMPORARY: KokkosView interface
5842 }
5843
5844 combineGlobalValues(targetGID, rowIndsConstView,
5845 rowValsConstView, REPLACE,
5846 prefix_raw, debug, verbose);
5847 }
5848
5849 if (verbose) {
5850 std::ostringstream os;
5851 os << *prefix << "Done" << endl;
      std::cerr << os.str();
5852 }
5853 }
5854
5855 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5856 void
5860 const size_t numSameIDs,
5861 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs_dv,
5862 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs_dv,
5863 const size_t numPermutes)
5864 {
5866 using Teuchos::Array;
5867 using Teuchos::ArrayView;
5868 using std::endl;
5869 using LO = LocalOrdinal;
5870 using GO = GlobalOrdinal;
5871 const char tfecfFuncName[] = "copyAndPermuteNonStaticGraph";
5872 const char suffix[] =
5873 " Please report this bug to the Tpetra developers.";
5874 ProfilingRegion regionCAP
5875 ("Tpetra::CrsMatrix::copyAndPermuteNonStaticGraph");
5876
5877 const bool debug = Details::Behavior::debug("CrsGraph");
5878 const bool verbose = Details::Behavior::verbose("CrsGraph");
5879 std::unique_ptr<std::string> prefix;
5880 if (verbose) {
5881 prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5882 std::ostringstream os;
5883 os << *prefix << "Start" << endl;
5884 }
5885 const char* const prefix_raw =
5886 verbose ? prefix.get()->c_str() : nullptr;
5887
5888 {
5889 using row_graph_type = RowGraph<LO, GO, Node>;
5890 const row_graph_type& srcGraph = *(srcMat.getGraph());
5891 auto padding =
5892 myGraph_->computeCrsPadding(srcGraph, numSameIDs,
5893 permuteToLIDs_dv, permuteFromLIDs_dv, verbose);
5894 applyCrsPadding(*padding, verbose);
5895 }
5896 const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5897 //
5898 // Copy the first numSame row from source to target (this matrix).
5899 // This involves copying rows corresponding to LIDs [0, numSame-1].
5900 //
5901 const map_type& srcRowMap = * (srcMat.getRowMap ());
5902 const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5903 using gids_type = nonconst_global_inds_host_view_type;
5904 using vals_type = nonconst_values_host_view_type;
5905 gids_type rowInds;
5906 vals_type rowVals;
5907 for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5908 // Global ID for the current row index in the source matrix.
5909 // The first numSameIDs GIDs in the two input lists are the
5910 // same, so sourceGID == targetGID in this case.
5911 const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5912 const GO targetGID = sourceGID;
5913
5914 ArrayView<const GO> rowIndsConstView;
5915 ArrayView<const Scalar> rowValsConstView;
5916
5917 if (sourceIsLocallyIndexed) {
5918
5919 const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5920 if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
5921 Kokkos::resize(rowInds,rowLength);
5922 Kokkos::resize(rowVals,rowLength);
5923 }
5924 // Resizing invalidates an Array's views, so we must make new
5925 // ones, even if rowLength hasn't changed.
5926 gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5927 vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5928
5929 // The source matrix is locally indexed, so we have to get a
5930 // copy. Really it's the GIDs that have to be copied (because
5931 // they have to be converted from LIDs).
5932 size_t checkRowLength = 0;
5933 srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView,
5934 checkRowLength);
5935 if (debug) {
5936 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5937 (rowLength != checkRowLength, std::logic_error, ": For "
5938 "global row index " << sourceGID << ", the source "
5939 "matrix's getNumEntriesInGlobalRow returns a row length "
5940 "of " << rowLength << ", but getGlobalRowCopy reports "
5941 "a row length of " << checkRowLength << "." << suffix);
5942 }
5943 rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
5944 rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
5945 }
5946 else { // source matrix is globally indexed.
5947 global_inds_host_view_type rowIndsView;
5948 values_host_view_type rowValsView;
5949 srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5950
5951 // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5952 // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5953 // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5954 // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5955 rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5956 rowIndsView.data(), rowIndsView.extent(0),
5957 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5958 rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5959 reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5960 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5961 // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5962 // KDDKDD UVM TEMPORARY: KokkosView interface
5963 }
5964
5965 // Combine the data into the target matrix.
5966 insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
5967 rowValsConstView, prefix_raw, debug, verbose);
5968 }
5969
5970 if (verbose) {
5971 std::ostringstream os;
5972 os << *prefix << "Do permutes" << endl;
5973 }
5974 const LO* const permuteFromLIDs = permuteFromLIDs_dv.view_host().data();
5975 const LO* const permuteToLIDs = permuteToLIDs_dv.view_host().data();
5976
5977 const map_type& tgtRowMap = * (this->getRowMap ());
5978 for (size_t p = 0; p < numPermutes; ++p) {
5979 const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
5980 const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
5981
5982 ArrayView<const GO> rowIndsConstView;
5983 ArrayView<const Scalar> rowValsConstView;
5984
5985 if (sourceIsLocallyIndexed) {
5986 const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5987 if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
5988 Kokkos::resize(rowInds,rowLength);
5989 Kokkos::resize(rowVals,rowLength);
5990 }
5991 // Resizing invalidates an Array's views, so we must make new
5992 // ones, even if rowLength hasn't changed.
5993 gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5994 vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5995
5996 // The source matrix is locally indexed, so we have to get a
5997 // copy. Really it's the GIDs that have to be copied (because
5998 // they have to be converted from LIDs).
5999 size_t checkRowLength = 0;
6000 srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
6001 rowValsView, checkRowLength);
6002 if (debug) {
6003 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6004 (rowLength != checkRowLength, std::logic_error, "For "
6005 "source matrix global row index " << sourceGID << ", "
6006 "getNumEntriesInGlobalRow returns a row length of " <<
6007 rowLength << ", but getGlobalRowCopy a row length of "
6008 << checkRowLength << "." << suffix);
6009 }
6010 rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
6011 rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
6012 }
6013 else {
6014 global_inds_host_view_type rowIndsView;
6015 values_host_view_type rowValsView;
6016 srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
6017
6018 // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
6019 // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
6020 // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
6021 // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
6022 rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
6023 rowIndsView.data(), rowIndsView.extent(0),
6024 Teuchos::RCP_DISABLE_NODE_LOOKUP);
6025 rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
6026 reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
6027 Teuchos::RCP_DISABLE_NODE_LOOKUP);
6028 // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
6029 // KDDKDD UVM TEMPORARY: KokkosView interface
6030 }
6031
6032 // Combine the data into the target matrix.
6033 insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
6034 rowValsConstView, prefix_raw, debug, verbose);
6035 }
6036
6037 if (verbose) {
6038 std::ostringstream os;
6039 os << *prefix << "Done" << endl;
6040 }
6041 }
6042
6043 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6044 void
6047 const SrcDistObject& srcObj,
6048 const size_t numSameIDs,
6049 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
6050 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
6051 const CombineMode /*CM*/)
6052 {
6053 using Details::Behavior;
6056 using std::endl;
6057
6058 // Method name string for TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
6059 const char tfecfFuncName[] = "copyAndPermute: ";
6060 ProfilingRegion regionCAP("Tpetra::CrsMatrix::copyAndPermute");
6061
6062 const bool verbose = Behavior::verbose("CrsMatrix");
6063 std::unique_ptr<std::string> prefix;
6064 if (verbose) {
6065 prefix = this->createPrefix("CrsMatrix", "copyAndPermute");
6066 std::ostringstream os;
6067 os << *prefix << endl
6068 << *prefix << " numSameIDs: " << numSameIDs << endl
6069 << *prefix << " numPermute: " << permuteToLIDs.extent(0)
6070 << endl
6071 << *prefix << " "
6072 << dualViewStatusToString (permuteToLIDs, "permuteToLIDs")
6073 << endl
6074 << *prefix << " "
6075 << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs")
6076 << endl
6077 << *prefix << " "
6078 << "isStaticGraph: " << (isStaticGraph() ? "true" : "false")
6079 << endl;
6080 std::cerr << os.str ();
6081 }
6082
6083 const auto numPermute = permuteToLIDs.extent (0);
6084 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6085 (numPermute != permuteFromLIDs.extent (0),
6086 std::invalid_argument, "permuteToLIDs.extent(0) = "
6087 << numPermute << "!= permuteFromLIDs.extent(0) = "
6088 << permuteFromLIDs.extent (0) << ".");
6089
6090 // This dynamic cast should succeed, because we've already tested
6091 // it in checkSizes().
6093 const RMT& srcMat = dynamic_cast<const RMT&> (srcObj);
6094 if (isStaticGraph ()) {
6095 TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
6096 auto permuteToLIDs_h = permuteToLIDs.view_host ();
6097 TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
6098 auto permuteFromLIDs_h = permuteFromLIDs.view_host ();
6099
6100 copyAndPermuteStaticGraph(srcMat, numSameIDs,
6101 permuteToLIDs_h.data(),
6102 permuteFromLIDs_h.data(),
6103 numPermute);
6104 }
6105 else {
6106 copyAndPermuteNonStaticGraph(srcMat, numSameIDs, permuteToLIDs,
6107 permuteFromLIDs, numPermute);
6108 }
6109
6110 if (verbose) {
6111 std::ostringstream os;
6112 os << *prefix << "Done" << endl;
6113 std::cerr << os.str();
6114 }
6115 }
6116
6117 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6118 void
6121 (const SrcDistObject& source,
6122 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6123 Kokkos::DualView<char*, buffer_device_type>& exports,
6124 Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6125 size_t& constantNumPackets)
6126 {
6127 using Details::Behavior;
6130 using Teuchos::outArg;
6131 using Teuchos::REDUCE_MAX;
6132 using Teuchos::reduceAll;
6133 using std::endl;
6134 typedef LocalOrdinal LO;
6135 typedef GlobalOrdinal GO;
6136 const char tfecfFuncName[] = "packAndPrepare: ";
6137 ProfilingRegion regionPAP ("Tpetra::CrsMatrix::packAndPrepare");
6138
6139 const bool debug = Behavior::debug("CrsMatrix");
6140 const bool verbose = Behavior::verbose("CrsMatrix");
6141
6142 // Processes on which the communicator is null should not participate.
6143 Teuchos::RCP<const Teuchos::Comm<int> > pComm = this->getComm ();
6144 if (pComm.is_null ()) {
6145 return;
6146 }
6147 const Teuchos::Comm<int>& comm = *pComm;
6148 const int myRank = comm.getSize ();
6149
6150 std::unique_ptr<std::string> prefix;
6151 if (verbose) {
6152 prefix = this->createPrefix("CrsMatrix", "packAndPrepare");
6153 std::ostringstream os;
6154 os << *prefix << "Start" << endl
6155 << *prefix << " "
6156 << dualViewStatusToString (exportLIDs, "exportLIDs")
6157 << endl
6158 << *prefix << " "
6159 << dualViewStatusToString (exports, "exports")
6160 << endl
6161 << *prefix << " "
6162 << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6163 << endl;
6164 std::cerr << os.str ();
6165 }
6166
6167 // Attempt to cast the source object to CrsMatrix. If successful,
6168 // use the source object's packNew() method to pack its data for
6169 // communication. Otherwise, attempt to cast to RowMatrix; if
6170 // successful, use the source object's pack() method. Otherwise,
6171 // the source object doesn't have the right type.
6172 //
6173 // FIXME (mfh 30 Jun 2013, 11 Sep 2017) We don't even need the
6174 // RowMatrix to have the same Node type. Unfortunately, we don't
6175 // have a way to ask if the RowMatrix is "a RowMatrix with any
6176 // Node type," since RowMatrix doesn't have a base class. A
6177 // hypothetical RowMatrixBase<Scalar, LO, GO> class, which does
6178 // not currently exist, would satisfy this requirement.
6179 //
6180 // Why RowMatrixBase<Scalar, LO, GO>? The source object's Scalar
6181 // type doesn't technically need to match the target object's
6182 // Scalar type, so we could just have RowMatrixBase<LO, GO>. LO
6183 // and GO need not be the same, as long as there is no overflow of
6184 // the indices. However, checking for index overflow is global
6185 // and therefore undesirable.
6186
6187 std::ostringstream msg; // for collecting error messages
6188 int lclBad = 0; // to be set below
6189
6190 using crs_matrix_type = CrsMatrix<Scalar, LO, GO, Node>;
6191 const crs_matrix_type* srcCrsMat =
6192 dynamic_cast<const crs_matrix_type*> (&source);
6193 if (srcCrsMat != nullptr) {
6194 if (verbose) {
6195 std::ostringstream os;
6196 os << *prefix << "Source matrix same (CrsMatrix) type as target; "
6197 "calling packNew" << endl;
6198 std::cerr << os.str ();
6199 }
6200 try {
6201 srcCrsMat->packNew (exportLIDs, exports, numPacketsPerLID,
6202 constantNumPackets);
6203 }
6204 catch (std::exception& e) {
6205 lclBad = 1;
6206 msg << "Proc " << myRank << ": " << e.what () << std::endl;
6207 }
6208 }
6209 else {
6210 using Kokkos::HostSpace;
6211 using Kokkos::subview;
6212 using exports_type = Kokkos::DualView<char*, buffer_device_type>;
6213 using range_type = Kokkos::pair<size_t, size_t>;
6214
6215 if (verbose) {
6216 std::ostringstream os;
6217 os << *prefix << "Source matrix NOT same (CrsMatrix) type as target"
6218 << endl;
6219 std::cerr << os.str ();
6220 }
6221
6222 const row_matrix_type* srcRowMat =
6223 dynamic_cast<const row_matrix_type*> (&source);
6224 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6225 (srcRowMat == nullptr, std::invalid_argument,
6226 "The source object of the Import or Export operation is neither a "
6227 "CrsMatrix (with the same template parameters as the target object), "
6228 "nor a RowMatrix (with the same first four template parameters as the "
6229 "target object).");
6230
6231 // For the RowMatrix case, we need to convert from
6232 // Kokkos::DualView to Teuchos::Array*. This doesn't need to be
6233 // so terribly efficient, since packing a non-CrsMatrix
6234 // RowMatrix for Import/Export into a CrsMatrix is not a
6235 // critical case. Thus, we may allocate Teuchos::Array objects
6236 // here and copy to and from Kokkos::*View.
6237
6238 // View exportLIDs's host data as a Teuchos::ArrayView.
6239 TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
6240 auto exportLIDs_h = exportLIDs.view_host ();
6241 Teuchos::ArrayView<const LO> exportLIDs_av (exportLIDs_h.data (),
6242 exportLIDs_h.size ());
6243
6244 // pack() will allocate exports_a as needed. We'll copy back
6245 // into exports (after (re)allocating exports if needed) below.
6246 Teuchos::Array<char> exports_a;
6247
6248 // View exportLIDs' host data as a Teuchos::ArrayView. We don't
6249 // need to sync, since we're doing write-only access, but we do
6250 // need to mark the DualView as modified on host.
6251
6252 numPacketsPerLID.clear_sync_state (); // write-only access
6253 numPacketsPerLID.modify_host ();
6254 auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
6255 Teuchos::ArrayView<size_t> numPacketsPerLID_av (numPacketsPerLID_h.data (),
6256 numPacketsPerLID_h.size ());
6257
6258 // Invoke RowMatrix's legacy pack() interface, using above
6259 // Teuchos::Array* objects.
6260 try {
6261 srcRowMat->pack (exportLIDs_av, exports_a, numPacketsPerLID_av,
6262 constantNumPackets);
6263 }
6264 catch (std::exception& e) {
6265 lclBad = 1;
6266 msg << "Proc " << myRank << ": " << e.what () << std::endl;
6267 }
6268
6269 // Allocate 'exports', and copy exports_a back into it.
6270 const size_t newAllocSize = static_cast<size_t> (exports_a.size ());
6271 if (static_cast<size_t> (exports.extent (0)) < newAllocSize) {
6272 const std::string oldLabel = exports.view_device().label ();
6273 const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
6274 exports = exports_type (newLabel, newAllocSize);
6275 }
6276 // It's safe to assume that we're working on host anyway, so
6277 // just keep exports sync'd to host.
6278 // ignore current device contents
6279 exports.modify_host();
6280
6281 auto exports_h = exports.view_host ();
6282 auto exports_h_sub = subview (exports_h, range_type (0, newAllocSize));
6283
6284 // Kokkos::deep_copy needs a Kokkos::View input, so turn
6285 // exports_a into a nonowning Kokkos::View first before copying.
6286 typedef typename exports_type::t_host::execution_space HES;
6287 typedef Kokkos::Device<HES, HostSpace> host_device_type;
6288 Kokkos::View<const char*, host_device_type>
6289 exports_a_kv (exports_a.getRawPtr (), newAllocSize);
6290 // DEEP_COPY REVIEW - NOT TESTED
6291 Kokkos::deep_copy (exports_h_sub, exports_a_kv);
6292 }
6293
6294 if (debug) {
6295 int gblBad = 0; // output argument; to be set below
6296 reduceAll<int, int> (comm, REDUCE_MAX, lclBad, outArg (gblBad));
6297 if (gblBad != 0) {
6298 Tpetra::Details::gathervPrint (std::cerr, msg.str (), comm);
6299 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6300 (true, std::logic_error, "packNew() or pack() threw an exception on "
6301 "one or more participating processes.");
6302 }
6303 }
6304 else {
6305 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6306 (lclBad != 0, std::logic_error, "packNew threw an exception on one "
6307 "or more participating processes. Here is this process' error "
6308 "message: " << msg.str ());
6309 }
6310
6311 if (verbose) {
6312 std::ostringstream os;
6313 os << *prefix << "packAndPrepare: Done!" << endl
6314 << *prefix << " "
6315 << dualViewStatusToString (exportLIDs, "exportLIDs")
6316 << endl
6317 << *prefix << " "
6318 << dualViewStatusToString (exports, "exports")
6319 << endl
6320 << *prefix << " "
6321 << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6322 << endl;
6323 std::cerr << os.str ();
6324 }
6325 }
6326
6327 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6328 size_t
6330 packRow (char exports[],
6331 const size_t offset,
6332 const size_t numEnt,
6333 const GlobalOrdinal gidsIn[],
6334 const impl_scalar_type valsIn[],
6335 const size_t numBytesPerValue) const
6336 {
6337 using Kokkos::View;
6338 using Kokkos::subview;
6339 using Tpetra::Details::PackTraits;
6340 typedef LocalOrdinal LO;
6341 typedef GlobalOrdinal GO;
6342 typedef impl_scalar_type ST;
6343
6344 if (numEnt == 0) {
6345 // Empty rows always take zero bytes, to ensure sparsity.
6346 return 0;
6347 }
6348
6349 const GO gid = 0; // packValueCount wants this
6350 const LO numEntLO = static_cast<size_t> (numEnt);
6351
6352 const size_t numEntBeg = offset;
6353 const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
6354 const size_t gidsBeg = numEntBeg + numEntLen;
6355 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
6356 const size_t valsBeg = gidsBeg + gidsLen;
6357 const size_t valsLen = numEnt * numBytesPerValue;
6358
6359 char* const numEntOut = exports + numEntBeg;
6360 char* const gidsOut = exports + gidsBeg;
6361 char* const valsOut = exports + valsBeg;
6362
6363 size_t numBytesOut = 0;
6364 int errorCode = 0;
6365 numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
6366
6367 {
6368 Kokkos::pair<int, size_t> p;
6369 p = PackTraits<GO>::packArray (gidsOut, gidsIn, numEnt);
6370 errorCode += p.first;
6371 numBytesOut += p.second;
6372
6373 p = PackTraits<ST>::packArray (valsOut, valsIn, numEnt);
6374 errorCode += p.first;
6375 numBytesOut += p.second;
6376 }
6377
6378 const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
6379 TEUCHOS_TEST_FOR_EXCEPTION
6380 (numBytesOut != expectedNumBytes, std::logic_error, "packRow: "
6381 "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
6382 << expectedNumBytes << ".");
6383 TEUCHOS_TEST_FOR_EXCEPTION
6384 (errorCode != 0, std::runtime_error, "packRow: "
6385 "PackTraits::packArray returned a nonzero error code");
6386
6387 return numBytesOut;
6388 }
6389
6390 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6391 size_t
6393 unpackRow (GlobalOrdinal gidsOut[],
6394 impl_scalar_type valsOut[],
6395 const char imports[],
6396 const size_t offset,
6397 const size_t numBytes,
6398 const size_t numEnt,
6399 const size_t numBytesPerValue)
6400 {
6401 using Kokkos::View;
6402 using Kokkos::subview;
6403 using Tpetra::Details::PackTraits;
6404 typedef LocalOrdinal LO;
6405 typedef GlobalOrdinal GO;
6406 typedef impl_scalar_type ST;
6407
6408 Details::ProfilingRegion region_upack_row(
6409 "Tpetra::CrsMatrix::unpackRow",
6410 "Import/Export"
6411 );
6412
6413 if (numBytes == 0) {
6414 // Rows with zero bytes should always have zero entries.
6415 if (numEnt != 0) {
6416 const int myRank = this->getMap ()->getComm ()->getRank ();
6417 TEUCHOS_TEST_FOR_EXCEPTION
6418 (true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
6419 "unpackRow: The number of bytes to unpack numBytes=0, but the "
6420 "number of entries to unpack (as reported by numPacketsPerLID) "
6421 "for this row numEnt=" << numEnt << " != 0.");
6422 }
6423 return 0;
6424 }
6425
6426 if (numEnt == 0 && numBytes != 0) {
6427 const int myRank = this->getMap ()->getComm ()->getRank ();
6428 TEUCHOS_TEST_FOR_EXCEPTION
6429 (true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
6430 "unpackRow: The number of entries to unpack (as reported by "
6431 "numPacketsPerLID) numEnt=0, but the number of bytes to unpack "
6432 "numBytes=" << numBytes << " != 0.");
6433 }
6434
6435 const GO gid = 0; // packValueCount wants this
6436 const LO lid = 0; // packValueCount wants this
6437
6438 const size_t numEntBeg = offset;
6439 const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
6440 const size_t gidsBeg = numEntBeg + numEntLen;
6441 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
6442 const size_t valsBeg = gidsBeg + gidsLen;
6443 const size_t valsLen = numEnt * numBytesPerValue;
6444
6445 const char* const numEntIn = imports + numEntBeg;
6446 const char* const gidsIn = imports + gidsBeg;
6447 const char* const valsIn = imports + valsBeg;
6448
6449 size_t numBytesOut = 0;
6450 int errorCode = 0;
6451 LO numEntOut;
6452 numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
6453 if (static_cast<size_t> (numEntOut) != numEnt ||
6454 numEntOut == static_cast<LO> (0)) {
6455 const int myRank = this->getMap ()->getComm ()->getRank ();
6456 std::ostringstream os;
6457 os << "(Proc " << myRank << ") CrsMatrix::unpackRow: ";
6458 bool firstErrorCondition = false;
6459 if (static_cast<size_t> (numEntOut) != numEnt) {
6460 os << "Number of entries from numPacketsPerLID numEnt=" << numEnt
6461 << " does not equal number of entries unpacked from imports "
6462 "buffer numEntOut=" << numEntOut << ".";
6463 firstErrorCondition = true;
6464 }
6465 if (numEntOut == static_cast<LO> (0)) {
6466 if (firstErrorCondition) {
6467 os << " Also, ";
6468 }
6469 os << "Number of entries unpacked from imports buffer numEntOut=0, "
6470 "but number of bytes to unpack for this row numBytes=" << numBytes
6471 << " != 0. This should never happen, since packRow should only "
6472 "ever pack rows with a nonzero number of entries. In this case, "
6473 "the number of entries from numPacketsPerLID is numEnt=" << numEnt
6474 << ".";
6475 }
6476 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, os.str ());
6477 }
6478
6479 {
6480 Kokkos::pair<int, size_t> p;
6481 p = PackTraits<GO>::unpackArray (gidsOut, gidsIn, numEnt);
6482 errorCode += p.first;
6483 numBytesOut += p.second;
6484
6485 p = PackTraits<ST>::unpackArray (valsOut, valsIn, numEnt);
6486 errorCode += p.first;
6487 numBytesOut += p.second;
6488 }
6489
6490 TEUCHOS_TEST_FOR_EXCEPTION
6491 (numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = "
6492 << numBytesOut << " != numBytes = " << numBytes << ".");
6493
6494 const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
6495 TEUCHOS_TEST_FOR_EXCEPTION
6496 (numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
6497 "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
6498 << expectedNumBytes << ".");
6499
6500 TEUCHOS_TEST_FOR_EXCEPTION
6501 (errorCode != 0, std::runtime_error, "unpackRow: "
6502 "PackTraits::unpackArray returned a nonzero error code");
6503
6504 return numBytesOut;
6505 }
6506
6507 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6508 void
6510 allocatePackSpaceNew (Kokkos::DualView<char*, buffer_device_type>& exports,
6511 size_t& totalNumEntries,
6512 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs) const
6513 {
6514 using Details::Behavior;
6516 using std::endl;
6517 typedef impl_scalar_type IST;
6518 typedef LocalOrdinal LO;
6519 typedef GlobalOrdinal GO;
6520 //const char tfecfFuncName[] = "allocatePackSpaceNew: ";
6521
6522 // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
6523 // output to std::cerr on every MPI process. This is unwise for
6524 // runs with large numbers of MPI processes.
6525 const bool verbose = Behavior::verbose("CrsMatrix");
6526 std::unique_ptr<std::string> prefix;
6527 if (verbose) {
6528 prefix = this->createPrefix("CrsMatrix", "allocatePackSpaceNew");
6529 std::ostringstream os;
6530 os << *prefix << "Before:"
6531 << endl
6532 << *prefix << " "
6533 << dualViewStatusToString (exports, "exports")
6534 << endl
6535 << *prefix << " "
6536 << dualViewStatusToString (exportLIDs, "exportLIDs")
6537 << endl;
6538 std::cerr << os.str ();
6539 }
6540
6541 // The number of export LIDs must fit in LocalOrdinal, assuming
6542 // that the LIDs are distinct and valid on the calling process.
6543 const LO numExportLIDs = static_cast<LO> (exportLIDs.extent (0));
6544
6545 TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
6546 auto exportLIDs_h = exportLIDs.view_host ();
6547
6548 // Count the total number of matrix entries to send.
6549 totalNumEntries = 0;
6550 for (LO i = 0; i < numExportLIDs; ++i) {
6551 const LO lclRow = exportLIDs_h[i];
6552 size_t curNumEntries = this->getNumEntriesInLocalRow (lclRow);
6553 // FIXME (mfh 25 Jan 2015) We should actually report invalid row
6554 // indices as an error. Just consider them nonowned for now.
6555 if (curNumEntries == Teuchos::OrdinalTraits<size_t>::invalid ()) {
6556 curNumEntries = 0;
6557 }
6558 totalNumEntries += curNumEntries;
6559 }
6560
6561 // FIXME (mfh 24 Feb 2013, 24 Mar 2017) This code is only correct
6562 // if sizeof(IST) is a meaningful representation of the amount of
6563 // data in a Scalar instance. (LO and GO are always built-in
6564 // integer types.)
6565 //
6566 // Allocate the exports array. It does NOT need padding for
6567 // alignment, since we use memcpy to write to / read from send /
6568 // receive buffers.
6569 const size_t allocSize =
6570 static_cast<size_t> (numExportLIDs) * sizeof (LO) +
6571 totalNumEntries * (sizeof (IST) + sizeof (GO));
6572 if (static_cast<size_t> (exports.extent (0)) < allocSize) {
6573 using exports_type = Kokkos::DualView<char*, buffer_device_type>;
6574
6575 const std::string oldLabel = exports.view_device().label ();
6576 const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
6577 exports = exports_type (newLabel, allocSize);
6578 }
6579
6580 if (verbose) {
6581 std::ostringstream os;
6582 os << *prefix << "After:"
6583 << endl
6584 << *prefix << " "
6585 << dualViewStatusToString (exports, "exports")
6586 << endl
6587 << *prefix << " "
6588 << dualViewStatusToString (exportLIDs, "exportLIDs")
6589 << endl;
6590 std::cerr << os.str ();
6591 }
6592 }
6593
6594 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6595 void
6597 packNew (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6598 Kokkos::DualView<char*, buffer_device_type>& exports,
6599 const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
6600 size_t& constantNumPackets) const
6601 {
6602 // The call to packNew in packAndPrepare catches and handles any exceptions.
6603 Details::ProfilingRegion region_pack_new("Tpetra::CrsMatrix::packNew", "Import/Export");
6604 if (this->isStaticGraph ()) {
6606 packCrsMatrixNew (*this, exports, numPacketsPerLID, exportLIDs,
6607 constantNumPackets);
6608 }
6609 else {
6610 this->packNonStaticNew (exportLIDs, exports, numPacketsPerLID,
6611 constantNumPackets);
6612 }
6613 }
6614
// Pack the rows named by exportLIDs into the byte buffer 'exports', on host.
// packNew calls this method when isStaticGraph() is false.
//
// exportLIDs:         local row indices to pack; read through its host view.
// exports:            pack buffer; sized by allocatePackSpaceNew, then marked
//                     modified on host and filled through its host view.
// numPacketsPerLID:   output; entry i receives the number of bytes packed for
//                     exportLIDs[i], or 0 if that row has no entries.
// constantNumPackets: output; always set to 0 here.
//
// Raises std::invalid_argument if exportLIDs and numPacketsPerLID differ in
// length, and std::logic_error if a row's packed bytes would extend past the
// end of 'exports'.
6615 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6616 void
6618 packNonStaticNew (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6619 Kokkos::DualView<char*, buffer_device_type>& exports,
6620 const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
6621 size_t& constantNumPackets) const
6622 {
6623 using Details::Behavior;
6625 using Details::PackTraits;
6627 using Kokkos::View;
6628 using std::endl;
6629 using LO = LocalOrdinal;
6630 using GO = GlobalOrdinal;
6631 using ST = impl_scalar_type;
6632 const char tfecfFuncName[] = "packNonStaticNew: ";
6633
6634 const bool verbose = Behavior::verbose("CrsMatrix");
6635 std::unique_ptr<std::string> prefix;
6636 if (verbose) {
6637 prefix = this->createPrefix("CrsMatrix", "packNonStaticNew");
6638 std::ostringstream os;
6639 os << *prefix << "Start" << endl;
6640 std::cerr << os.str ();
6641 }
6642
// There must be exactly one packet count slot per row to pack.
6643 const size_t numExportLIDs = static_cast<size_t> (exportLIDs.extent (0));
6644 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6645 (numExportLIDs != static_cast<size_t> (numPacketsPerLID.extent (0)),
6646 std::invalid_argument, "exportLIDs.size() = " << numExportLIDs
6647 << " != numPacketsPerLID.size() = " << numPacketsPerLID.extent (0)
6648 << ".");
6649
6650 // Setting this to zero tells the caller to expect a possibly
6651 // different ("nonconstant") number of packets per local index
6652 // (i.e., a possibly different number of entries per row).
6653 constantNumPackets = 0;
6654
6655 // The pack buffer 'exports' enters this method possibly
6656 // unallocated. Do the first two parts of "Count, allocate, fill,
6657 // compute."
6658 size_t totalNumEntries = 0;
6659 this->allocatePackSpaceNew (exports, totalNumEntries, exportLIDs);
6660 const size_t bufSize = static_cast<size_t> (exports.extent (0));
6661
6662 // Write-only host access
6663 exports.clear_sync_state();
6664 exports.modify_host();
6665 auto exports_h = exports.view_host ();
6666 if (verbose) {
6667 std::ostringstream os;
6668 os << *prefix << "After marking exports as modified on host, "
6669 << dualViewStatusToString (exports, "exports") << endl;
6670 std::cerr << os.str ();
6671 }
6672
6673 // Read-only host access
6674 auto exportLIDs_h = exportLIDs.view_host ();
6675
6676 // Write-only host access
// numPacketsPerLID is passed by const reference, so const_cast is needed
// to reset its sync state and mark it modified on host.
6677 const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->clear_sync_state();
6678 const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->modify_host();
6679 auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
6680
6681 // Compute the number of "packets" (in this case, bytes) per
6682 // export LID (in this case, local index of the row to send), and
6683 // actually pack the data.
6684 auto maxRowNumEnt = this->getLocalMaxNumRowEntries();
6685
6686
6687 // Temporary buffer for global column indices.
// Allocated once, with room for the longest local row, and reused for
// every row in the loop below.
6688 typename global_inds_host_view_type::non_const_type gidsIn_k;
6689 if (this->isLocallyIndexed()) { // Need storage for Global IDs
6690 gidsIn_k =
6691 typename global_inds_host_view_type::non_const_type("packGids",
6692 maxRowNumEnt);
6693 }
6694
6695 size_t offset = 0; // current index into 'exports' array.
6696 for (size_t i = 0; i < numExportLIDs; ++i) {
6697 const LO lclRow = exportLIDs_h[i];
6698
6699 size_t numBytes = 0;
6700 size_t numEnt = this->getNumEntriesInLocalRow (lclRow);
6701
6702 // Only pack this row's data if it has a nonzero number of
6703 // entries. We can do this because receiving processes get the
6704 // number of packets, and will know that zero packets means zero
6705 // entries.
6706 if (numEnt == 0) {
6707 numPacketsPerLID_h[i] = 0;
6708 continue;
6709 }
6710
6711 if (this->isLocallyIndexed ()) {
6712 typename global_inds_host_view_type::non_const_type gidsIn;
6713 values_host_view_type valsIn;
6714 // If the matrix is locally indexed on the calling process, we
6715 // have to use its column Map (which it _must_ have in this
6716 // case) to convert to global indices.
6717 local_inds_host_view_type lidsIn;
6718 this->getLocalRowView (lclRow, lidsIn, valsIn);
6719 const map_type& colMap = * (this->getColMap ());
6720 for (size_t k = 0; k < numEnt; ++k) {
6721 gidsIn_k[k] = colMap.getGlobalElement (lidsIn[k]);
6722 }
// View only the first numEnt converted indices of the scratch buffer.
6723 gidsIn = Kokkos::subview(gidsIn_k, Kokkos::make_pair(GO(0),GO(numEnt)));
6724
// valsIn[0] is valid here because numEnt > 0 (empty rows were skipped
// above).
6725 const size_t numBytesPerValue =
6726 PackTraits<ST>::packValueCount (valsIn[0]);
6727 numBytes = this->packRow (exports_h.data (), offset, numEnt,
6728 gidsIn.data (), valsIn.data (),
6729 numBytesPerValue);
6730 }
6731 else if (this->isGloballyIndexed ()) {
6732 global_inds_host_view_type gidsIn;
6733 values_host_view_type valsIn;
6734 // If the matrix is globally indexed on the calling process,
6735 // then we can use the column indices directly. However, we
6736 // have to get the global row index. The calling process must
6737 // have a row Map, since otherwise it shouldn't be participating
6738 // in packing operations.
6739 const map_type& rowMap = * (this->getRowMap ());
6740 const GO gblRow = rowMap.getGlobalElement (lclRow);
6741 this->getGlobalRowView (gblRow, gidsIn, valsIn);
6742
6743 const size_t numBytesPerValue =
6744 PackTraits<ST>::packValueCount (valsIn[0]);
6745 numBytes = this->packRow (exports_h.data (), offset, numEnt,
6746 gidsIn.data (), valsIn.data (),
6747 numBytesPerValue);
6748 }
6749 // mfh 11 Sep 2017: Currently, if the matrix is neither globally
6750 // nor locally indexed, then it has no entries. Therefore,
6751 // there is nothing to pack. No worries!
6752
// NOTE(review): this bounds check runs after packRow has returned for
// row i, so it reports an overrun of 'exports' rather than preventing
// one -- confirm whether packRow does its own bounds handling.
6753 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6754 (offset > bufSize || offset + numBytes > bufSize, std::logic_error,
6755 "First invalid offset into 'exports' pack buffer at index i = " << i
6756 << ". exportLIDs_h[i]: " << exportLIDs_h[i] << ", bufSize: " <<
6757 bufSize << ", offset: " << offset << ", numBytes: " << numBytes <<
6758 ".");
6759 // numPacketsPerLID_h[i] is the number of "packets" in the
6760 // current local row i. Packet=char (really "byte") so use the
6761 // number of bytes of the packed data for that row.
6762 numPacketsPerLID_h[i] = numBytes;
6763 offset += numBytes;
6764 }
6765
6766 if (verbose) {
6767 std::ostringstream os;
6768 os << *prefix << "Tpetra::CrsMatrix::packNonStaticNew: After:" << endl
6769 << *prefix << " "
6770 << dualViewStatusToString (exports, "exports")
6771 << endl
6772 << *prefix << " "
6773 << dualViewStatusToString (exportLIDs, "exportLIDs")
6774 << endl;
6775 std::cerr << os.str ();
6776 }
6777 }
6778
// Combine numEnt (global column index, value) pairs, given as raw arrays,
// into the row with local index lclRow.
//
// The local row index is converted to a global row index through
// myGraph_->rowMap_, the raw arrays are wrapped in Teuchos::ArrayView
// objects, and the work is forwarded to combineGlobalValues.
//
// lclRow:   local index of the target row.
// numEnt:   number of entries in vals and cols.
// vals:     values to combine (impl_scalar_type; viewed as Scalar below).
// cols:     global column indices, one per value.
// combMode: combine mode, passed through to combineGlobalValues.
// prefix, debug, verbose: passed through to combineGlobalValues.
//
// Returns numEnt. Errors are reported by exceptions from
// combineGlobalValues, not through the return value.
6779 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6780 LocalOrdinal
6782 combineGlobalValuesRaw(const LocalOrdinal lclRow,
6783 const LocalOrdinal numEnt,
6784 const impl_scalar_type vals[],
6785 const GlobalOrdinal cols[],
6786 const Tpetra::CombineMode combMode,
6787 const char* const prefix,
6788 const bool debug,
6789 const bool verbose)
6790 {
6791 using GO = GlobalOrdinal;
6792
6793 // mfh 23 Mar 2017: This branch is not thread safe in a debug
6794 // build, due to use of Teuchos::ArrayView; see #229.
6795 const GO gblRow = myGraph_->rowMap_->getGlobalElement(lclRow);
// An empty row is wrapped as (nullptr, 0) instead of passing the
// caller's pointer through.
6796 Teuchos::ArrayView<const GO> cols_av
6797 (numEnt == 0 ? nullptr : cols, numEnt);
// NOTE(review): the reinterpret_cast presumes impl_scalar_type and Scalar
// have the same memory layout -- confirm.
6798 Teuchos::ArrayView<const Scalar> vals_av
6799 (numEnt == 0 ? nullptr : reinterpret_cast<const Scalar*> (vals), numEnt);
6800
6801 // FIXME (mfh 23 Mar 2017) This is a work-around for less common
6802 // combine modes. combineGlobalValues throws on error; it does
6803 // not return an error code. Thus, if it returns, it succeeded.
6804 combineGlobalValues(gblRow, cols_av, vals_av, combMode,
6805 prefix, debug, verbose);
6806 return numEnt;
6807 }
6808
// Combine the given (column index, value) pairs into global row
// globalRowIndex, according to combineMode.
//
// Static graph (isStaticGraph() is true):
//   ADD     -> sumIntoGlobalValues
//   REPLACE -> replaceGlobalValues
//   ABSMAX  -> transformGlobalValues with Details::AbsMax<Scalar>
//   INSERT  -> raises std::invalid_argument
//   other   -> raises std::logic_error
//
// Dynamic graph:
//   ADD, INSERT     -> insertGlobalValuesFilteredChecked
//   ABSMAX, REPLACE -> raises std::logic_error (not yet implemented)
//   other           -> raises std::logic_error
//
// prefix, debug and verbose are only used on the dynamic-graph insert
// path, where they are forwarded to insertGlobalValuesFilteredChecked.
6809 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6810 void
6813 const GlobalOrdinal globalRowIndex,
6814 const Teuchos::ArrayView<const GlobalOrdinal>& columnIndices,
6815 const Teuchos::ArrayView<const Scalar>& values,
6816 const Tpetra::CombineMode combineMode,
6817 const char* const prefix,
6818 const bool debug,
6819 const bool verbose)
6820 {
6821 const char tfecfFuncName[] = "combineGlobalValues: ";
6822
6823 if (isStaticGraph ()) {
6824 // INSERT doesn't make sense for a static graph, since you
6825 // aren't allowed to change the structure of the graph.
6826 // However, all the other combine modes work.
6827 if (combineMode == ADD) {
6828 sumIntoGlobalValues (globalRowIndex, columnIndices, values);
6829 }
6830 else if (combineMode == REPLACE) {
6831 replaceGlobalValues (globalRowIndex, columnIndices, values);
6832 }
6833 else if (combineMode == ABSMAX) {
6834 using ::Tpetra::Details::AbsMax;
6835 AbsMax<Scalar> f;
6836 this->template transformGlobalValues<AbsMax<Scalar> > (globalRowIndex,
6837 columnIndices,
6838 values, f);
6839 }
6840 else if (combineMode == INSERT) {
// The tested condition is always true in this branch; the macro is used
// here only to raise the exception with the class/function prefix.
6841 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6842 (isStaticGraph() && combineMode == INSERT,
6843 std::invalid_argument, "INSERT combine mode is forbidden "
6844 "if the matrix has a static (const) graph (i.e., was "
6845 "constructed with the CrsMatrix constructor that takes a "
6846 "const CrsGraph pointer).");
6847 }
6848 else {
6849 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6850 (true, std::logic_error, "Invalid combine mode; should "
6851 "never get here! "
6852 "Please report this bug to the Tpetra developers.");
6853 }
6854 }
6855 else { // The matrix has a dynamic graph.
6856 if (combineMode == ADD || combineMode == INSERT) {
6857 // For a dynamic graph, all incoming column indices are
6858 // inserted into the target graph. Duplicate indices will
6859 // have their values summed. In this context, ADD and INSERT
6860 // are equivalent. We need to call insertGlobalValues()
6861 // anyway if the column indices don't yet exist in this row,
6862 // so we just call insertGlobalValues() for both cases.
6863 insertGlobalValuesFilteredChecked(globalRowIndex,
6864 columnIndices, values, prefix, debug, verbose);
6865 }
6866 // FIXME (mfh 14 Mar 2012):
6867 //
6868 // Implementing ABSMAX or REPLACE for a dynamic graph would
6869 // require modifying assembly to attach a possibly different
6870 // combine mode to each inserted (i, j, A_ij) entry. For
6871 // example, consider two different Export operations to the same
6872 // target CrsMatrix, the first with ABSMAX combine mode and the
6873 // second with REPLACE. This isn't a common use case, so we
6874 // won't mess with it for now.
6875 else if (combineMode == ABSMAX) {
6876 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6877 ! isStaticGraph () && combineMode == ABSMAX, std::logic_error,
6878 "ABSMAX combine mode when the matrix has a dynamic graph is not yet "
6879 "implemented.");
6880 }
6881 else if (combineMode == REPLACE) {
6882 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6883 ! isStaticGraph () && combineMode == REPLACE, std::logic_error,
6884 "REPLACE combine mode when the matrix has a dynamic graph is not yet "
6885 "implemented.");
6886 }
6887 else {
6888 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6889 true, std::logic_error, "Should never get here! Please report this "
6890 "bug to the Tpetra developers.");
6891 }
6892 }
6893 }
6894
// Unpack received data from 'imports' and combine it into this matrix.
//
// importLIDs:         local indices of the rows that receive data.
// imports:            packed data buffer.
// numPacketsPerLID:   number of packets for each entry of importLIDs.
// constantNumPackets: forwarded unchanged to unpackAndCombineImpl.
// combineMode:        how to combine incoming values; ZERO means do nothing.
//
// In debug mode (Behavior::debug("CrsMatrix")), the combine mode and the
// lengths of importLIDs and numPacketsPerLID are validated first
// (std::invalid_argument on failure). Also in debug mode, any std::exception
// from unpackAndCombineImpl is caught, the failure status is combined across
// the communicator with reduceAll, and if any process failed, the collected
// messages are raised as std::logic_error. Outside debug mode,
// unpackAndCombineImpl is called directly and its exceptions propagate.
6895 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6896 void
6899 (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
6900 Kokkos::DualView<char*, buffer_device_type> imports,
6901 Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6902 const size_t constantNumPackets,
6903 const CombineMode combineMode)
6904 {
6905 using Details::Behavior;
6908 using std::endl;
6909 const char tfecfFuncName[] = "unpackAndCombine: ";
6910 ProfilingRegion regionUAC ("Tpetra::CrsMatrix::unpackAndCombine");
6911
6912 const bool debug = Behavior::debug("CrsMatrix");
6913 const bool verbose = Behavior::verbose("CrsMatrix");
6914 constexpr int numValidModes = 5;
6915 const CombineMode validModes[numValidModes] =
6917 const char* validModeNames[numValidModes] =
6918 {"ADD", "REPLACE", "ABSMAX", "INSERT", "ZERO"};
6919
6920 std::unique_ptr<std::string> prefix;
6921 if (verbose) {
6922 prefix = this->createPrefix("CrsMatrix", "unpackAndCombine");
6923 std::ostringstream os;
6924 os << *prefix << "Start:" << endl
6925 << *prefix << " "
6926 << dualViewStatusToString (importLIDs, "importLIDs")
6927 << endl
6928 << *prefix << " "
6929 << dualViewStatusToString (imports, "imports")
6930 << endl
6931 << *prefix << " "
6932 << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6933 << endl
6934 << *prefix << " constantNumPackets: " << constantNumPackets
6935 << endl
6936 << *prefix << " combineMode: " << combineModeToString (combineMode)
6937 << endl;
6938 std::cerr << os.str ();
6939 }
6940
// Argument validation is only done in debug mode.
6941 if (debug) {
6942 if (std::find (validModes, validModes+numValidModes, combineMode) ==
6943 validModes+numValidModes) {
6944 std::ostringstream os;
6945 os << "Invalid combine mode. Valid modes are {";
6946 for (int k = 0; k < numValidModes; ++k) {
6947 os << validModeNames[k];
6948 if (k < numValidModes - 1) {
6949 os << ", ";
6950 }
6951 }
6952 os << "}.";
6953 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6954 (true, std::invalid_argument, os.str ());
6955 }
6956 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6957 (importLIDs.extent(0) != numPacketsPerLID.extent(0),
6958 std::invalid_argument, "importLIDs.extent(0)="
6959 << importLIDs.extent(0)
6960 << " != numPacketsPerLID.extent(0)="
6961 << numPacketsPerLID.extent(0) << ".");
6962 }
6963
6964 if (combineMode == ZERO) {
6965 return; // nothing to do
6966 }
6967
6968 if (debug) {
6969 using Teuchos::reduceAll;
6970 std::unique_ptr<std::ostringstream> msg (new std::ostringstream ());
6971 int lclBad = 0;
// Catch locally so that every process still reaches the reduceAll below.
6972 try {
6973 unpackAndCombineImpl(importLIDs, imports, numPacketsPerLID,
6974 constantNumPackets, combineMode,
6975 verbose);
6976 } catch (std::exception& e) {
6977 lclBad = 1;
6978 *msg << e.what ();
6979 }
// gblBad becomes nonzero on all processes if any process caught an
// exception.
6980 int gblBad = 0;
6981 const Teuchos::Comm<int>& comm = * (this->getComm ());
6982 reduceAll<int, int> (comm, Teuchos::REDUCE_MAX,
6983 lclBad, Teuchos::outArg (gblBad));
6984 if (gblBad != 0) {
6985 // mfh 22 Oct 2017: 'prefix' might be null, since it is only
6986 // initialized when verbose is true. Thus, we get the process
6987 // rank again here. This is an error message, so the small
6988 // run-time cost doesn't matter. See #1887.
6989 std::ostringstream os;
6990 os << "Proc " << comm.getRank () << ": " << msg->str () << endl;
// Reuse 'msg' as the destination stream for the gathered messages.
6991 msg = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
6992 ::Tpetra::Details::gathervPrint (*msg, os.str (), comm);
6993 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6994 (true, std::logic_error, std::endl << "unpackAndCombineImpl "
6995 "threw an exception on one or more participating processes: "
6996 << endl << msg->str ());
6997 }
6998 }
6999 else {
7000 unpackAndCombineImpl(importLIDs, imports, numPacketsPerLID,
7001 constantNumPackets, combineMode,
7002 verbose);
7003 }
7004
7005 if (verbose) {
7006 std::ostringstream os;
7007 os << *prefix << "Done!" << endl
7008 << *prefix << " "
7009 << dualViewStatusToString (importLIDs, "importLIDs")
7010 << endl
7011 << *prefix << " "
7012 << dualViewStatusToString (imports, "imports")
7013 << endl
7014 << *prefix << " "
7015 << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
7016 << endl;
7017 std::cerr << os.str ();
7018 }
7019 }
7020
// Implementation of unpackAndCombine: choose the unpack path based on
// whether the graph is static.
//
// Static graph:  calls Details::unpackCrsMatrixAndCombineNew.
// Dynamic graph: computes padding with
//                myGraph_->computePaddingForCrsMatrixUnpack, applies it with
//                applyCrsPadding, then calls unpackAndCombineImplNonStatic.
//
// A std::exception thrown while computing the padding is re-raised as
// std::runtime_error, with the message prefixed by the calling process's
// rank (-1 if the row Map or its communicator is null).
//
// verbose: if true, progress messages are written to std::cerr.
7021 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7022 void
7025 const Kokkos::DualView<const local_ordinal_type*,
7026 buffer_device_type>& importLIDs,
7027 Kokkos::DualView<char*, buffer_device_type> imports,
7028 Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
7029 const size_t constantNumPackets,
7030 const CombineMode combineMode,
7031 const bool verbose)
7032 {
7033 Details::ProfilingRegion region_unpack_and_combine_impl(
7034 "Tpetra::CrsMatrix::unpackAndCombineImpl",
7035 "Import/Export"
7036 );
7037 using std::endl;
7038 const char tfecfFuncName[] = "unpackAndCombineImpl";
7039 std::unique_ptr<std::string> prefix;
7040 if (verbose) {
7041 prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
7042 std::ostringstream os;
7043 os << *prefix << "isStaticGraph(): "
7044 << (isStaticGraph() ? "true" : "false")
7045 << ", importLIDs.extent(0): "
7046 << importLIDs.extent(0)
7047 << ", imports.extent(0): "
7048 << imports.extent(0)
7049 << ", numPacketsPerLID.extent(0): "
7050 << numPacketsPerLID.extent(0)
7051 << endl;
7052 std::cerr << os.str();
7053 }
7054
7055 if (isStaticGraph ()) {
7056 using Details::unpackCrsMatrixAndCombineNew;
7057 unpackCrsMatrixAndCombineNew(*this, imports, numPacketsPerLID,
7058 importLIDs, constantNumPackets,
7059 combineMode);
7060 }
7061 else {
// The inner scope limits the lifetime of 'padding' to the padding step.
7062 {
7063 using padding_type = typename crs_graph_type::padding_type;
7064 std::unique_ptr<padding_type> padding;
7065 try {
7066 padding = myGraph_->computePaddingForCrsMatrixUnpack(
7067 importLIDs, imports, numPacketsPerLID, verbose);
7068 }
7069 catch (std::exception& e) {
// Look up the rank defensively: fall back to -1 if the row Map or its
// communicator is null.
7070 const auto rowMap = getRowMap();
7071 const auto comm = rowMap.is_null() ? Teuchos::null :
7072 rowMap->getComm();
7073 const int myRank = comm.is_null() ? -1 : comm->getRank();
7074 TEUCHOS_TEST_FOR_EXCEPTION
7075 (true, std::runtime_error, "Proc " << myRank << ": "
7076 "Tpetra::CrsGraph::computePaddingForCrsMatrixUnpack "
7077 "threw an exception: " << e.what());
7078 }
7079 if (verbose) {
7080 std::ostringstream os;
7081 os << *prefix << "Call applyCrsPadding" << endl;
7082 std::cerr << os.str();
7083 }
7084 applyCrsPadding(*padding, verbose);
7085 }
7086 if (verbose) {
7087 std::ostringstream os;
7088 os << *prefix << "Call unpackAndCombineImplNonStatic" << endl;
7089 std::cerr << os.str();
7090 }
7091 unpackAndCombineImplNonStatic(importLIDs, imports,
7092 numPacketsPerLID,
7093 constantNumPackets,
7094 combineMode);
7095 }
7096
7097 if (verbose) {
7098 std::ostringstream os;
7099 os << *prefix << "Done" << endl;
7100 std::cerr << os.str();
7101 }
7102 }
7103
// Unpack 'imports' on host and combine each row's entries into this matrix;
// unpackAndCombineImpl calls this when the graph is not static.
//
// The packed buffer is walked twice. The first pass reads the leading
// entry count of every nonempty row to find the largest row, so that the
// temporary index/value arrays can be allocated once. The second pass
// unpacks each nonempty row with unpackRow and hands the result to
// combineGlobalValuesRaw.
//
// importLIDs:         local indices of the rows that receive data; must
//                     not need a host sync on entry (TEUCHOS_ASSERT below).
// imports:            packed data; synced to host here if needed.
// numPacketsPerLID:   number of bytes in 'imports' for each import LID;
//                     synced to host here if needed. Zero means empty row.
// constantNumPackets: not read in this function.
// combineMode:        ZERO returns immediately; otherwise forwarded to
//                     combineGlobalValuesRaw.
//
// Raises std::logic_error if unpackRow reports a byte count different from
// the row's packet count, and (debug mode only) on inconsistent sizes found
// in the first pass.
7104 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7105 void
7108 const Kokkos::DualView<const local_ordinal_type*,
7109 buffer_device_type>& importLIDs,
7110 Kokkos::DualView<char*, buffer_device_type> imports,
7111 Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
7112 const size_t constantNumPackets,
7113 const CombineMode combineMode)
7114 {
7115 using Kokkos::View;
7116 using Kokkos::subview;
7117 using Kokkos::MemoryUnmanaged;
7118 using Details::Behavior;
7121 using Details::PackTraits;
7123 using std::endl;
7124 using LO = LocalOrdinal;
7125 using GO = GlobalOrdinal;
7126 using ST = impl_scalar_type;
7127 using size_type = typename Teuchos::ArrayView<LO>::size_type;
7128 using HES =
7129 typename View<int*, device_type>::HostMirror::execution_space;
7130 using pair_type = std::pair<typename View<int*, HES>::size_type,
7131 typename View<int*, HES>::size_type>;
7132 using gids_out_type = View<GO*, HES, MemoryUnmanaged>;
7133 using vals_out_type = View<ST*, HES, MemoryUnmanaged>;
7134 const char tfecfFuncName[] = "unpackAndCombineImplNonStatic";
7135
7136 const bool debug = Behavior::debug("CrsMatrix");
7137 const bool verbose = Behavior::verbose("CrsMatrix");
7138 std::unique_ptr<std::string> prefix;
7139 if (verbose) {
7140 prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
7141 std::ostringstream os;
7142 os << *prefix << endl; // we've already printed DualViews' statuses
7143 std::cerr << os.str ();
7144 }
// Raw C string form of the prefix for combineGlobalValuesRaw; null unless
// verbose.
7145 const char* const prefix_raw =
7146 verbose ? prefix.get()->c_str() : nullptr;
7147
7148 const size_type numImportLIDs = importLIDs.extent (0);
7149 if (combineMode == ZERO || numImportLIDs == 0) {
7150 return; // nothing to do; no need to combine entries
7151 }
7152
7153 Details::ProfilingRegion region_unpack_and_combine_impl_non_static(
7154 "Tpetra::CrsMatrix::unpackAndCombineImplNonStatic",
7155 "Import/Export"
7156 );
7157
7158 // We're unpacking on host. This is read-only host access.
7159 if (imports.need_sync_host()) {
7160 imports.sync_host ();
7161 }
7162 auto imports_h = imports.view_host();
7163
7164 // Read-only host access.
7165 if (numPacketsPerLID.need_sync_host()) {
7166 numPacketsPerLID.sync_host ();
7167 }
7168 auto numPacketsPerLID_h = numPacketsPerLID.view_host();
7169
7170 TEUCHOS_ASSERT( ! importLIDs.need_sync_host() );
7171 auto importLIDs_h = importLIDs.view_host();
7172
7173 size_t numBytesPerValue;
7174 {
7175 // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
7176 // with run-time size? We already assume that all entries in both the
7177 // source and target matrices have the same size. If the calling process
7178 // owns at least one entry in either matrix, we can use that entry to set
7179 // the size. However, it is possible that the calling process owns no
7180 // entries. In that case, we're in trouble. One way to fix this would be
7181 // for each row's data to contain the run-time size. This is only
7182 // necessary if the size is not a compile-time constant.
// NOTE(review): 'val' is default-initialized and used only as a size
// probe; presumably packValueCount does not read its value for
// fixed-size Scalar types -- confirm.
7183 Scalar val;
7184 numBytesPerValue = PackTraits<ST>::packValueCount (val);
7185 }
7186
7187 // Determine the maximum number of entries in any one row
// Pass 1: read only the leading entry count of each nonempty row.
7188 size_t offset = 0;
7189 size_t maxRowNumEnt = 0;
7190 for (size_type i = 0; i < numImportLIDs; ++i) {
7191 const size_t numBytes = numPacketsPerLID_h[i];
7192 if (numBytes == 0) {
7193 continue; // empty buffer for that row means that the row is empty
7194 }
7195 // We need to unpack a nonzero number of entries for this row.
7196 if (debug) {
7197 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7198 (offset + numBytes > size_t(imports_h.extent (0)),
7199 std::logic_error, ": At local row index importLIDs_h[i="
7200 << i << "]=" << importLIDs_h[i] << ", offset (=" << offset
7201 << ") + numBytes (=" << numBytes << ") > "
7202 "imports_h.extent(0)=" << imports_h.extent (0) << ".");
7203 }
7204 LO numEntLO = 0;
7205
7206 if (debug) {
7207 const size_t theNumBytes =
7208 PackTraits<LO>::packValueCount (numEntLO);
7209 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7210 (theNumBytes > numBytes, std::logic_error, ": theNumBytes="
7211 << theNumBytes << " > numBytes = " << numBytes << ".");
7212 }
7213 const char* const inBuf = imports_h.data () + offset;
7214 const size_t actualNumBytes =
7215 PackTraits<LO>::unpackValue (numEntLO, inBuf);
7216
7217 if (debug) {
7218 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7219 (actualNumBytes > numBytes, std::logic_error, ": At i=" << i
7220 << ", actualNumBytes=" << actualNumBytes
7221 << " > numBytes=" << numBytes << ".");
7222 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7223 (numEntLO == 0, std::logic_error, ": At local row index "
7224 "importLIDs_h[i=" << i << "]=" << importLIDs_h[i] << ", "
7225 "the number of entries read from the packed data is "
7226 "numEntLO=" << numEntLO << ", but numBytes=" << numBytes
7227 << " != 0.");
7228 }
7229
7230 maxRowNumEnt = std::max(size_t(numEntLO), maxRowNumEnt);
7231 offset += numBytes;
7232 }
7233
7234 // Temporary space to cache incoming global column indices and
7235 // values. Column indices come in as global indices, in case the
7236 // source object's column Map differs from the target object's
7237 // (this's) column Map.
// Note: lclColInds is allocated below but is not referenced again in this
// function.
7238 View<GO*, HES> gblColInds;
7239 View<LO*, HES> lclColInds;
7240 View<ST*, HES> vals;
7241 {
7242 GO gid = 0;
7243 LO lid = 0;
7244 // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
7245 // with run-time size? We already assume that all entries in both the
7246 // source and target matrices have the same size. If the calling process
7247 // owns at least one entry in either matrix, we can use that entry to set
7248 // the size. However, it is possible that the calling process owns no
7249 // entries. In that case, we're in trouble. One way to fix this would be
7250 // for each row's data to contain the run-time size. This is only
7251 // necessary if the size is not a compile-time constant.
7252 Scalar val;
7253 gblColInds = ScalarViewTraits<GO, HES>::allocateArray(
7254 gid, maxRowNumEnt, "gids");
7255 lclColInds = ScalarViewTraits<LO, HES>::allocateArray(
7256 lid, maxRowNumEnt, "lids");
7257 vals = ScalarViewTraits<ST, HES>::allocateArray(
7258 val, maxRowNumEnt, "vals");
7259 }
7260
// Pass 2: unpack each nonempty row and combine it into the matrix.
7261 offset = 0;
7262 for (size_type i = 0; i < numImportLIDs; ++i) {
7263 const size_t numBytes = numPacketsPerLID_h[i];
7264 if (numBytes == 0) {
7265 continue; // empty buffer for that row means that the row is empty
7266 }
// Re-read the row's entry count from the start of its packed bytes.
7267 LO numEntLO = 0;
7268 const char* const inBuf = imports_h.data () + offset;
7269 (void) PackTraits<LO>::unpackValue (numEntLO, inBuf);
7270
7271 const size_t numEnt = static_cast<size_t>(numEntLO);;
7272 const LO lclRow = importLIDs_h[i];
7273
// Unmanaged views over the first numEnt slots of the scratch arrays.
7274 gids_out_type gidsOut = subview (gblColInds, pair_type (0, numEnt));
7275 vals_out_type valsOut = subview (vals, pair_type (0, numEnt));
7276
7277 const size_t numBytesOut =
7278 unpackRow (gidsOut.data (), valsOut.data (), imports_h.data (),
7279 offset, numBytes, numEnt, numBytesPerValue);
7280 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7281 (numBytes != numBytesOut, std::logic_error, ": At i=" << i
7282 << ", numBytes=" << numBytes << " != numBytesOut="
7283 << numBytesOut << ".");
7284
7285 const ST* const valsRaw = const_cast<const ST*> (valsOut.data ());
7286 const GO* const gidsRaw = const_cast<const GO*> (gidsOut.data ());
7287 combineGlobalValuesRaw(lclRow, numEnt, valsRaw, gidsRaw,
7288 combineMode, prefix_raw, debug, verbose);
7289 // Don't update offset until current LID has succeeded.
7290 offset += numBytes;
7291 } // for each import LID i
7292
7293 if (verbose) {
7294 std::ostringstream os;
7295 os << *prefix << "Done" << endl;
7296 std::cerr << os.str();
7297 }
7298 }
7299
// Return a multivector distributed by this matrix's column Map, with the
// same number of columns as X_domainMap, creating or reusing the cached
// importMV_ as needed.
//
// X_domainMap: only its number of vectors is read.
// force:       if true, return a multivector even when the graph's Import
//              object is null.
//
// Returns null when the Import object is null and force is false.
// Otherwise returns importMV_, which is (re)allocated first if it is null
// or has a different number of vectors. A reused multivector is returned
// without being zeroed.
//
// Raises std::runtime_error if the matrix has no column Map or its graph
// is not fill complete.
//
// NOTE(review): this const method assigns importMV_; presumably the member
// is declared mutable -- confirm in the class declaration.
7300 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7301 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7303 getColumnMapMultiVector (const MV& X_domainMap,
7304 const bool force) const
7305 {
7306 using Teuchos::null;
7307 using Teuchos::RCP;
7308 using Teuchos::rcp;
7309
7310 TEUCHOS_TEST_FOR_EXCEPTION(
7311 ! this->hasColMap (), std::runtime_error, "Tpetra::CrsMatrix::getColumn"
7312 "MapMultiVector: You may only call this method if the matrix has a "
7313 "column Map. If the matrix does not yet have a column Map, you should "
7314 "first call fillComplete (with domain and range Map if necessary).");
7315
7316 // If the graph is not fill complete, then the Import object (if
7317 // one should exist) hasn't been constructed yet.
7318 TEUCHOS_TEST_FOR_EXCEPTION(
7319 ! this->getGraph ()->isFillComplete (), std::runtime_error, "Tpetra::"
7320 "CrsMatrix::getColumnMapMultiVector: You may only call this method if "
7321 "this matrix's graph is fill complete.");
7322
7323 const size_t numVecs = X_domainMap.getNumVectors ();
7324 RCP<const import_type> importer = this->getGraph ()->getImporter ();
7325 RCP<const map_type> colMap = this->getColMap ();
7326
7327 RCP<MV> X_colMap; // null by default
7328
7329 // If the Import object is trivial (null), then we don't need a
7330 // separate column Map multivector. Just return null in that
7331 // case. The caller is responsible for knowing not to use the
7332 // returned null pointer.
7333 //
7334 // If the Import is nontrivial, then we do need a separate
7335 // column Map multivector for the Import operation. Check in
7336 // that case if we have to (re)create the column Map
7337 // multivector.
7338 if (! importer.is_null () || force) {
7339 if (importMV_.is_null () || importMV_->getNumVectors () != numVecs) {
7340 X_colMap = rcp (new MV (colMap, numVecs));
7341
7342 // Cache the newly created multivector for later reuse.
7343 importMV_ = X_colMap;
7344 }
7345 else { // Yay, we can reuse the cached multivector!
7346 X_colMap = importMV_;
7347 // mfh 09 Jan 2013: We don't have to fill with zeros first,
7348 // because the Import uses INSERT combine mode, which overwrites
7349 // existing entries.
7350 //
7351 //X_colMap->putScalar (ZERO);
7352 }
7353 }
7354 return X_colMap;
7355 }
7356
// Return a multivector distributed by this matrix's row Map, with the same
// number of columns as Y_rangeMap, creating or reusing the cached exportMV_
// as needed.
//
// Y_rangeMap: only its number of vectors is read.
// force:      if true, return a multivector even when the graph's Export
//             object is null.
//
// Returns null when the Export object is null and force is false.
// Otherwise returns exportMV_, which is (re)allocated first if it is null
// or has a different number of vectors.
//
// Raises std::runtime_error if the matrix's graph is not fill complete.
//
// NOTE(review): this const method assigns exportMV_; presumably the member
// is declared mutable -- confirm in the class declaration.
7357 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7358 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7361 const bool force) const
7362 {
7363 using Teuchos::null;
7364 using Teuchos::RCP;
7365 using Teuchos::rcp;
7366
7367 // If the graph is not fill complete, then the Export object (if
7368 // one should exist) hasn't been constructed yet.
7369 TEUCHOS_TEST_FOR_EXCEPTION(
7370 ! this->getGraph ()->isFillComplete (), std::runtime_error, "Tpetra::"
7371 "CrsMatrix::getRowMapMultiVector: You may only call this method if this "
7372 "matrix's graph is fill complete.");
7373
7374 const size_t numVecs = Y_rangeMap.getNumVectors ();
7375 RCP<const export_type> exporter = this->getGraph ()->getExporter ();
7376 // Every version of the constructor takes either a row Map, or a
7377 // graph (all of whose constructors take a row Map). Thus, the
7378 // matrix always has a row Map.
7379 RCP<const map_type> rowMap = this->getRowMap ();
7380
7381 RCP<MV> Y_rowMap; // null by default
7382
7383 // If the Export object is trivial (null), then we don't need a
7384 // separate row Map multivector. Just return null in that case.
7385 // The caller is responsible for knowing not to use the returned
7386 // null pointer.
7387 //
7388 // If the Export is nontrivial, then we do need a separate row
7389 // Map multivector for the Export operation. Check in that case
7390 // if we have to (re)create the row Map multivector.
7391 if (! exporter.is_null () || force) {
7392 if (exportMV_.is_null () || exportMV_->getNumVectors () != numVecs) {
7393 Y_rowMap = rcp (new MV (rowMap, numVecs));
7394 exportMV_ = Y_rowMap; // Cache the newly created MV for later reuse.
7395 }
7396 else { // Yay, we can reuse the cached multivector!
7397 Y_rowMap = exportMV_;
7398 }
7399 }
7400 return Y_rowMap;
7401 }
7402
// Forward removeEmptyProcessesInPlace(newMap) to the matrix's nonconst
// graph, then refresh the members of this object that mirror the graph:
// the DistObject Map (this->map_) and staticGraph_.
//
// newMap: passed unchanged to myGraph_->removeEmptyProcessesInPlace.
//
// Raises std::logic_error if myGraph_ is null, i.e. the matrix was created
// with a constant graph.
7403 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7404 void
7406 removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
7407 {
7408 TEUCHOS_TEST_FOR_EXCEPTION(
7409 myGraph_.is_null (), std::logic_error, "Tpetra::CrsMatrix::"
7410 "removeEmptyProcessesInPlace: This method does not work when the matrix "
7411 "was created with a constant graph (that is, when it was created using "
7412 "the version of its constructor that takes an RCP<const CrsGraph>). "
7413 "This is because the matrix is not allowed to modify the graph in that "
7414 "case, but removing empty processes requires modifying the graph.");
7415 myGraph_->removeEmptyProcessesInPlace (newMap);
7416 // Even though CrsMatrix's row Map (as returned by getRowMap())
7417 // comes from its CrsGraph, CrsMatrix still implements DistObject,
7418 // so we also have to change the DistObject's Map.
7419 this->map_ = this->getRowMap ();
7420 // In the nonconst graph case, staticGraph_ is just a const
7421 // pointer to myGraph_. This assignment is probably redundant,
7422 // but it doesn't hurt.
7423 staticGraph_ = Teuchos::rcp_const_cast<const Graph> (myGraph_);
7424 }
7425
7426 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7427 Teuchos::RCP<RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7429 add (const Scalar& alpha,
7431 const Scalar& beta,
7432 const Teuchos::RCP<const map_type>& domainMap,
7433 const Teuchos::RCP<const map_type>& rangeMap,
7434 const Teuchos::RCP<Teuchos::ParameterList>& params) const
7435 {
7436 using Teuchos::Array;
7437 using Teuchos::ArrayView;
7438 using Teuchos::ParameterList;
7439 using Teuchos::RCP;
7440 using Teuchos::rcp;
7441 using Teuchos::rcp_implicit_cast;
7442 using Teuchos::sublist;
7443 using std::endl;
7444 using LO = local_ordinal_type;
7445 using GO = global_ordinal_type;
7446 using crs_matrix_type =
7447 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
7448 const char errPfx[] = "Tpetra::CrsMatrix::add: ";
7449
7450 const bool debug = Details::Behavior::debug("CrsMatrix");
7451 const bool verbose = Details::Behavior::verbose("CrsMatrix");
7452 std::unique_ptr<std::string> prefix;
7453 if (verbose) {
7454 prefix = this->createPrefix("CrsMatrix", "add");
7455 std::ostringstream os;
7456 os << *prefix << "Start" << endl;
7457 std::cerr << os.str ();
7458 }
7459
7460 const crs_matrix_type& B = *this; // a convenient abbreviation
7461 const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero();
7462 const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one();
7463
7464 // If the user didn't supply a domain or range Map, then try to
7465 // get one from B first (if it has them), then from A (if it has
7466 // them). If we don't have any domain or range Maps, scold the
7467 // user.
7468 RCP<const map_type> A_domainMap = A.getDomainMap ();
7469 RCP<const map_type> A_rangeMap = A.getRangeMap ();
7470 RCP<const map_type> B_domainMap = B.getDomainMap ();
7471 RCP<const map_type> B_rangeMap = B.getRangeMap ();
7472
7473 RCP<const map_type> theDomainMap = domainMap;
7474 RCP<const map_type> theRangeMap = rangeMap;
7475
7476 if (domainMap.is_null ()) {
7477 if (B_domainMap.is_null ()) {
7478 TEUCHOS_TEST_FOR_EXCEPTION(
7479 A_domainMap.is_null (), std::invalid_argument,
7480 "Tpetra::CrsMatrix::add: If neither A nor B have a domain Map, "
7481 "then you must supply a nonnull domain Map to this method.");
7482 theDomainMap = A_domainMap;
7483 } else {
7484 theDomainMap = B_domainMap;
7485 }
7486 }
7487 if (rangeMap.is_null ()) {
7488 if (B_rangeMap.is_null ()) {
7489 TEUCHOS_TEST_FOR_EXCEPTION(
7490 A_rangeMap.is_null (), std::invalid_argument,
7491 "Tpetra::CrsMatrix::add: If neither A nor B have a range Map, "
7492 "then you must supply a nonnull range Map to this method.");
7493 theRangeMap = A_rangeMap;
7494 } else {
7495 theRangeMap = B_rangeMap;
7496 }
7497 }
7498
7499 if (debug) {
7500 // In debug mode, check that A and B have matching domain and
7501 // range Maps, if they have domain and range Maps at all. (If
7502 // they aren't fill complete, then they may not yet have them.)
7503 if (! A_domainMap.is_null() && ! A_rangeMap.is_null()) {
7504 if (! B_domainMap.is_null() && ! B_rangeMap.is_null()) {
7505 TEUCHOS_TEST_FOR_EXCEPTION
7506 (! B_domainMap->isSameAs(*A_domainMap),
7507 std::invalid_argument,
7508 errPfx << "The input RowMatrix A must have a domain Map "
7509 "which is the same as (isSameAs) this RowMatrix's "
7510 "domain Map.");
7511 TEUCHOS_TEST_FOR_EXCEPTION
7512 (! B_rangeMap->isSameAs(*A_rangeMap), std::invalid_argument,
7513 errPfx << "The input RowMatrix A must have a range Map "
7514 "which is the same as (isSameAs) this RowMatrix's range "
7515 "Map.");
7516 TEUCHOS_TEST_FOR_EXCEPTION
7517 (! domainMap.is_null() &&
7518 ! domainMap->isSameAs(*B_domainMap),
7519 std::invalid_argument,
7520 errPfx << "The input domain Map must be the same as "
7521 "(isSameAs) this RowMatrix's domain Map.");
7522 TEUCHOS_TEST_FOR_EXCEPTION
7523 (! rangeMap.is_null() &&
7524 ! rangeMap->isSameAs(*B_rangeMap),
7525 std::invalid_argument,
7526 errPfx << "The input range Map must be the same as "
7527 "(isSameAs) this RowMatrix's range Map.");
7528 }
7529 }
7530 else if (! B_domainMap.is_null() && ! B_rangeMap.is_null()) {
7531 TEUCHOS_TEST_FOR_EXCEPTION
7532 (! domainMap.is_null() &&
7533 ! domainMap->isSameAs(*B_domainMap),
7534 std::invalid_argument,
7535 errPfx << "The input domain Map must be the same as "
7536 "(isSameAs) this RowMatrix's domain Map.");
7537 TEUCHOS_TEST_FOR_EXCEPTION
7538 (! rangeMap.is_null() && ! rangeMap->isSameAs(*B_rangeMap),
7539 std::invalid_argument,
7540 errPfx << "The input range Map must be the same as "
7541 "(isSameAs) this RowMatrix's range Map.");
7542 }
7543 else {
7544 TEUCHOS_TEST_FOR_EXCEPTION
7545 (domainMap.is_null() || rangeMap.is_null(),
7546 std::invalid_argument, errPfx << "If neither A nor B "
7547 "have a domain and range Map, then you must supply a "
7548 "nonnull domain and range Map to this method.");
7549 }
7550 }
7551
7552 // What parameters do we pass to C's constructor? Do we call
7553 // fillComplete on C after filling it? And if so, what parameters
7554 // do we pass to C's fillComplete call?
7555 bool callFillComplete = true;
7556 RCP<ParameterList> constructorSublist;
7557 RCP<ParameterList> fillCompleteSublist;
7558 if (! params.is_null()) {
7559 callFillComplete =
7560 params->get("Call fillComplete", callFillComplete);
7561 constructorSublist = sublist(params, "Constructor parameters");
7562 fillCompleteSublist = sublist(params, "fillComplete parameters");
7563 }
7564
7565 RCP<const map_type> A_rowMap = A.getRowMap ();
7566 RCP<const map_type> B_rowMap = B.getRowMap ();
7567 RCP<const map_type> C_rowMap = B_rowMap; // see discussion in documentation
7568 RCP<crs_matrix_type> C; // The result matrix.
7569
7570 // If A and B's row Maps are the same, we can compute an upper
7571 // bound on the number of entries in each row of C, before
7572 // actually computing the sum. A reasonable upper bound is the
7573 // sum of the two entry counts in each row.
7574 if (A_rowMap->isSameAs (*B_rowMap)) {
7575 const LO localNumRows = static_cast<LO> (A_rowMap->getLocalNumElements ());
7576 Array<size_t> C_maxNumEntriesPerRow (localNumRows, 0);
7577
7578 // Get the number of entries in each row of A.
7579 if (alpha != ZERO) {
7580 for (LO localRow = 0; localRow < localNumRows; ++localRow) {
7581 const size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
7582 C_maxNumEntriesPerRow[localRow] += A_numEntries;
7583 }
7584 }
7585 // Get the number of entries in each row of B.
7586 if (beta != ZERO) {
7587 for (LO localRow = 0; localRow < localNumRows; ++localRow) {
7588 const size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
7589 C_maxNumEntriesPerRow[localRow] += B_numEntries;
7590 }
7591 }
7592 // Construct the result matrix C.
7593 if (constructorSublist.is_null ()) {
7594 C = rcp (new crs_matrix_type (C_rowMap, C_maxNumEntriesPerRow ()));
7595 } else {
7596 C = rcp (new crs_matrix_type (C_rowMap, C_maxNumEntriesPerRow (),
7597 constructorSublist));
7598 }
7599 // Since A and B have the same row Maps, we could add them
7600 // together all at once and merge values before we call
7601 // insertGlobalValues. However, we don't really need to, since
7602 // we've already allocated enough space in each row of C for C
7603 // to do the merge itself.
7604 }
7605 else { // the row Maps of A and B are not the same
7606 // Construct the result matrix C.
7607 // true: !A_rowMap->isSameAs (*B_rowMap)
7608 TEUCHOS_TEST_FOR_EXCEPTION
7609 (true, std::invalid_argument, errPfx << "The row maps must "
7610 "be the same for statically allocated matrices, to ensure "
7611 "that there is sufficient space to do the addition.");
7612 }
7613
7614 TEUCHOS_TEST_FOR_EXCEPTION
7615 (C.is_null (), std::logic_error,
7616 errPfx << "C should not be null at this point. "
7617 "Please report this bug to the Tpetra developers.");
7618
7619 if (verbose) {
7620 std::ostringstream os;
7621 os << *prefix << "Compute C = alpha*A + beta*B" << endl;
7622 std::cerr << os.str ();
7623 }
7624 using gids_type = nonconst_global_inds_host_view_type;
7625 using vals_type = nonconst_values_host_view_type;
7626 gids_type ind;
7627 vals_type val;
7628
7629 if (alpha != ZERO) {
7630 const LO A_localNumRows = static_cast<LO> (A_rowMap->getLocalNumElements ());
7631 for (LO localRow = 0; localRow < A_localNumRows; ++localRow) {
7632 size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
7633 const GO globalRow = A_rowMap->getGlobalElement (localRow);
7634 if (A_numEntries > static_cast<size_t> (ind.size ())) {
7635 Kokkos::resize(ind,A_numEntries);
7636 Kokkos::resize(val,A_numEntries);
7637 }
7638 gids_type indView = Kokkos::subview(ind,std::make_pair((size_t)0, A_numEntries));
7639 vals_type valView = Kokkos::subview(val,std::make_pair((size_t)0, A_numEntries));
7640 A.getGlobalRowCopy (globalRow, indView, valView, A_numEntries);
7641
7642 if (alpha != ONE) {
7643 for (size_t k = 0; k < A_numEntries; ++k) {
7644 valView[k] *= alpha;
7645 }
7646 }
7647 C->insertGlobalValues (globalRow, A_numEntries,
7648 reinterpret_cast<Scalar *>(valView.data()),
7649 indView.data());
7650 }
7651 }
7652
7653 if (beta != ZERO) {
7654 const LO B_localNumRows = static_cast<LO> (B_rowMap->getLocalNumElements ());
7655 for (LO localRow = 0; localRow < B_localNumRows; ++localRow) {
7656 size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
7657 const GO globalRow = B_rowMap->getGlobalElement (localRow);
7658 if (B_numEntries > static_cast<size_t> (ind.size ())) {
7659 Kokkos::resize(ind,B_numEntries);
7660 Kokkos::resize(val,B_numEntries);
7661 }
7662 gids_type indView = Kokkos::subview(ind,std::make_pair((size_t)0, B_numEntries));
7663 vals_type valView = Kokkos::subview(val,std::make_pair((size_t)0, B_numEntries));
7664 B.getGlobalRowCopy (globalRow, indView, valView, B_numEntries);
7665
7666 if (beta != ONE) {
7667 for (size_t k = 0; k < B_numEntries; ++k) {
7668 valView[k] *= beta;
7669 }
7670 }
7671 C->insertGlobalValues (globalRow, B_numEntries,
7672 reinterpret_cast<Scalar *>(valView.data()),
7673 indView.data());
7674 }
7675 }
7676
7677 if (callFillComplete) {
7678 if (verbose) {
7679 std::ostringstream os;
7680 os << *prefix << "Call fillComplete on C" << endl;
7681 std::cerr << os.str ();
7682 }
7683 if (fillCompleteSublist.is_null ()) {
7684 C->fillComplete (theDomainMap, theRangeMap);
7685 } else {
7686 C->fillComplete (theDomainMap, theRangeMap, fillCompleteSublist);
7687 }
7688 }
7689 else if (verbose) {
7690 std::ostringstream os;
7691 os << *prefix << "Do NOT call fillComplete on C" << endl;
7692 std::cerr << os.str ();
7693 }
7694
7695 if (verbose) {
7696 std::ostringstream os;
7697 os << *prefix << "Done" << endl;
7698 std::cerr << os.str ();
7699 }
7700 return rcp_implicit_cast<row_matrix_type> (C);
7701 }
7702
7703
7704
7705 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7706 void
7709 const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node>& rowTransfer,
7710 const Teuchos::RCP<const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node> > & domainTransfer,
7711 const Teuchos::RCP<const map_type>& domainMap,
7712 const Teuchos::RCP<const map_type>& rangeMap,
7713 const Teuchos::RCP<Teuchos::ParameterList>& params) const
7714 {
7715 using Details::Behavior;
7720 using Teuchos::ArrayRCP;
7721 using Teuchos::ArrayView;
7722 using Teuchos::Comm;
7723 using Teuchos::ParameterList;
7724 using Teuchos::RCP;
7725 using std::endl;
7726 typedef LocalOrdinal LO;
7727 typedef GlobalOrdinal GO;
7728 typedef node_type NT;
7729 typedef CrsMatrix<Scalar, LO, GO, NT> this_CRS_type;
7730 typedef Vector<int, LO, GO, NT> IntVectorType;
7731 using Teuchos::as;
7732
7733 const bool debug = Behavior::debug("CrsMatrix");
7734 const bool verbose = Behavior::verbose("CrsMatrix");
7735 int MyPID = getComm ()->getRank ();
7736
7737 std::unique_ptr<std::string> verbosePrefix;
7738 if (verbose) {
7739 verbosePrefix =
7740 this->createPrefix("CrsMatrix", "transferAndFillComplete");
7741 std::ostringstream os;
7742 os << "Start" << endl;
7743 std::cerr << os.str();
7744 }
7745
7746 //
7747 // Get the caller's parameters
7748 //
7749 bool isMM = false; // optimize for matrix-matrix ops.
7750 bool reverseMode = false; // Are we in reverse mode?
7751 bool restrictComm = false; // Do we need to restrict the communicator?
7752
7753 int mm_optimization_core_count =
7754 Behavior::TAFC_OptimizationCoreCount();
7755 RCP<ParameterList> matrixparams; // parameters for the destination matrix
7756 bool overrideAllreduce = false;
7757 bool useKokkosPath = false;
7758 if (! params.is_null ()) {
7759 matrixparams = sublist (params, "CrsMatrix");
7760 reverseMode = params->get ("Reverse Mode", reverseMode);
7761 useKokkosPath = params->get ("TAFC: use kokkos path", useKokkosPath);
7762 restrictComm = params->get ("Restrict Communicator", restrictComm);
7763 auto & slist = params->sublist("matrixmatrix: kernel params",false);
7764 isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
7765 mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
7766
7767 overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
7768 if(getComm()->getSize() < mm_optimization_core_count && isMM) isMM = false;
7769 if(reverseMode) isMM = false;
7770 }
7771
7772 // Only used in the sparse matrix-matrix multiply (isMM) case.
7773 std::shared_ptr< ::Tpetra::Details::CommRequest> iallreduceRequest;
7774 int mismatch = 0;
7775 int reduced_mismatch = 0;
7776 if (isMM && !overrideAllreduce) {
7777
7778 // Test for pathological matrix transfer
7779 const bool source_vals = ! getGraph ()->getImporter ().is_null();
7780 const bool target_vals = ! (rowTransfer.getExportLIDs ().size() == 0 ||
7781 rowTransfer.getRemoteLIDs ().size() == 0);
7782 mismatch = (source_vals != target_vals) ? 1 : 0;
7783 iallreduceRequest =
7784 ::Tpetra::Details::iallreduce (mismatch, reduced_mismatch,
7785 Teuchos::REDUCE_MAX, * (getComm ()));
7786 }
7787
7788#ifdef HAVE_TPETRA_MMM_TIMINGS
7789 using Teuchos::TimeMonitor;
7790 std::string label;
7791 if(!params.is_null())
7792 label = params->get("Timer Label",label);
7793 std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
7794 std::string tlstr;
7795 {
7796 std::ostringstream os;
7797 if(isMM) os<<":MMOpt";
7798 else os<<":MMLegacy";
7799 tlstr = os.str();
7800 }
7801
7802 Teuchos::TimeMonitor MMall(*TimeMonitor::getNewTimer(prefix + std::string("TAFC All") +tlstr ));
7803#endif
7804
7805 // Make sure that the input argument rowTransfer is either an
7806 // Import or an Export. Import and Export are the only two
7807 // subclasses of Transfer that we defined, but users might
7808 // (unwisely, for now at least) decide to implement their own
7809 // subclasses. Exclude this possibility.
7810 const import_type* xferAsImport = dynamic_cast<const import_type*> (&rowTransfer);
7811 const export_type* xferAsExport = dynamic_cast<const export_type*> (&rowTransfer);
7812 TEUCHOS_TEST_FOR_EXCEPTION(
7813 xferAsImport == nullptr && xferAsExport == nullptr, std::invalid_argument,
7814 "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' input "
7815 "argument must be either an Import or an Export, and its template "
7816 "parameters must match the corresponding template parameters of the "
7817 "CrsMatrix.");
7818
7819 // Make sure that the input argument domainTransfer is either an
7820 // Import or an Export. Import and Export are the only two
7821 // subclasses of Transfer that we defined, but users might
7822 // (unwisely, for now at least) decide to implement their own
7823 // subclasses. Exclude this possibility.
7824 Teuchos::RCP<const import_type> xferDomainAsImport = Teuchos::rcp_dynamic_cast<const import_type> (domainTransfer);
7825 Teuchos::RCP<const export_type> xferDomainAsExport = Teuchos::rcp_dynamic_cast<const export_type> (domainTransfer);
7826
7827 if(! domainTransfer.is_null()) {
7828 TEUCHOS_TEST_FOR_EXCEPTION(
7829 (xferDomainAsImport.is_null() && xferDomainAsExport.is_null()), std::invalid_argument,
7830 "Tpetra::CrsMatrix::transferAndFillComplete: The 'domainTransfer' input "
7831 "argument must be either an Import or an Export, and its template "
7832 "parameters must match the corresponding template parameters of the "
7833 "CrsMatrix.");
7834
7835 TEUCHOS_TEST_FOR_EXCEPTION(
7836 ( xferAsImport != nullptr || ! xferDomainAsImport.is_null() ) &&
7837 (( xferAsImport != nullptr && xferDomainAsImport.is_null() ) ||
7838 ( xferAsImport == nullptr && ! xferDomainAsImport.is_null() )), std::invalid_argument,
7839 "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
7840 "arguments must be of the same type (either Import or Export).");
7841
7842 TEUCHOS_TEST_FOR_EXCEPTION(
7843 ( xferAsExport != nullptr || ! xferDomainAsExport.is_null() ) &&
7844 (( xferAsExport != nullptr && xferDomainAsExport.is_null() ) ||
7845 ( xferAsExport == nullptr && ! xferDomainAsExport.is_null() )), std::invalid_argument,
7846 "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
7847 "arguments must be of the same type (either Import or Export).");
7848 } // domainTransfer != null
7849
7850
7851 // FIXME (mfh 15 May 2014) Wouldn't communication still be needed,
7852 // if the source Map is not distributed but the target Map is?
7853 const bool communication_needed = rowTransfer.getSourceMap ()->isDistributed ();
7854
7855 // Get the new domain and range Maps. We need some of them for
7856 // error checking, now that we have the reverseMode parameter.
7857 RCP<const map_type> MyRowMap = reverseMode ?
7858 rowTransfer.getSourceMap () : rowTransfer.getTargetMap ();
7859 RCP<const map_type> MyColMap; // create this below
7860 RCP<const map_type> MyDomainMap = ! domainMap.is_null () ?
7861 domainMap : getDomainMap ();
7862 RCP<const map_type> MyRangeMap = ! rangeMap.is_null () ?
7863 rangeMap : getRangeMap ();
7864 RCP<const map_type> BaseRowMap = MyRowMap;
7865 RCP<const map_type> BaseDomainMap = MyDomainMap;
7866
7867 // If the user gave us a nonnull destMat, then check whether it's
7868 // "pristine." That means that it has no entries.
7869 //
7870 // FIXME (mfh 15 May 2014) If this is not true on all processes,
7871 // then this exception test may hang. It would be better to
7872 // forward an error flag to the next communication phase.
7873 if (! destMat.is_null ()) {
7874 // FIXME (mfh 15 May 2014): The Epetra idiom for checking
7875 // whether a graph or matrix has no entries on the calling
7876 // process, is that it is neither locally nor globally indexed.
7877 // This may change eventually with the Kokkos refactor version
7878 // of Tpetra, so it would be better just to check the quantity
7879 // of interest directly. Note that with the Kokkos refactor
7880 // version of Tpetra, asking for the total number of entries in
7881 // a graph or matrix that is not fill complete might require
7882 // computation (kernel launch), since it is not thread scalable
7883 // to update a count every time an entry is inserted.
7884 const bool NewFlag = ! destMat->getGraph ()->isLocallyIndexed () &&
7885 ! destMat->getGraph ()->isGloballyIndexed ();
7886 TEUCHOS_TEST_FOR_EXCEPTION(
7887 ! NewFlag, std::invalid_argument, "Tpetra::CrsMatrix::"
7888 "transferAndFillComplete: The input argument 'destMat' is only allowed "
7889 "to be nonnull, if its graph is empty (neither locally nor globally "
7890 "indexed).");
7891 // FIXME (mfh 15 May 2014) At some point, we want to change
7892 // graphs and matrices so that their DistObject Map
7893 // (this->getMap()) may differ from their row Map. This will
7894 // make redistribution for 2-D distributions more efficient. I
7895 // hesitate to change this check, because I'm not sure how much
7896 // the code here depends on getMap() and getRowMap() being the
7897 // same.
7898 TEUCHOS_TEST_FOR_EXCEPTION(
7899 ! destMat->getRowMap ()->isSameAs (*MyRowMap), std::invalid_argument,
7900 "Tpetra::CrsMatrix::transferAndFillComplete: The (row) Map of the "
7901 "input argument 'destMat' is not the same as the (row) Map specified "
7902 "by the input argument 'rowTransfer'.");
7903 TEUCHOS_TEST_FOR_EXCEPTION(
7904 ! destMat->checkSizes (*this), std::invalid_argument,
7905 "Tpetra::CrsMatrix::transferAndFillComplete: You provided a nonnull "
7906 "destination matrix, but checkSizes() indicates that it is not a legal "
7907 "legal target for redistribution from the source matrix (*this). This "
7908 "may mean that they do not have the same dimensions.");
7909 }
7910
7911 // If forward mode (the default), then *this's (row) Map must be
7912 // the same as the source Map of the Transfer. If reverse mode,
7913 // then *this's (row) Map must be the same as the target Map of
7914 // the Transfer.
7915 //
7916 // FIXME (mfh 15 May 2014) At some point, we want to change graphs
7917 // and matrices so that their DistObject Map (this->getMap()) may
7918 // differ from their row Map. This will make redistribution for
7919 // 2-D distributions more efficient. I hesitate to change this
7920 // check, because I'm not sure how much the code here depends on
7921 // getMap() and getRowMap() being the same.
7922 TEUCHOS_TEST_FOR_EXCEPTION(
7923 ! (reverseMode || getRowMap ()->isSameAs (*rowTransfer.getSourceMap ())),
7924 std::invalid_argument, "Tpetra::CrsMatrix::transferAndFillComplete: "
7925 "rowTransfer->getSourceMap() must match this->getRowMap() in forward mode.");
7926 TEUCHOS_TEST_FOR_EXCEPTION(
7927 ! (! reverseMode || getRowMap ()->isSameAs (*rowTransfer.getTargetMap ())),
7928 std::invalid_argument, "Tpetra::CrsMatrix::transferAndFillComplete: "
7929 "rowTransfer->getTargetMap() must match this->getRowMap() in reverse mode.");
7930
7931 // checks for domainTransfer
7932 TEUCHOS_TEST_FOR_EXCEPTION(
7933 ! xferDomainAsImport.is_null() && ! xferDomainAsImport->getTargetMap()->isSameAs(*domainMap),
7934 std::invalid_argument,
7935 "Tpetra::CrsMatrix::transferAndFillComplete: The target map of the 'domainTransfer' input "
7936 "argument must be the same as the rebalanced domain map 'domainMap'");
7937
7938 TEUCHOS_TEST_FOR_EXCEPTION(
7939 ! xferDomainAsExport.is_null() && ! xferDomainAsExport->getSourceMap()->isSameAs(*domainMap),
7940 std::invalid_argument,
7941 "Tpetra::CrsMatrix::transferAndFillComplete: The source map of the 'domainTransfer' input "
7942 "argument must be the same as the rebalanced domain map 'domainMap'");
7943
7944 // The basic algorithm here is:
7945 //
7946 // 1. Call the moral equivalent of "Distor.do" to handle the import.
7947 // 2. Copy all the Imported and Copy/Permuted data into the raw
7948 // CrsMatrix / CrsGraphData pointers, still using GIDs.
7949 // 3. Call an optimized version of MakeColMap that avoids the
7950 // Directory lookups (since the importer knows who owns all the
7951 // GIDs) AND reindexes to LIDs.
7952 // 4. Call expertStaticFillComplete()
7953
7954 // Get information from the Importer
7955 const size_t NumSameIDs = rowTransfer.getNumSameIDs();
7956 ArrayView<const LO> ExportLIDs = reverseMode ?
7957 rowTransfer.getRemoteLIDs () : rowTransfer.getExportLIDs ();
7958 auto RemoteLIDs = reverseMode ?
7959 rowTransfer.getExportLIDs_dv() : rowTransfer.getRemoteLIDs_dv();
7960 auto PermuteToLIDs = reverseMode ?
7961 rowTransfer.getPermuteFromLIDs_dv() : rowTransfer.getPermuteToLIDs_dv();
7962 auto PermuteFromLIDs = reverseMode ?
7963 rowTransfer.getPermuteToLIDs_dv() : rowTransfer.getPermuteFromLIDs_dv();
7964 Distributor& Distor = rowTransfer.getDistributor ();
7965
7966 // Owning PIDs
7967 Teuchos::Array<int> SourcePids;
7968
7969 // Temp variables for sub-communicators
7970 RCP<const map_type> ReducedRowMap, ReducedColMap,
7971 ReducedDomainMap, ReducedRangeMap;
7972 RCP<const Comm<int> > ReducedComm;
7973
7974 // If the user gave us a null destMat, then construct the new
7975 // destination matrix. We will replace its column Map later.
7976 if (destMat.is_null ()) {
7977 destMat = rcp (new this_CRS_type (MyRowMap, 0, matrixparams));
7978 }
7979
7980 /***************************************************/
7981 /***** 1) First communicator restriction phase ****/
7982 /***************************************************/
7983 if (restrictComm) {
7984#ifdef HAVE_TPETRA_MMM_TIMINGS
7985 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrictComm")));
7986#endif
7987 ReducedRowMap = MyRowMap->removeEmptyProcesses ();
7988 ReducedComm = ReducedRowMap.is_null () ?
7989 Teuchos::null :
7990 ReducedRowMap->getComm ();
7991 destMat->removeEmptyProcessesInPlace (ReducedRowMap);
7992
7993 ReducedDomainMap = MyRowMap.getRawPtr () == MyDomainMap.getRawPtr () ?
7994 ReducedRowMap :
7995 MyDomainMap->replaceCommWithSubset (ReducedComm);
7996 ReducedRangeMap = MyRowMap.getRawPtr () == MyRangeMap.getRawPtr () ?
7997 ReducedRowMap :
7998 MyRangeMap->replaceCommWithSubset (ReducedComm);
7999
8000 // Reset the "my" maps
8001 MyRowMap = ReducedRowMap;
8002 MyDomainMap = ReducedDomainMap;
8003 MyRangeMap = ReducedRangeMap;
8004
8005 // Update my PID, if we've restricted the communicator
8006 if (! ReducedComm.is_null ()) {
8007 MyPID = ReducedComm->getRank ();
8008 }
8009 else {
8010 MyPID = -2; // For debugging
8011 }
8012 }
8013 else {
8014 ReducedComm = MyRowMap->getComm ();
8015 }
8016
8017
8018
8019 /***************************************************/
8020 /***** 2) From Tpetra::DistObject::doTransfer() ****/
8021 /***************************************************/
8022 // Get the owning PIDs
8023 RCP<const import_type> MyImporter = getGraph ()->getImporter ();
8024
8025 // Check whether the source matrix's domain Map is the same as the base domain Map.
8026 bool bSameDomainMap = BaseDomainMap->isSameAs (*getDomainMap ());
8027
8028 if (! restrictComm && ! MyImporter.is_null () && bSameDomainMap ) {
8029#ifdef HAVE_TPETRA_MMM_TIMINGS
8030 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs same map")));
8031#endif
8032 // Same domain map as source matrix
8033 //
8034 // NOTE: This won't work for restrictComm (because the Import
8035 // doesn't know the restricted PIDs), though writing an
8036 // optimized version for that case would be easy (Import an
8037 // IntVector of the new PIDs). Might want to add this later.
8038 Import_Util::getPids (*MyImporter, SourcePids, false);
8039 }
8040 else if (restrictComm && ! MyImporter.is_null () && bSameDomainMap) {
8041 // Same domain map as source matrix (restricted communicator)
8042 // We need one import from the domain to the column map
8043#ifdef HAVE_TPETRA_MMM_TIMINGS
8044 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs restricted comm")));
8045#endif
8046 IntVectorType SourceDomain_pids(getDomainMap (),true);
8047 IntVectorType SourceCol_pids(getColMap());
8048 // SourceDomain_pids contains the restricted pids
8049 SourceDomain_pids.putScalar(MyPID);
8050
8051 SourceCol_pids.doImport (SourceDomain_pids, *MyImporter, INSERT);
8052 SourcePids.resize (getColMap ()->getLocalNumElements ());
8053 SourceCol_pids.get1dCopy (SourcePids ());
8054 }
8055 else if (MyImporter.is_null ()) {
8056 // Matrix has no off-process entries
8057#ifdef HAVE_TPETRA_MMM_TIMINGS
8058 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs all local entries")));
8059#endif
8060 SourcePids.resize (getColMap ()->getLocalNumElements ());
8061 SourcePids.assign (getColMap ()->getLocalNumElements (), MyPID);
8062 }
8063 else if ( ! MyImporter.is_null () &&
8064 ! domainTransfer.is_null () ) {
8065 // general implementation for rectangular matrices with
8066 // domain map different than SourceMatrix domain map.
8067 // User has to provide a DomainTransfer object. We need
8068 // two communications (import/export)
8069#ifdef HAVE_TPETRA_MMM_TIMINGS
8070 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs rectangular case")));
8071#endif
8072
8073 // TargetDomain_pids lives on the rebalanced new domain map
8074 IntVectorType TargetDomain_pids (domainMap);
8075 TargetDomain_pids.putScalar (MyPID);
8076
8077 // SourceDomain_pids lives on the non-rebalanced old domain map
8078 IntVectorType SourceDomain_pids (getDomainMap ());
8079
8080 // SourceCol_pids lives on the non-rebalanced old column map
8081 IntVectorType SourceCol_pids (getColMap ());
8082
8083 if (! reverseMode && ! xferDomainAsImport.is_null() ) {
8084 SourceDomain_pids.doExport (TargetDomain_pids, *xferDomainAsImport, INSERT);
8085 }
8086 else if (reverseMode && ! xferDomainAsExport.is_null() ) {
8087 SourceDomain_pids.doExport (TargetDomain_pids, *xferDomainAsExport, INSERT);
8088 }
8089 else if (! reverseMode && ! xferDomainAsExport.is_null() ) {
8090 SourceDomain_pids.doImport (TargetDomain_pids, *xferDomainAsExport, INSERT);
8091 }
8092 else if (reverseMode && ! xferDomainAsImport.is_null() ) {
8093 SourceDomain_pids.doImport (TargetDomain_pids, *xferDomainAsImport, INSERT);
8094 }
8095 else {
8096 TEUCHOS_TEST_FOR_EXCEPTION(
8097 true, std::logic_error, "Tpetra::CrsMatrix::"
8098 "transferAndFillComplete: Should never get here! "
8099 "Please report this bug to a Tpetra developer.");
8100 }
8101 SourceCol_pids.doImport (SourceDomain_pids, *MyImporter, INSERT);
8102 SourcePids.resize (getColMap ()->getLocalNumElements ());
8103 SourceCol_pids.get1dCopy (SourcePids ());
8104 }
8105 else if ( ! MyImporter.is_null () &&
8106 BaseDomainMap->isSameAs (*BaseRowMap) &&
8107 getDomainMap ()->isSameAs (*getRowMap ())) {
8108 // We can use the rowTransfer + SourceMatrix's Import to find out who owns what.
8109#ifdef HAVE_TPETRA_MMM_TIMINGS
8110 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs query import")));
8111#endif
8112
8113 IntVectorType TargetRow_pids (domainMap);
8114 IntVectorType SourceRow_pids (getRowMap ());
8115 IntVectorType SourceCol_pids (getColMap ());
8116
8117 TargetRow_pids.putScalar (MyPID);
8118 if (! reverseMode && xferAsImport != nullptr) {
8119 SourceRow_pids.doExport (TargetRow_pids, *xferAsImport, INSERT);
8120 }
8121 else if (reverseMode && xferAsExport != nullptr) {
8122 SourceRow_pids.doExport (TargetRow_pids, *xferAsExport, INSERT);
8123 }
8124 else if (! reverseMode && xferAsExport != nullptr) {
8125 SourceRow_pids.doImport (TargetRow_pids, *xferAsExport, INSERT);
8126 }
8127 else if (reverseMode && xferAsImport != nullptr) {
8128 SourceRow_pids.doImport (TargetRow_pids, *xferAsImport, INSERT);
8129 }
8130 else {
8131 TEUCHOS_TEST_FOR_EXCEPTION(
8132 true, std::logic_error, "Tpetra::CrsMatrix::"
8133 "transferAndFillComplete: Should never get here! "
8134 "Please report this bug to a Tpetra developer.");
8135 }
8136
8137 SourceCol_pids.doImport (SourceRow_pids, *MyImporter, INSERT);
8138 SourcePids.resize (getColMap ()->getLocalNumElements ());
8139 SourceCol_pids.get1dCopy (SourcePids ());
8140 }
8141 else {
8142 TEUCHOS_TEST_FOR_EXCEPTION(
8143 true, std::invalid_argument, "Tpetra::CrsMatrix::"
8144 "transferAndFillComplete: This method only allows either domainMap == "
8145 "getDomainMap (), or (domainMap == rowTransfer.getTargetMap () and "
8146 "getDomainMap () == getRowMap ()).");
8147 }
8148
8149 // Tpetra-specific stuff
8150 size_t constantNumPackets = destMat->constantNumberOfPackets ();
8151 {
8152#ifdef HAVE_TPETRA_MMM_TIMINGS
8153 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC reallocate buffers")));
8154#endif
8155 if (constantNumPackets == 0) {
8156 destMat->reallocArraysForNumPacketsPerLid (ExportLIDs.size (),
8157 RemoteLIDs.view_host().size ());
8158 }
8159 else {
8160 // There are a constant number of packets per element. We
8161 // already know (from the number of "remote" (incoming)
8162 // elements) how many incoming elements we expect, so we can
8163 // resize the buffer accordingly.
8164 const size_t rbufLen = RemoteLIDs.view_host().size() * constantNumPackets;
8165 destMat->reallocImportsIfNeeded (rbufLen, false, nullptr);
8166 }
8167 }
8168
8169 // Pack & Prepare w/ owning PIDs
8170 {
8171#ifdef HAVE_TPETRA_MMM_TIMINGS
8172 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC pack and prepare")));
8173#endif
8174 if (debug) {
8175 using Teuchos::outArg;
8176 using Teuchos::REDUCE_MAX;
8177 using Teuchos::reduceAll;
8178 using std::cerr;
8179 using std::endl;
8180 RCP<const Teuchos::Comm<int> > comm = this->getComm ();
8181 const int myRank = comm->getRank ();
8182
8183 std::ostringstream errStrm;
8184 int lclErr = 0;
8185 int gblErr = 0;
8186
8187 Teuchos::ArrayView<size_t> numExportPacketsPerLID;
8188 try {
8189 // packAndPrepare* methods modify numExportPacketsPerLID_.
8190 destMat->numExportPacketsPerLID_.modify_host ();
8191 numExportPacketsPerLID =
8192 getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8193 }
8194 catch (std::exception& e) {
8195 errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw: "
8196 << e.what () << std::endl;
8197 lclErr = 1;
8198 }
8199 catch (...) {
8200 errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw "
8201 "an exception not a subclass of std::exception" << std::endl;
8202 lclErr = 1;
8203 }
8204
8205 if (! comm.is_null ()) {
8206 reduceAll<int, int> (*comm, REDUCE_MAX, lclErr, outArg (gblErr));
8207 }
8208 if (gblErr != 0) {
8209 ::Tpetra::Details::gathervPrint (cerr, errStrm.str (), *comm);
8210 TEUCHOS_TEST_FOR_EXCEPTION(
8211 true, std::runtime_error, "getArrayViewFromDualView threw an "
8212 "exception on at least one process.");
8213 }
8214
8215 if (verbose) {
8216 std::ostringstream os;
8217 os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
8218 << std::endl;
8219 std::cerr << os.str ();
8220 }
8221 try {
8223 destMat->exports_,
8224 numExportPacketsPerLID,
8225 ExportLIDs,
8226 SourcePids,
8227 constantNumPackets);
8228 }
8229 catch (std::exception& e) {
8230 errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw: "
8231 << e.what () << std::endl;
8232 lclErr = 1;
8233 }
8234 catch (...) {
8235 errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw "
8236 "an exception not a subclass of std::exception" << std::endl;
8237 lclErr = 1;
8238 }
8239
8240 if (verbose) {
8241 std::ostringstream os;
8242 os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
8243 << std::endl;
8244 std::cerr << os.str ();
8245 }
8246
8247 if (! comm.is_null ()) {
8248 reduceAll<int, int> (*comm, REDUCE_MAX, lclErr, outArg (gblErr));
8249 }
8250 if (gblErr != 0) {
8251 ::Tpetra::Details::gathervPrint (cerr, errStrm.str (), *comm);
8252 TEUCHOS_TEST_FOR_EXCEPTION(
8253 true, std::runtime_error, "packCrsMatrixWithOwningPIDs threw an "
8254 "exception on at least one process.");
8255 }
8256 }
8257 else {
8258 // packAndPrepare* methods modify numExportPacketsPerLID_.
8259 destMat->numExportPacketsPerLID_.modify_host ();
8260 Teuchos::ArrayView<size_t> numExportPacketsPerLID =
8261 getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8262 if (verbose) {
8263 std::ostringstream os;
8264 os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
8265 << std::endl;
8266 std::cerr << os.str ();
8267 }
8269 destMat->exports_,
8270 numExportPacketsPerLID,
8271 ExportLIDs,
8272 SourcePids,
8273 constantNumPackets);
8274 if (verbose) {
8275 std::ostringstream os;
8276 os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
8277 << std::endl;
8278 std::cerr << os.str ();
8279 }
8280 }
8281 }
8282
8283 // Do the exchange of remote data.
8284 {
8285#ifdef HAVE_TPETRA_MMM_TIMINGS
8286 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs exchange remote data")));
8287#endif
8288 if (! communication_needed) {
8289 if (verbose) {
8290 std::ostringstream os;
8291 os << *verbosePrefix << "Communication not needed" << std::endl;
8292 std::cerr << os.str ();
8293 }
8294 }
8295 else {
8296 if (reverseMode) {
8297 if (constantNumPackets == 0) { // variable number of packets per LID
8298 if (verbose) {
8299 std::ostringstream os;
8300 os << *verbosePrefix << "Reverse mode, variable # packets / LID"
8301 << std::endl;
8302 std::cerr << os.str ();
8303 }
8304 // Make sure that host has the latest version, since we're
8305 // using the version on host. If host has the latest
8306 // version, syncing to host does nothing.
8307 destMat->numExportPacketsPerLID_.sync_host ();
8308 Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
8309 getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8310 destMat->numImportPacketsPerLID_.sync_host ();
8311 Teuchos::ArrayView<size_t> numImportPacketsPerLID =
8312 getArrayViewFromDualView (destMat->numImportPacketsPerLID_);
8313
8314 if (verbose) {
8315 std::ostringstream os;
8316 os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
8317 << std::endl;
8318 std::cerr << os.str ();
8319 }
8320 Distor.doReversePostsAndWaits(destMat->numExportPacketsPerLID_.view_host(), 1,
8321 destMat->numImportPacketsPerLID_.view_host());
8322 if (verbose) {
8323 std::ostringstream os;
8324 os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
8325 << std::endl;
8326 std::cerr << os.str ();
8327 }
8328
8329 size_t totalImportPackets = 0;
8330 for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) {
8331 totalImportPackets += numImportPacketsPerLID[i];
8332 }
8333
8334 // Reallocation MUST go before setting the modified flag,
8335 // because it may clear out the flags.
8336 destMat->reallocImportsIfNeeded (totalImportPackets, verbose,
8337 verbosePrefix.get ());
8338 destMat->imports_.modify_host ();
8339 auto hostImports = destMat->imports_.view_host();
8340 // This is a legacy host pack/unpack path, so use the host
8341 // version of exports_.
8342 destMat->exports_.sync_host ();
8343 auto hostExports = destMat->exports_.view_host();
8344 if (verbose) {
8345 std::ostringstream os;
8346 os << *verbosePrefix << "Calling 4-arg doReversePostsAndWaits"
8347 << std::endl;
8348 std::cerr << os.str ();
8349 }
8350 Distor.doReversePostsAndWaits (hostExports,
8351 numExportPacketsPerLID,
8352 hostImports,
8353 numImportPacketsPerLID);
8354 if (verbose) {
8355 std::ostringstream os;
8356 os << *verbosePrefix << "Finished 4-arg doReversePostsAndWaits"
8357 << std::endl;
8358 std::cerr << os.str ();
8359 }
8360 }
8361 else { // constant number of packets per LID
8362 if (verbose) {
8363 std::ostringstream os;
8364 os << *verbosePrefix << "Reverse mode, constant # packets / LID"
8365 << std::endl;
8366 std::cerr << os.str ();
8367 }
8368 destMat->imports_.modify_host ();
8369 auto hostImports = destMat->imports_.view_host();
8370 // This is a legacy host pack/unpack path, so use the host
8371 // version of exports_.
8372 destMat->exports_.sync_host ();
8373 auto hostExports = destMat->exports_.view_host();
8374 if (verbose) {
8375 std::ostringstream os;
8376 os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
8377 << std::endl;
8378 std::cerr << os.str ();
8379 }
8380 Distor.doReversePostsAndWaits (hostExports,
8381 constantNumPackets,
8382 hostImports);
8383 if (verbose) {
8384 std::ostringstream os;
8385 os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
8386 << std::endl;
8387 std::cerr << os.str ();
8388 }
8389 }
8390 }
8391 else { // forward mode (the default)
8392 if (constantNumPackets == 0) { // variable number of packets per LID
8393 if (verbose) {
8394 std::ostringstream os;
8395 os << *verbosePrefix << "Forward mode, variable # packets / LID"
8396 << std::endl;
8397 std::cerr << os.str ();
8398 }
8399 // Make sure that host has the latest version, since we're
8400 // using the version on host. If host has the latest
8401 // version, syncing to host does nothing.
8402 destMat->numExportPacketsPerLID_.sync_host ();
8403 Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
8404 getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8405 destMat->numImportPacketsPerLID_.sync_host ();
8406 Teuchos::ArrayView<size_t> numImportPacketsPerLID =
8407 getArrayViewFromDualView (destMat->numImportPacketsPerLID_);
8408 if (verbose) {
8409 std::ostringstream os;
8410 os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
8411 << std::endl;
8412 std::cerr << os.str ();
8413 }
8414 Distor.doPostsAndWaits(destMat->numExportPacketsPerLID_.view_host(), 1,
8415 destMat->numImportPacketsPerLID_.view_host());
8416 if (verbose) {
8417 std::ostringstream os;
8418 os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
8419 << std::endl;
8420 std::cerr << os.str ();
8421 }
8422
8423 size_t totalImportPackets = 0;
8424 for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) {
8425 totalImportPackets += numImportPacketsPerLID[i];
8426 }
8427
8428 // Reallocation MUST go before setting the modified flag,
8429 // because it may clear out the flags.
8430 destMat->reallocImportsIfNeeded (totalImportPackets, verbose,
8431 verbosePrefix.get ());
8432 destMat->imports_.modify_host ();
8433 auto hostImports = destMat->imports_.view_host();
8434 // This is a legacy host pack/unpack path, so use the host
8435 // version of exports_.
8436 destMat->exports_.sync_host ();
8437 auto hostExports = destMat->exports_.view_host();
8438 if (verbose) {
8439 std::ostringstream os;
8440 os << *verbosePrefix << "Calling 4-arg doPostsAndWaits"
8441 << std::endl;
8442 std::cerr << os.str ();
8443 }
8444 Distor.doPostsAndWaits (hostExports,
8445 numExportPacketsPerLID,
8446 hostImports,
8447 numImportPacketsPerLID);
8448 if (verbose) {
8449 std::ostringstream os;
8450 os << *verbosePrefix << "Finished 4-arg doPostsAndWaits"
8451 << std::endl;
8452 std::cerr << os.str ();
8453 }
8454 }
8455 else { // constant number of packets per LID
8456 if (verbose) {
8457 std::ostringstream os;
8458 os << *verbosePrefix << "Forward mode, constant # packets / LID"
8459 << std::endl;
8460 std::cerr << os.str ();
8461 }
8462 destMat->imports_.modify_host ();
8463 auto hostImports = destMat->imports_.view_host();
8464 // This is a legacy host pack/unpack path, so use the host
8465 // version of exports_.
8466 destMat->exports_.sync_host ();
8467 auto hostExports = destMat->exports_.view_host();
8468 if (verbose) {
8469 std::ostringstream os;
8470 os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
8471 << std::endl;
8472 std::cerr << os.str ();
8473 }
8474 Distor.doPostsAndWaits (hostExports,
8475 constantNumPackets,
8476 hostImports);
8477 if (verbose) {
8478 std::ostringstream os;
8479 os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
8480 << std::endl;
8481 std::cerr << os.str ();
8482 }
8483 }
8484 }
8485 }
8486 }
8487
8488 /*********************************************************************/
8489 /**** 3) Copy all of the Same/Permute/Remote data into CSR_arrays ****/
8490 /*********************************************************************/
8491
8492 bool runOnHost = std::is_same_v<typename device_type::memory_space, Kokkos::HostSpace> && !useKokkosPath;
8493
8494 Teuchos::Array<int> RemotePids;
8495 if (runOnHost) {
8496 Teuchos::Array<int> TargetPids;
8497 // Backwards compatibility measure. We'll use this again below.
8498
8499 // TODO JHU Need to track down why numImportPacketsPerLID_ has not been correctly marked as modified on host (which it has been)
8500 // TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits().
8501 // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device.
8502 destMat->numImportPacketsPerLID_.modify_host(); //FIXME
8503
8504# ifdef HAVE_TPETRA_MMM_TIMINGS
8505 RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data"))));
8506# endif
8507 ArrayRCP<size_t> CSR_rowptr;
8508 ArrayRCP<GO> CSR_colind_GID;
8509 ArrayRCP<LO> CSR_colind_LID;
8510 ArrayRCP<Scalar> CSR_vals;
8511
8512 destMat->imports_.sync_device ();
8513 destMat->numImportPacketsPerLID_.sync_device ();
8514
8515 size_t N = BaseRowMap->getLocalNumElements ();
8516
8517 auto RemoteLIDs_d = RemoteLIDs.view_device();
8518 auto PermuteToLIDs_d = PermuteToLIDs.view_device();
8519 auto PermuteFromLIDs_d = PermuteFromLIDs.view_device();
8520
8522 *this,
8523 RemoteLIDs_d,
8524 destMat->imports_.view_device(), //hostImports
8525 destMat->numImportPacketsPerLID_.view_device(), //numImportPacketsPerLID
8526 NumSameIDs,
8527 PermuteToLIDs_d,
8528 PermuteFromLIDs_d,
8529 N,
8530 MyPID,
8531 CSR_rowptr,
8532 CSR_colind_GID,
8533 CSR_vals,
8534 SourcePids(),
8535 TargetPids);
8536
8537 // If LO and GO are the same, we can reuse memory when
8538 // converting the column indices from global to local indices.
8539 if (typeid (LO) == typeid (GO)) {
8540 CSR_colind_LID = Teuchos::arcp_reinterpret_cast<LO> (CSR_colind_GID);
8541 }
8542 else {
8543 CSR_colind_LID.resize (CSR_colind_GID.size());
8544 }
8545 CSR_colind_LID.resize (CSR_colind_GID.size());
8546
8547 // On return from unpackAndCombineIntoCrsArrays TargetPids[i] == -1 for locally
8548 // owned entries. Convert them to the actual PID.
8549 // JHU FIXME This can be done within unpackAndCombineIntoCrsArrays with a parallel_for.
8550 for(size_t i=0; i<static_cast<size_t>(TargetPids.size()); i++)
8551 {
8552 if(TargetPids[i] == -1) TargetPids[i] = MyPID;
8553 }
8554#ifdef HAVE_TPETRA_MMM_TIMINGS
8555 tmCopySPRdata = Teuchos::null;
8556#endif
8557 /**************************************************************/
8558 /**** 4) Call Optimized MakeColMap w/ no Directory Lookups ****/
8559 /**************************************************************/
8560 // Call an optimized version of makeColMap that avoids the
8561 // Directory lookups (since the Import object knows who owns all
8562 // the GIDs).
8563 if (verbose) {
8564 std::ostringstream os;
8565 os << *verbosePrefix << "Calling lowCommunicationMakeColMapAndReindex"
8566 << std::endl;
8567 std::cerr << os.str ();
8568 }
8569 {
8570#ifdef HAVE_TPETRA_MMM_TIMINGS
8571 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC makeColMap")));
8572#endif
8573 Import_Util::lowCommunicationMakeColMapAndReindexSerial(CSR_rowptr (),
8574 CSR_colind_LID (),
8575 CSR_colind_GID (),
8576 BaseDomainMap,
8577 TargetPids,
8578 RemotePids,
8579 MyColMap);
8580 }
8581
8582 if (verbose) {
8583 std::ostringstream os;
8584 os << *verbosePrefix << "restrictComm="
8585 << (restrictComm ? "true" : "false") << std::endl;
8586 std::cerr << os.str ();
8587 }
8588
8589 /*******************************************************/
8590 /**** 4) Second communicator restriction phase ****/
8591 /*******************************************************/
8592 {
8593#ifdef HAVE_TPETRA_MMM_TIMINGS
8594 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrict colmap")));
8595#endif
8596 if (restrictComm) {
8597 ReducedColMap = (MyRowMap.getRawPtr () == MyColMap.getRawPtr ()) ?
8598 ReducedRowMap :
8599 MyColMap->replaceCommWithSubset (ReducedComm);
8600 MyColMap = ReducedColMap; // Reset the "my" maps
8601 }
8602
8603 // Replace the col map
8604 if (verbose) {
8605 std::ostringstream os;
8606 os << *verbosePrefix << "Calling replaceColMap" << std::endl;
8607 std::cerr << os.str ();
8608 }
8609 destMat->replaceColMap (MyColMap);
8610
8611 // Short circuit if the processor is no longer in the communicator
8612 //
8613 // NOTE: Epetra modifies all "removed" processes so they
8614 // have a dummy (serial) Map that doesn't touch the original
8615 // communicator. Duplicating that here might be a good idea.
8616 if (ReducedComm.is_null ()) {
8617 if (verbose) {
8618 std::ostringstream os;
8619 os << *verbosePrefix << "I am no longer in the communicator; "
8620 "returning" << std::endl;
8621 std::cerr << os.str ();
8622 }
8623 return;
8624 }
8625 }
8626
8627 /***************************************************/
8628 /**** 5) Sort ****/
8629 /***************************************************/
8630 if ((! reverseMode && xferAsImport != nullptr) ||
8631 (reverseMode && xferAsExport != nullptr)) {
8632 if (verbose) {
8633 std::ostringstream os;
8634 os << *verbosePrefix << "Calling sortCrsEntries" << endl;
8635 std::cerr << os.str ();
8636 }
8637#ifdef HAVE_TPETRA_MMM_TIMINGS
8638 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortCrsEntries")));
8639#endif
8640 Import_Util::sortCrsEntries (CSR_rowptr(),
8641 CSR_colind_LID(),
8642 CSR_vals());
8643 }
8644 else if ((! reverseMode && xferAsExport != nullptr) ||
8645 (reverseMode && xferAsImport != nullptr)) {
8646 if (verbose) {
8647 std::ostringstream os;
8648 os << *verbosePrefix << "Calling sortAndMergeCrsEntries"
8649 << endl;
8650 std::cerr << os.str();
8651 }
8652#ifdef HAVE_TPETRA_MMM_TIMINGS
8653 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortAndMergeCrsEntries")));
8654#endif
8656 CSR_colind_LID(),
8657 CSR_vals());
8658 if (CSR_rowptr[N] != static_cast<size_t>(CSR_vals.size())) {
8659 CSR_colind_LID.resize (CSR_rowptr[N]);
8660 CSR_vals.resize (CSR_rowptr[N]);
8661 }
8662 }
8663 else {
8664 TEUCHOS_TEST_FOR_EXCEPTION(
8665 true, std::logic_error, "Tpetra::CrsMatrix::"
8666 "transferAndFillComplete: Should never get here! "
8667 "Please report this bug to a Tpetra developer.");
8668 }
8669 /***************************************************/
8670 /**** 6) Reset the colmap and the arrays ****/
8671 /***************************************************/
8672
8673 if (verbose) {
8674 std::ostringstream os;
8675 os << *verbosePrefix << "Calling destMat->setAllValues" << endl;
8676 std::cerr << os.str ();
8677 }
8678
8679 // Call constructor for the new matrix (restricted as needed)
8680 //
8681 // NOTE (mfh 15 May 2014) This should work fine for the Kokkos
8682 // refactor version of CrsMatrix, though it reserves the right to
8683 // make a deep copy of the arrays.
8684 {
8685#ifdef HAVE_TPETRA_MMM_TIMINGS
8686 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC setAllValues")));
8687#endif
8688 destMat->setAllValues (CSR_rowptr, CSR_colind_LID, CSR_vals);
8689 }
8690
8691 } else {
8692 // run on device
8693
8694
8695 // Backwards compatibility measure. We'll use this again below.
8696
8697 // TODO JHU Need to track down why numImportPacketsPerLID_ has not been correctly marked as modified on host (which it has been)
8698 // TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits().
8699 // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device.
8700 destMat->numImportPacketsPerLID_.modify_host(); //FIXME
8701
8702# ifdef HAVE_TPETRA_MMM_TIMINGS
8703 RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data"))));
8704# endif
8705 ArrayRCP<size_t> CSR_rowptr;
8706 ArrayRCP<GO> CSR_colind_GID;
8707 ArrayRCP<LO> CSR_colind_LID;
8708 ArrayRCP<Scalar> CSR_vals;
8709
8710 destMat->imports_.sync_device ();
8711 destMat->numImportPacketsPerLID_.sync_device ();
8712
8713 size_t N = BaseRowMap->getLocalNumElements ();
8714
8715 auto RemoteLIDs_d = RemoteLIDs.view_device();
8716 auto PermuteToLIDs_d = PermuteToLIDs.view_device();
8717 auto PermuteFromLIDs_d = PermuteFromLIDs.view_device();
8718
8719 Kokkos::View<size_t*,device_type> CSR_rowptr_d;
8720 Kokkos::View<GO*,device_type> CSR_colind_GID_d;
8721 Kokkos::View<LO*,device_type> CSR_colind_LID_d;
8722 Kokkos::View<impl_scalar_type*,device_type> CSR_vals_d;
8723 Kokkos::View<int*,device_type> TargetPids_d;
8724
8726 *this,
8727 RemoteLIDs_d,
8728 destMat->imports_.view_device(), //hostImports
8729 destMat->numImportPacketsPerLID_.view_device(), //numImportPacketsPerLID
8730 NumSameIDs,
8731 PermuteToLIDs_d,
8732 PermuteFromLIDs_d,
8733 N,
8734 MyPID,
8735 CSR_rowptr_d,
8736 CSR_colind_GID_d,
8737 CSR_vals_d,
8738 SourcePids(),
8739 TargetPids_d);
8740
8741 Kokkos::resize (CSR_colind_LID_d, CSR_colind_GID_d.size());
8742
8743#ifdef HAVE_TPETRA_MMM_TIMINGS
8744 tmCopySPRdata = Teuchos::null;
8745#endif
8746 /**************************************************************/
8747 /**** 4) Call Optimized MakeColMap w/ no Directory Lookups ****/
8748 /**************************************************************/
8749 // Call an optimized version of makeColMap that avoids the
8750 // Directory lookups (since the Import object knows who owns all
8751 // the GIDs).
8752 if (verbose) {
8753 std::ostringstream os;
8754 os << *verbosePrefix << "Calling lowCommunicationMakeColMapAndReindex"
8755 << std::endl;
8756 std::cerr << os.str ();
8757 }
8758 {
8759#ifdef HAVE_TPETRA_MMM_TIMINGS
8760 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC makeColMap")));
8761#endif
8763 CSR_colind_LID_d,
8764 CSR_colind_GID_d,
8765 BaseDomainMap,
8766 TargetPids_d,
8767 RemotePids,
8768 MyColMap);
8769 }
8770
8771 if (verbose) {
8772 std::ostringstream os;
8773 os << *verbosePrefix << "restrictComm="
8774 << (restrictComm ? "true" : "false") << std::endl;
8775 std::cerr << os.str ();
8776 }
8777
8778 /*******************************************************/
8779 /**** 4) Second communicator restriction phase ****/
8780 /*******************************************************/
8781 {
8782#ifdef HAVE_TPETRA_MMM_TIMINGS
8783 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrict colmap")));
8784#endif
8785 if (restrictComm) {
8786 ReducedColMap = (MyRowMap.getRawPtr () == MyColMap.getRawPtr ()) ?
8787 ReducedRowMap :
8788 MyColMap->replaceCommWithSubset (ReducedComm);
8789 MyColMap = ReducedColMap; // Reset the "my" maps
8790 }
8791
8792 // Replace the col map
8793 if (verbose) {
8794 std::ostringstream os;
8795 os << *verbosePrefix << "Calling replaceColMap" << std::endl;
8796 std::cerr << os.str ();
8797 }
8798 destMat->replaceColMap (MyColMap);
8799
8800 // Short circuit if the processor is no longer in the communicator
8801 //
8802 // NOTE: Epetra modifies all "removed" processes so they
8803 // have a dummy (serial) Map that doesn't touch the original
8804 // communicator. Duplicating that here might be a good idea.
8805 if (ReducedComm.is_null ()) {
8806 if (verbose) {
8807 std::ostringstream os;
8808 os << *verbosePrefix << "I am no longer in the communicator; "
8809 "returning" << std::endl;
8810 std::cerr << os.str ();
8811 }
8812 return;
8813 }
8814 }
8815
8816 /***************************************************/
8817 /**** 5) Sort ****/
8818 /***************************************************/
8819
8820 if ((! reverseMode && xferAsImport != nullptr) ||
8821 (reverseMode && xferAsExport != nullptr)) {
8822 if (verbose) {
8823 std::ostringstream os;
8824 os << *verbosePrefix << "Calling sortCrsEntries" << endl;
8825 std::cerr << os.str ();
8826 }
8827#ifdef HAVE_TPETRA_MMM_TIMINGS
8828 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortCrsEntries")));
8829#endif
8830 Import_Util::sortCrsEntries (CSR_rowptr_d,
8831 CSR_colind_LID_d,
8832 CSR_vals_d);
8833 }
8834 else if ((! reverseMode && xferAsExport != nullptr) ||
8835 (reverseMode && xferAsImport != nullptr)) {
8836 if (verbose) {
8837 std::ostringstream os;
8838 os << *verbosePrefix << "Calling sortAndMergeCrsEntries"
8839 << endl;
8840 std::cerr << os.str();
8841 }
8842#ifdef HAVE_TPETRA_MMM_TIMINGS
8843 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortAndMergeCrsEntries")));
8844#endif
8846 CSR_colind_LID_d,
8847 CSR_vals_d);
8848 }
8849 else {
8850 TEUCHOS_TEST_FOR_EXCEPTION(
8851 true, std::logic_error, "Tpetra::CrsMatrix::"
8852 "transferAndFillComplete: Should never get here! "
8853 "Please report this bug to a Tpetra developer.");
8854 }
8855
8856 /***************************************************/
8857 /**** 6) Reset the colmap and the arrays ****/
8858 /***************************************************/
8859
8860 if (verbose) {
8861 std::ostringstream os;
8862 os << *verbosePrefix << "Calling destMat->setAllValues" << endl;
8863 std::cerr << os.str ();
8864 }
8865
8866 {
8867#ifdef HAVE_TPETRA_MMM_TIMINGS
8868 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC setAllValues")));
8869#endif
8870 destMat->setAllValues (CSR_rowptr_d, CSR_colind_LID_d, CSR_vals_d);
8871 }
8872
8873 } //if (runOnHost) .. else ..
8874
8875 /***************************************************/
8876 /**** 7) Build Importer & Call ESFC ****/
8877 /***************************************************/
8878#ifdef HAVE_TPETRA_MMM_TIMINGS
8879 RCP<TimeMonitor> tmIESFC = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC build importer and esfc"))));
8880#endif
8881 // Pre-build the importer using the existing PIDs
8882 Teuchos::ParameterList esfc_params;
8883
8884 RCP<import_type> MyImport;
8885
8886 // Fulfill the non-blocking allreduce on reduced_mismatch.
8887 if (iallreduceRequest.get () != nullptr) {
8888 if (verbose) {
8889 std::ostringstream os;
8890 os << *verbosePrefix << "Calling iallreduceRequest->wait()"
8891 << endl;
8892 std::cerr << os.str ();
8893 }
8894 iallreduceRequest->wait ();
8895 if (reduced_mismatch != 0) {
8896 isMM = false;
8897 }
8898 }
8899
8900 if( isMM ) {
8901#ifdef HAVE_TPETRA_MMM_TIMINGS
8902 Teuchos::TimeMonitor MMisMM (*TimeMonitor::getNewTimer(prefix + std::string("isMM Block")));
8903#endif
8904 // Combine all type1/2/3 lists, [filter them], then call the expert import constructor.
8905
8906 if (verbose) {
8907 std::ostringstream os;
8908 os << *verbosePrefix << "Getting CRS pointers" << endl;
8909 std::cerr << os.str ();
8910 }
8911
8912 Teuchos::ArrayRCP<LocalOrdinal> type3LIDs;
8913 Teuchos::ArrayRCP<int> type3PIDs;
8914 auto rowptr = getCrsGraph()->getLocalRowPtrsHost();
8915 auto colind = getCrsGraph()->getLocalIndicesHost();
8916
8917 if (verbose) {
8918 std::ostringstream os;
8919 os << *verbosePrefix << "Calling reverseNeighborDiscovery" << std::endl;
8920 std::cerr << os.str ();
8921 }
8922
8923 {
8924#ifdef HAVE_TPETRA_MMM_TIMINGS
8925 TimeMonitor tm_rnd (*TimeMonitor::getNewTimer(prefix + std::string("isMMrevNeighDis")));
8926#endif
8927 Import_Util::reverseNeighborDiscovery(*this,
8928 rowptr,
8929 colind,
8930 rowTransfer,
8931 MyImporter,
8932 MyDomainMap,
8933 type3PIDs,
8934 type3LIDs,
8935 ReducedComm);
8936 }
8937
8938 if (verbose) {
8939 std::ostringstream os;
8940 os << *verbosePrefix << "Done with reverseNeighborDiscovery" << std::endl;
8941 std::cerr << os.str ();
8942 }
8943
8944 Teuchos::ArrayView<const int> EPID1 = MyImporter.is_null() ? Teuchos::ArrayView<const int>() : MyImporter->getExportPIDs();
8945 Teuchos::ArrayView<const LO> ELID1 = MyImporter.is_null() ? Teuchos::ArrayView<const LO>() : MyImporter->getExportLIDs();
8946
8947 Teuchos::ArrayView<const int> TEPID2 = rowTransfer.getExportPIDs(); // row matrix
8948 Teuchos::ArrayView<const LO> TELID2 = rowTransfer.getExportLIDs();
8949
8950 const int numCols = getGraph()->getColMap()->getLocalNumElements(); // may be dup
8951 // from EpetraExt_MMHelpers.cpp: build_type2_exports
8952 std::vector<bool> IsOwned(numCols,true);
8953 std::vector<int> SentTo(numCols,-1);
8954 if (! MyImporter.is_null ()) {
8955 for (auto && rlid : MyImporter->getRemoteLIDs()) { // the remoteLIDs must be from sourcematrix
8956 IsOwned[rlid]=false;
8957 }
8958 }
8959
8960 std::vector<std::pair<int,GO> > usrtg;
8961 usrtg.reserve(TEPID2.size());
8962
8963 {
8964 const auto& colMap = * (this->getColMap ()); // *this is sourcematrix
8965 for (Array_size_type i = 0; i < TEPID2.size (); ++i) {
8966 const LO row = TELID2[i];
8967 const int pid = TEPID2[i];
8968 for (auto j = rowptr[row]; j < rowptr[row+1]; ++j) {
8969 const int col = colind[j];
8970 if (IsOwned[col] && SentTo[col] != pid) {
8971 SentTo[col] = pid;
8972 GO gid = colMap.getGlobalElement (col);
8973 usrtg.push_back (std::pair<int,GO> (pid, gid));
8974 }
8975 }
8976 }
8977 }
8978
8979// This sort can _not_ be omitted.
8980 std::sort(usrtg.begin(),usrtg.end()); // default comparator does the right thing, now sorted in gid order
8981 auto eopg = std ::unique(usrtg.begin(),usrtg.end());
8982 // 25 Jul 2018: Could just ignore the entries at and after eopg.
8983 usrtg.erase(eopg,usrtg.end());
8984
8985 const Array_size_type type2_us_size = usrtg.size();
8986 Teuchos::ArrayRCP<int> EPID2=Teuchos::arcp(new int[type2_us_size],0,type2_us_size,true);
8987 Teuchos::ArrayRCP< LO> ELID2=Teuchos::arcp(new LO[type2_us_size],0,type2_us_size,true);
8988
8989 int pos=0;
8990 for(auto && p : usrtg) {
8991 EPID2[pos]= p.first;
8992 ELID2[pos]= this->getDomainMap()->getLocalElement(p.second);
8993 pos++;
8994 }
8995
8996 Teuchos::ArrayView<int> EPID3 = type3PIDs();
8997 Teuchos::ArrayView< LO> ELID3 = type3LIDs();
8998 GO InfGID = std::numeric_limits<GO>::max();
8999 int InfPID = INT_MAX;
9000#ifdef TPETRA_MIN3
9001# undef TPETRA_MIN3
9002#endif // TPETRA_MIN3
9003#define TPETRA_MIN3(x,y,z) ((x)<(y)?(std::min(x,z)):(std::min(y,z)))
9004 int i1=0, i2=0, i3=0;
9005 int Len1 = EPID1.size();
9006 int Len2 = EPID2.size();
9007 int Len3 = EPID3.size();
9008
9009 int MyLen=Len1+Len2+Len3;
9010 Teuchos::ArrayRCP<LO> userExportLIDs = Teuchos::arcp(new LO[MyLen],0,MyLen,true);
9011 Teuchos::ArrayRCP<int> userExportPIDs = Teuchos::arcp(new int[MyLen],0,MyLen,true);
9012 int iloc = 0; // will be the size of the userExportLID/PIDs
9013
9014 while(i1 < Len1 || i2 < Len2 || i3 < Len3){
9015 int PID1 = (i1<Len1)?(EPID1[i1]):InfPID;
9016 int PID2 = (i2<Len2)?(EPID2[i2]):InfPID;
9017 int PID3 = (i3<Len3)?(EPID3[i3]):InfPID;
9018
9019 GO GID1 = (i1<Len1)?getDomainMap()->getGlobalElement(ELID1[i1]):InfGID;
9020 GO GID2 = (i2<Len2)?getDomainMap()->getGlobalElement(ELID2[i2]):InfGID;
9021 GO GID3 = (i3<Len3)?getDomainMap()->getGlobalElement(ELID3[i3]):InfGID;
9022
9023 int MIN_PID = TPETRA_MIN3(PID1,PID2,PID3);
9024 GO MIN_GID = TPETRA_MIN3( ((PID1==MIN_PID)?GID1:InfGID), ((PID2==MIN_PID)?GID2:InfGID), ((PID3==MIN_PID)?GID3:InfGID));
9025#ifdef TPETRA_MIN3
9026# undef TPETRA_MIN3
9027#endif // TPETRA_MIN3
9028 bool added_entry=false;
9029
9030 if(PID1 == MIN_PID && GID1 == MIN_GID){
9031 userExportLIDs[iloc]=ELID1[i1];
9032 userExportPIDs[iloc]=EPID1[i1];
9033 i1++;
9034 added_entry=true;
9035 iloc++;
9036 }
9037 if(PID2 == MIN_PID && GID2 == MIN_GID){
9038 if(!added_entry) {
9039 userExportLIDs[iloc]=ELID2[i2];
9040 userExportPIDs[iloc]=EPID2[i2];
9041 added_entry=true;
9042 iloc++;
9043 }
9044 i2++;
9045 }
9046 if(PID3 == MIN_PID && GID3 == MIN_GID){
9047 if(!added_entry) {
9048 userExportLIDs[iloc]=ELID3[i3];
9049 userExportPIDs[iloc]=EPID3[i3];
9050 iloc++;
9051 }
9052 i3++;
9053 }
9054 }
9055
9056 if (verbose) {
9057 std::ostringstream os;
9058 os << *verbosePrefix << "Create Import" << std::endl;
9059 std::cerr << os.str ();
9060 }
9061
9062#ifdef HAVE_TPETRA_MMM_TIMINGS
9063 auto ismmIctor(*TimeMonitor::getNewTimer(prefix + std::string("isMMIportCtor")));
9064#endif
9065 Teuchos::RCP<Teuchos::ParameterList> plist = rcp(new Teuchos::ParameterList());
9066 // 25 Jul 2018: Test for equality with the non-isMM path's Import object.
9067 if ((MyDomainMap != MyColMap) && (!MyDomainMap->isSameAs(*MyColMap)))
9068 MyImport = rcp ( new import_type (MyDomainMap,
9069 MyColMap,
9070 RemotePids,
9071 userExportLIDs.view(0,iloc).getConst(),
9072 userExportPIDs.view(0,iloc).getConst(),
9073 plist)
9074 );
9075
9076 if (verbose) {
9077 std::ostringstream os;
9078 os << *verbosePrefix << "Call expertStaticFillComplete" << std::endl;
9079 std::cerr << os.str ();
9080 }
9081
9082 {
9083#ifdef HAVE_TPETRA_MMM_TIMINGS
9084 TimeMonitor esfc (*TimeMonitor::getNewTimer(prefix + std::string("isMM::destMat->eSFC")));
9085 esfc_params.set("Timer Label",label+std::string("isMM eSFC"));
9086#endif
9087 if(!params.is_null())
9088 esfc_params.set("compute global constants",params->get("compute global constants",true));
9089 destMat->expertStaticFillComplete (MyDomainMap, MyRangeMap, MyImport,Teuchos::null,rcp(new Teuchos::ParameterList(esfc_params)));
9090
9091 }
9092
9093 } // if(isMM)
9094 else {
9095#ifdef HAVE_TPETRA_MMM_TIMINGS
9096 TimeMonitor MMnotMMblock (*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMblock")));
9097#endif
9098 if (verbose) {
9099 std::ostringstream os;
9100 os << *verbosePrefix << "Create Import" << std::endl;
9101 std::cerr << os.str ();
9102 }
9103
9104#ifdef HAVE_TPETRA_MMM_TIMINGS
9105 TimeMonitor notMMIcTor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMCreateImporter")));
9106#endif
9107 Teuchos::RCP<Teuchos::ParameterList> mypars = rcp(new Teuchos::ParameterList);
9108 mypars->set("Timer Label","notMMFrom_tAFC");
9109 if ((MyDomainMap != MyColMap) && (!MyDomainMap->isSameAs(*MyColMap)))
9110 MyImport = rcp (new import_type (MyDomainMap, MyColMap, RemotePids, mypars));
9111
9112 if (verbose) {
9113 std::ostringstream os;
9114 os << *verbosePrefix << "Call expertStaticFillComplete" << endl;
9115 std::cerr << os.str ();
9116 }
9117
9118#ifdef HAVE_TPETRA_MMM_TIMINGS
9119 TimeMonitor esfcnotmm(*TimeMonitor::getNewTimer(prefix + std::string("notMMdestMat->expertStaticFillComplete")));
9120 esfc_params.set("Timer Label",prefix+std::string("notMM eSFC"));
9121#else
9122 esfc_params.set("Timer Label",std::string("notMM eSFC"));
9123#endif
9124
9125 if (!params.is_null ()) {
9126 esfc_params.set ("compute global constants",
9127 params->get ("compute global constants", true));
9128 }
9129 destMat->expertStaticFillComplete (MyDomainMap, MyRangeMap,
9130 MyImport, Teuchos::null,
9131 rcp (new Teuchos::ParameterList (esfc_params)));
9132 }
9133
9134#ifdef HAVE_TPETRA_MMM_TIMINGS
9135 tmIESFC = Teuchos::null;
9136#endif
9137
9138 if (verbose) {
9139 std::ostringstream os;
9140 os << *verbosePrefix << "Done" << endl;
9141 std::cerr << os.str ();
9142 }
9143 } //transferAndFillComplete
9144
9145
// Import from this matrix into destMatrix using a single (row) importer, and
// make the result fill complete.
//
// This overload is a thin forwarder: all of the work is done by
// transferAndFillComplete.
//
// destMatrix - destination matrix handle, passed by non-const reference.
// importer   - Import object describing the row transfer.
// domainMap  - domain Map forwarded unchanged to transferAndFillComplete.
// rangeMap   - range Map forwarded unchanged to transferAndFillComplete.
// params     - parameter list forwarded unchanged to transferAndFillComplete.
//
// NOTE(review): the listing numbering jumps from 9147 to 9149 and the
// function is const-qualified, so a class-qualifier line
// (presumably `CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::`)
// appears to be missing from this rendering - confirm against the real file.
9146 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9147 void
9149 importAndFillComplete (Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& destMatrix,
9150 const import_type& importer,
9151 const Teuchos::RCP<const map_type>& domainMap,
9152 const Teuchos::RCP<const map_type>& rangeMap,
9153 const Teuchos::RCP<Teuchos::ParameterList>& params) const
9154 {
// The third argument is the optional domain-transfer object; this overload
// has none, so Teuchos::null is passed in that slot.
9155 transferAndFillComplete (destMatrix, importer, Teuchos::null, domainMap, rangeMap, params);
9156 }
9157
// Import from this matrix into destMatrix using separate row and domain
// importers, and make the result fill complete.
//
// This overload is a thin forwarder: all of the work is done by
// transferAndFillComplete.
//
// destMatrix     - destination matrix handle, passed by non-const reference.
// rowImporter    - Import object describing the row transfer.
// domainImporter - Import object passed along as the domain-transfer argument.
// domainMap      - domain Map forwarded unchanged.
// rangeMap       - range Map forwarded unchanged.
// params         - parameter list forwarded unchanged.
//
// NOTE(review): the listing numbering jumps from 9159 to 9161, so a
// class-qualifier line appears to be missing from this rendering - confirm
// against the real file.
9158 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9159 void
9161 importAndFillComplete (Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& destMatrix,
9162 const import_type& rowImporter,
9163 const import_type& domainImporter,
9164 const Teuchos::RCP<const map_type>& domainMap,
9165 const Teuchos::RCP<const map_type>& rangeMap,
9166 const Teuchos::RCP<Teuchos::ParameterList>& params) const
9167 {
// domainImporter arrives by reference but the callee takes an RCP in the
// third slot, so it is wrapped with Teuchos::rcpFromRef.
// NOTE(review): presumably a non-owning RCP, so domainImporter must outlive
// the call - confirm against the Teuchos documentation.
9168 transferAndFillComplete (destMatrix, rowImporter, Teuchos::rcpFromRef(domainImporter), domainMap, rangeMap, params);
9169 }
9170
// Export from this matrix into destMatrix using a single (row) exporter, and
// make the result fill complete.
//
// This overload is a thin forwarder: all of the work is done by
// transferAndFillComplete.
//
// destMatrix - destination matrix handle, passed by non-const reference.
// exporter   - Export object describing the row transfer.
// domainMap  - domain Map forwarded unchanged to transferAndFillComplete.
// rangeMap   - range Map forwarded unchanged to transferAndFillComplete.
// params     - parameter list forwarded unchanged to transferAndFillComplete.
//
// NOTE(review): the listing numbering jumps from 9172 to 9174, so a
// class-qualifier line appears to be missing from this rendering - confirm
// against the real file.
9171 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9172 void
9174 exportAndFillComplete (Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& destMatrix,
9175 const export_type& exporter,
9176 const Teuchos::RCP<const map_type>& domainMap,
9177 const Teuchos::RCP<const map_type>& rangeMap,
9178 const Teuchos::RCP<Teuchos::ParameterList>& params) const
9179 {
// The third argument is the optional domain-transfer object; this overload
// has none, so Teuchos::null is passed in that slot.
9180 transferAndFillComplete (destMatrix, exporter, Teuchos::null, domainMap, rangeMap, params);
9181 }
9182
// Export from this matrix into destMatrix using separate row and domain
// exporters, and make the result fill complete.
//
// This overload is a thin forwarder: all of the work is done by
// transferAndFillComplete.
//
// destMatrix     - destination matrix handle, passed by non-const reference.
// rowExporter    - Export object describing the row transfer.
// domainExporter - Export object passed along as the domain-transfer argument.
// domainMap      - domain Map forwarded unchanged.
// rangeMap       - range Map forwarded unchanged.
// params         - parameter list forwarded unchanged.
//
// NOTE(review): the listing numbering jumps from 9184 to 9186, so a
// class-qualifier line appears to be missing from this rendering - confirm
// against the real file.
9183 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9184 void
9186 exportAndFillComplete (Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& destMatrix,
9187 const export_type& rowExporter,
9188 const export_type& domainExporter,
9189 const Teuchos::RCP<const map_type>& domainMap,
9190 const Teuchos::RCP<const map_type>& rangeMap,
9191 const Teuchos::RCP<Teuchos::ParameterList>& params) const
9192 {
// domainExporter arrives by reference but the callee takes an RCP in the
// third slot, so it is wrapped with Teuchos::rcpFromRef.
// NOTE(review): presumably a non-owning RCP, so domainExporter must outlive
// the call - confirm against the Teuchos documentation.
9193 transferAndFillComplete (destMatrix, rowExporter, Teuchos::rcpFromRef(domainExporter), domainMap, rangeMap, params);
9194 }
9195
9196} // namespace Tpetra
9197
9198//
9199// Explicit instantiation macro
9200//
9201// Must be expanded from within the Tpetra namespace!
9202//
9203
// Explicitly instantiates the CrsMatrix class template for one combination
// of template arguments.
//
// SCALAR, LO, GO, NODE - the four template arguments, substituted in that
//                        order into CrsMatrix< ... >.
//
// CrsMatrix is named unqualified, so the macro has to be expanded inside
// the Tpetra namespace.
9204#define TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR,LO,GO,NODE) \
9205 \
9206 template class CrsMatrix< SCALAR , LO , GO , NODE >;
9207
// Explicitly instantiates the member function template convert< SO >() of
// CrsMatrix< SI, LO, GO, NODE >.
//
// SO           - scalar type of the returned matrix (the template argument
//                of convert and the scalar type inside the returned RCP).
// SI           - scalar type of the matrix that convert is called on.
// LO, GO, NODE - remaining template arguments, shared by both matrix types.
//
// CrsMatrix is named unqualified, so the macro has to be expanded inside
// the Tpetra namespace.
9208#define TPETRA_CRSMATRIX_CONVERT_INSTANT(SO,SI,LO,GO,NODE) \
9209 \
9210 template Teuchos::RCP< CrsMatrix< SO , LO , GO , NODE > > \
9211 CrsMatrix< SI , LO , GO , NODE >::convert< SO > () const;
9212
// Emits a `template<>` declaration of the nonmember function
// importAndFillCompleteCrsMatrix for CrsMatrix<SCALAR, LO, GO, NODE>, in
// the single-importer form: (sourceMatrix, importer, domainMap, rangeMap,
// params), returning an RCP to a CrsMatrix of the same type.
//
// The Import and Map parameter types are spelled through the matrix's
// nested typedefs local_ordinal_type, global_ordinal_type and node_type
// rather than through LO, GO and NODE directly.
//
// NOTE(review): `template<>` followed by a declaration reads as an explicit
// specialization declaration, not an explicit instantiation (which would
// start with plain `template`) - confirm this is intended.
9213#define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9214 template<> \
9215 Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9216 importAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9217 const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9218 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9219 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& importer, \
9220 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9221 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9222 CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9223 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9224 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9225 CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9226 const Teuchos::RCP<Teuchos::ParameterList>& params);
9227
// Emits a `template<>` declaration of the nonmember function
// importAndFillCompleteCrsMatrix for CrsMatrix<SCALAR, LO, GO, NODE>, in
// the two-importer form: (sourceMatrix, rowImporter, domainImporter,
// domainMap, rangeMap, params), returning an RCP to a CrsMatrix of the
// same type.
//
// The Import and Map parameter types are spelled through the matrix's
// nested typedefs local_ordinal_type, global_ordinal_type and node_type
// rather than through LO, GO and NODE directly.
//
// NOTE(review): `template<>` followed by a declaration reads as an explicit
// specialization declaration, not an explicit instantiation (which would
// start with plain `template`) - confirm this is intended.
9228#define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9229 template<> \
9230 Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9231 importAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9232 const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9233 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9234 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowImporter, \
9235 const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9236 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9237 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainImporter, \
9238 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9239 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9240 CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9241 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9242 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9243 CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9244 const Teuchos::RCP<Teuchos::ParameterList>& params);
9245
9246
// Emits a `template<>` declaration of the nonmember function
// exportAndFillCompleteCrsMatrix for CrsMatrix<SCALAR, LO, GO, NODE>, in
// the single-exporter form: (sourceMatrix, exporter, domainMap, rangeMap,
// params), returning an RCP to a CrsMatrix of the same type.
//
// The Export and Map parameter types are spelled through the matrix's
// nested typedefs local_ordinal_type, global_ordinal_type and node_type
// rather than through LO, GO and NODE directly.
//
// NOTE(review): `template<>` followed by a declaration reads as an explicit
// specialization declaration, not an explicit instantiation (which would
// start with plain `template`) - confirm this is intended.
9247#define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9248 template<> \
9249 Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9250 exportAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9251 const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9252 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9253 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& exporter, \
9254 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9255 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9256 CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9257 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9258 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9259 CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9260 const Teuchos::RCP<Teuchos::ParameterList>& params);
9261
// Emits a `template<>` declaration of the nonmember function
// exportAndFillCompleteCrsMatrix for CrsMatrix<SCALAR, LO, GO, NODE>, in
// the two-exporter form: (sourceMatrix, rowExporter, domainExporter,
// domainMap, rangeMap, params), returning an RCP to a CrsMatrix of the
// same type.
//
// The Export and Map parameter types are spelled through the matrix's
// nested typedefs local_ordinal_type, global_ordinal_type and node_type
// rather than through LO, GO and NODE directly.
//
// NOTE(review): `template<>` followed by a declaration reads as an explicit
// specialization declaration, not an explicit instantiation (which would
// start with plain `template`) - confirm this is intended.
9262#define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9263 template<> \
9264 Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9265 exportAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9266 const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9267 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9268 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowExporter, \
9269 const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9270 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9271 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainExporter, \
9272 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9273 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9274 CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9275 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9276 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9277 CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9278 const Teuchos::RCP<Teuchos::ParameterList>& params);
9279
9280
// Umbrella macro for one (SCALAR, LO, GO, NODE) combination. It expands,
// in this order, to:
//   1. TPETRA_CRSMATRIX_MATRIX_INSTANT
//   2. TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT
//   3. TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT
//   4. TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO
//   5. TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO
//
// TPETRA_CRSMATRIX_CONVERT_INSTANT is not included here, since it takes
// two scalar types; it has to be expanded separately.
9281#define TPETRA_CRSMATRIX_INSTANT(SCALAR, LO, GO ,NODE) \
9282 TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR, LO, GO, NODE) \
9283 TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9284 TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9285 TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9286 TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE)
9287
9288#endif // TPETRA_CRSMATRIX_DEF_HPP
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Declare and define Tpetra::Details::copyConvert, an implementation detail of Tpetra (in particular,...
Declare and define Tpetra::Details::copyOffsets, an implementation detail of Tpetra (in particular,...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Functions for manipulating CRS arrays.
Declaration of a function that prints strings from each process.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Declaration of Tpetra::Details::iallreduce.
Declaration and definition of Tpetra::Details::leftScaleLocalCrsMatrix.
KOKKOS_FUNCTION size_t packRow(const LocalMapType &col_map, const Kokkos::View< Packet *, BufferDeviceType > &exports, const InputLidsType &lids_in, const InputPidsType &pids_in, const size_t offset, const size_t num_ent, const bool pack_pids)
Packs a single row of the CrsGraph.
Declaration and definition of Tpetra::Details::rightScaleLocalCrsMatrix.
KOKKOS_FUNCTION int unpackRow(const Kokkos::View< GO *, Device, Kokkos::MemoryUnmanaged > &gids_out, const Kokkos::View< int *, Device, Kokkos::MemoryUnmanaged > &pids_out, const Kokkos::View< const Packet *, BufferDevice > &imports, const size_t offset, const size_t num_ent)
Unpack a single row of a CrsGraph.
Utility functions for packing and unpacking sparse matrix entries.
void lowCommunicationMakeColMapAndReindex(const Teuchos::ArrayView< const size_t > &rowptr, const Teuchos::ArrayView< LocalOrdinal > &colind_LID, const Teuchos::ArrayView< GlobalOrdinal > &colind_GID, const Teuchos::RCP< const Tpetra::Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMapRCP, const Teuchos::ArrayView< const int > &owningPIDs, Teuchos::Array< int > &remotePIDs, Teuchos::RCP< const Tpetra::Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap)
lowCommunicationMakeColMapAndReindex
void sortAndMergeCrsEntries(const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< Ordinal > &CRS_colind, const Teuchos::ArrayView< Scalar > &CRS_vals)
Sort and merge the entries of the (raw CSR) matrix by column index within each row.
void sortCrsEntries(const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< Ordinal > &CRS_colind, const Teuchos::ArrayView< Scalar > &CRS_vals)
Sort the entries of the (raw CSR) matrix by column index within each row.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects.
void getPids(const Tpetra::Import< LocalOrdinal, GlobalOrdinal, Node > &Importer, Teuchos::Array< int > &pids, bool use_minus_one_for_local)
Like getPidGidPairs, but just gets the PIDs, ordered by the column Map.
#define TPETRA_ABUSE_WARNING(throw_exception_test, Exception, msg)
Handle an abuse warning, according to HAVE_TPETRA_THROW_ABUSE_WARNINGS and HAVE_TPETRA_PRINT_ABUSE_WA...
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
void reindexColumns(const Teuchos::RCP< const map_type > &newColMap, const Teuchos::RCP< const import_type > &newImport=Teuchos::null, const bool sortIndicesInEachRow=true)
Reindex the column indices in place, and replace the column Map. Optionally, replace the Import objec...
global_inds_dualv_type::t_host::const_type getGlobalIndsViewHost(const RowInfo &rowinfo) const
Get a const, globally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(myR...
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
local_inds_wdv_type lclIndsUnpacked_wdv
Local ordinals of column indices for all rows Valid when isLocallyIndexed is true If OptimizedStorage...
RowInfo getRowInfoFromGlobalRowIndex(const global_ordinal_type gblRow) const
Get information about the locally owned row with global index gblRow.
size_t findGlobalIndices(const RowInfo &rowInfo, const Teuchos::ArrayView< const global_ordinal_type > &indices, std::function< void(const size_t, const size_t, const size_t)> fun) const
Finds indices in the given row.
num_row_entries_type k_numRowEntries_
The number of local entries in each locally owned row.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
RowInfo getRowInfo(const local_ordinal_type myRow) const
Get information about the locally owned row with local index myRow.
Teuchos::RCP< const map_type > colMap_
The Map describing the distribution of columns of the graph.
bool noRedundancies_
Whether the graph's indices are non-redundant (merged) in each row, on this process.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
Kokkos::View< size_t *, Kokkos::LayoutLeft, device_type >::HostMirror num_row_entries_type
Row offsets for "1-D" storage.
bool isFillComplete() const override
Whether fillComplete() has been called and the graph is in compute mode.
const row_ptrs_host_view_type & getRowPtrsUnpackedHost() const
Get the unpacked row pointers on host. Lazily make a copy from device.
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the range of this graph.
local_inds_dualv_type::t_host::const_type getLocalIndsViewHost(const RowInfo &rowinfo) const
Get a const, locally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(myRo...
Teuchos::RCP< const map_type > getRowMap() const override
Returns the Map that describes the row distribution in this graph.
size_t insertGlobalIndicesImpl(const local_ordinal_type lclRow, const global_ordinal_type inputGblColInds[], const size_t numInputInds)
Insert global indices, using an input local row index.
bool indicesAreSorted_
Whether the graph's indices are sorted in each row, on this process.
local_inds_dualv_type::t_host getLocalIndsViewHostNonConst(const RowInfo &rowinfo)
Get a ReadWrite locally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(m...
Teuchos::RCP< const map_type > rowMap_
The Map describing the distribution of rows of the graph.
bool isGloballyIndexed() const override
Whether the graph's column indices are stored as global indices.
bool isLocallyIndexed() const override
Whether the graph's column indices are stored as local indices.
size_t getLocalNumRows() const override
Returns the number of graph rows owned on the calling node.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
virtual void insertGlobalValuesImpl(crs_graph_type &graph, RowInfo &rowInfo, const GlobalOrdinal gblColInds[], const impl_scalar_type vals[], const size_t numInputEnt)
Common implementation detail of insertGlobalValues and insertGlobalValuesFiltered.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const override
Print this object with the given verbosity level to the given output stream.
std::map< GlobalOrdinal, std::pair< Teuchos::Array< GlobalOrdinal >, Teuchos::Array< Scalar > > > nonlocals_
Nonlocal data added using insertGlobalValues().
void localApply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, const Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar &alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar &beta=Teuchos::ScalarTraits< Scalar >::zero()) const
Compute the local part of a sparse matrix-(Multi)Vector multiply.
ordinal_rowptrs_type ordinalRowptrs
local_ordinal typed version of local matrix's rowptrs. This allows the LocalCrsMatrixOperator to have...
void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< char *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode CM) override
Unpack the imported column indices and values, and combine into matrix.
void replaceRangeMap(const Teuchos::RCP< const map_type > &newRangeMap)
Replace the current range Map with the given objects.
Details::EStorageStatus storageStatus_
Status of the matrix's storage, when not in a fill-complete state.
void applyNonTranspose(const MV &X_in, MV &Y_in, Scalar alpha, Scalar beta) const
Special case of apply() for mode == Teuchos::NO_TRANS.
void importAndFillComplete(Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > &destMatrix, const import_type &importer, const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null) const
Import from this to the given destination matrix, and make the result fill complete.
CrsGraph< LocalOrdinal, GlobalOrdinal, Node > crs_graph_type
The CrsGraph specialization suitable for this CrsMatrix specialization.
local_ordinal_type replaceGlobalValues(const global_ordinal_type globalRow, const Kokkos::View< const global_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals)
Replace one or more entries' values, using global indices.
bool haveGlobalConstants() const
Returns true if globalConstants have been computed; false otherwise.
size_t getGlobalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, over all processes in the matrix's communicator.
void getGlobalRowCopy(GlobalOrdinal GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Fill given arrays with a deep copy of the locally owned entries of the matrix in a given row,...
size_t getNumEntriesInGlobalRow(GlobalOrdinal globalRow) const override
Number of entries in the sparse matrix in the given global row, on the calling (MPI) process.
void scale(const Scalar &alpha)
Scale the matrix's values: this := alpha*this.
GlobalOrdinal global_ordinal_type
The type of each global index in the matrix.
void sortAndMergeIndicesAndValues(const bool sorted, const bool merged)
Sort and merge duplicate local column indices in all rows on the calling process, along with their co...
void packNew(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< char *, buffer_device_type > &exports, const Kokkos::DualView< size_t *, buffer_device_type > &numPacketsPerLID, size_t &constantNumPackets) const
Pack this object's data for an Import or Export.
size_t getLocalNumCols() const override
The number of columns connected to the locally owned rows of this matrix.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
bool hasColMap() const override
Whether the matrix has a well-defined column Map.
Teuchos::RCP< CrsMatrix< T, LocalOrdinal, GlobalOrdinal, Node > > convert() const
Return another CrsMatrix with the same entries, but converted to a different Scalar type T.
values_dualv_type::t_dev getValuesViewDeviceNonConst(const RowInfo &rowinfo)
Get a non-const Device view of the locally owned values row myRow, such that rowinfo = getRowInfo(myR...
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
std::shared_ptr< local_multiply_op_type > getLocalMultiplyOperator() const
The local sparse matrix operator (a wrapper of getLocalMatrixDevice() that supports local matrix-vect...
local_ordinal_type sumIntoLocalValues(const local_ordinal_type localRow, const Kokkos::View< const local_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using local row and column indices.
virtual Teuchos::RCP< RowMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const RowMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params) const override
Implementation of RowMatrix::add: return alpha*A + beta*this.
void applyTranspose(const MV &X_in, MV &Y_in, const Teuchos::ETransp mode, Scalar alpha, Scalar beta) const
Special case of apply() for mode != Teuchos::NO_TRANS.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Number of entries in the sparse matrix in the given local row, on the calling (MPI) process.
Teuchos::RCP< MV > exportMV_
Row Map MultiVector used in apply().
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
bool isFillActive() const
Whether the matrix is not fill complete.
RowMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > row_matrix_type
The RowMatrix representing the base class of CrsMatrix.
void replaceDomainMapAndImporter(const Teuchos::RCP< const map_type > &newDomainMap, Teuchos::RCP< const import_type > &newImporter)
Replace the current domain Map and Import with the given objects.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
void apply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
Compute a sparse matrix-MultiVector multiply.
mag_type getFrobeniusNorm() const override
Compute and return the Frobenius norm of the matrix.
void insertLocalValues(const LocalOrdinal localRow, const Teuchos::ArrayView< const LocalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const CombineMode CM=ADD)
Insert one or more entries into the matrix, using local column indices.
global_size_t getGlobalNumCols() const override
The number of global columns in the matrix.
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
Teuchos::RCP< MV > importMV_
Column Map MultiVector used in apply().
void allocateValues(ELocalGlobal lg, GraphAllocationStatus gas, const bool verbose)
Allocate values (and optionally indices) using the Node.
size_t getLocalNumEntries() const override
The local number of entries in this matrix.
Map< LocalOrdinal, GlobalOrdinal, Node > map_type
The Map specialization suitable for this CrsMatrix specialization.
typename Node::device_type device_type
The Kokkos device type.
bool fillComplete_
Whether the matrix is fill complete.
virtual LocalOrdinal sumIntoGlobalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const GlobalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts, const bool atomic=useAtomicUpdatesByDefault)
Implementation detail of sumIntoGlobalValues.
void replaceDomainMap(const Teuchos::RCP< const map_type > &newDomainMap)
Replace the current domain Map with the given objects.
std::string description() const override
A one-line description of this object.
void reindexColumns(crs_graph_type *const graph, const Teuchos::RCP< const map_type > &newColMap, const Teuchos::RCP< const import_type > &newImport=Teuchos::null, const bool sortEachRow=true)
Reindex the column indices in place, and replace the column Map. Optionally, replace the Import objec...
Teuchos::RCP< MV > getColumnMapMultiVector(const MV &X_domainMap, const bool force=false) const
Create a (or fetch a cached) column Map MultiVector.
void replaceRangeMapAndExporter(const Teuchos::RCP< const map_type > &newRangeMap, Teuchos::RCP< const export_type > &newExporter)
Replace the current Range Map and Export with the given objects.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix's column Map with the given Map.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
void globalAssemble()
Communicate nonlocal contributions to other processes.
void checkInternalState() const
Check that this object's state is sane; throw if it's not.
bool hasTransposeApply() const override
Whether apply() allows applying the transpose or conjugate transpose.
GlobalOrdinal getIndexBase() const override
The index base for global indices for this matrix.
Scalar scalar_type
The type of each entry in the matrix.
void getLocalDiagCopy(Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &diag) const override
Get a copy of the diagonal entries of this matrix, stored in the given Vector diag.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
void fillLocalGraphAndMatrix(const Teuchos::RCP< Teuchos::ParameterList > &params)
Fill data into the local graph and matrix.
Export< LocalOrdinal, GlobalOrdinal, Node > export_type
The Export specialization suitable for this CrsMatrix specialization.
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
void getLocalRowView(LocalOrdinal LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant view of a row of this matrix, using local row and column indices.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
typename Kokkos::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void getGlobalRowView(::Tpetra::Details::DefaultTypes::global_ordinal_type GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
void setAllValues(const typename local_graph_device_type::row_map_type &ptr, const typename local_graph_device_type::entries_type::non_const_type &ind, const typename local_matrix_device_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
virtual void copyAndPermute(const SrcDistObject &source, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap) override
Remove processes owning zero rows from the Maps and their communicator.
virtual LocalOrdinal sumIntoLocalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const LocalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts, const bool atomic=useAtomicUpdatesByDefault)
Implementation detail of sumIntoLocalValues.
void swap(CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &matrix)
Swaps the data from *this with the data and maps from crsMatrix.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
global_size_t getGlobalNumEntries() const override
The global number of entries in this matrix.
virtual LocalOrdinal replaceLocalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const LocalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts)
Implementation detail of replaceLocalValues.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
size_t getLocalNumRows() const override
The number of matrix rows owned by the calling process.
bool isFillComplete() const override
Whether the matrix is fill complete.
virtual bool checkSizes(const SrcDistObject &source) override
Compare the source and target (this) objects for compatibility.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.
local_ordinal_type replaceLocalValues(const local_ordinal_type localRow, const Kokkos::View< const local_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals)
Replace one or more entries' values, using local row and column indices.
void exportAndFillComplete(Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > &destMatrix, const export_type &exporter, const Teuchos::RCP< const map_type > &domainMap=Teuchos::null, const Teuchos::RCP< const map_type > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null) const
Export from this to the given destination matrix, and make the result fill complete.
values_dualv_type::t_host::const_type getValuesViewHost(const RowInfo &rowinfo) const
Get a const Host view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow).
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
Teuchos::RCP< MV > getRowMapMultiVector(const MV &Y_rangeMap, const bool force=false) const
Create a (or fetch a cached) row Map MultiVector.
local_matrix_device_type::values_type::const_type getLocalValuesDevice(Access::ReadOnlyStruct s) const
Get the Kokkos local values on device, read only.
virtual LocalOrdinal replaceGlobalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const GlobalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts)
Implementation detail of replaceGlobalValues.
values_dualv_type::t_host getValuesViewHostNonConst(const RowInfo &rowinfo)
Get a non-const Host view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow...
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
void getLocalDiagOffsets(Teuchos::ArrayRCP< size_t > &offsets) const
Get offsets of the diagonal entries in the matrix.
void fillLocalMatrix(const Teuchos::RCP< Teuchos::ParameterList > &params)
Fill data into the local matrix.
void rightScale(const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &x) override
Scale the matrix on the right with the given Vector.
bool isStorageOptimized() const
Returns true if storage has been optimized.
Import< LocalOrdinal, GlobalOrdinal, Node > import_type
The Import specialization suitable for this CrsMatrix specialization.
void getLocalRowCopy(LocalOrdinal LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Fill given arrays with a deep copy of the locally owned entries of the matrix in a given row,...
values_dualv_type::t_dev::const_type getValuesViewDevice(const RowInfo &rowinfo) const
Get a const Device view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow).
void leftScale(const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &x) override
Scale the matrix on the left with the given Vector.
virtual bool supportsRowViews() const override
Return true if getLocalRowView() and getGlobalRowView() are valid for this object.
static size_t mergeRowIndicesAndValues(size_t rowLen, local_ordinal_type *cols, impl_scalar_type *vals)
Merge duplicate row indices in the given row, along with their corresponding values.
Teuchos::RCP< const crs_graph_type > getCrsGraph() const
This matrix's graph, as a CrsGraph.
Description of Tpetra's behavior.
static bool debug()
Whether Tpetra is in debug mode.
static bool verbose()
Whether Tpetra is in verbose mode.
static size_t verbosePrintCountThreshold()
Number of entries below which arrays, lists, etc. will be printed in debug mode.
static size_t rowImbalanceThreshold()
Threshold for deciding if a local matrix is "imbalanced" in the number of entries per row....
bool isLocallyComplete() const
Is this Export or Import locally complete?
void doImport(const SrcDistObject &source, const Import< LocalOrdinal, GlobalOrdinal, Node > &importer, const CombineMode CM, const bool restrictedMode=false)
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, const CombineMode CM, const bool restrictedMode=false)
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
bool isDistributed() const
Whether this is a globally distributed object.
Sets up and executes a communication plan for a Tpetra DistObject.
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
Accessors for the Teuchos::Comm and Kokkos Node objects.
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
local_map_type getLocalMap() const
Get the LocalMap for Kokkos-Kernels.
One or more distributed dense vectors.
void reduce()
Sum values of a locally replicated multivector across all processes.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
size_t getLocalLength() const
Local number of rows on the calling process.
size_t getNumVectors() const
Number of columns in the multivector.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on device. This requires that there are no live host-space views.
dual_view_type::t_host::const_type getLocalViewHost(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on host. This requires that there are no live device-space views.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getRangeMap() const =0
The Map associated with the range of this operator, which must be compatible with Y....
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getDomainMap() const =0
The Map associated with the domain of this operator, which must be compatible with X....
An abstract interface for graphs accessed by rows.
A read-only, row-oriented interface to a sparse matrix.
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getRowMap() const =0
The Map that describes the distribution of rows over processes.
virtual void getGlobalRowCopy(GlobalOrdinal GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const =0
Get a copy of the given global row's entries.
virtual size_t getNumEntriesInLocalRow(LocalOrdinal localRow) const =0
The current number of entries on the calling process in the specified local row.
Abstract base class for objects that can be the source of an Import or Export operation.
A distributed dense vector.
Implementation details of Tpetra.
Nonmember function that computes the residual R = B - A * X.
void padCrsArrays(const RowPtr &rowPtrBeg, const RowPtr &rowPtrEnd, Indices &indices_wdv, const Padding &padding, const int my_rank, const bool verbose)
Determine if the row pointers and indices arrays need to be resized to accommodate new entries....
void verbosePrintArray(std::ostream &out, const ArrayType &x, const char name[], const size_t maxNumToPrint)
Print min(x.size(), maxNumToPrint) entries of x.
void copyOffsets(const OutputViewType &dst, const InputViewType &src)
Copy row offsets (in a sparse graph or matrix) from src to dst. The offsets may have different types.
void leftScaleLocalCrsMatrix(const LocalSparseMatrixType &A_lcl, const ScalingFactorsViewType &scalingFactors, const bool assumeSymmetric, const bool divide=true)
Left-scale a KokkosSparse::CrsMatrix.
LO getLocalDiagCopyWithoutOffsetsNotFillComplete(::Tpetra::Vector< SC, LO, GO, NT > &diag, const ::Tpetra::RowMatrix< SC, LO, GO, NT > &A, const bool debug=false)
Given a locally indexed, global sparse matrix, extract the matrix's diagonal entries into a Tpetra::Vector.
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
Teuchos::ArrayView< typename DualViewType::t_dev::value_type > getArrayViewFromDualView(const DualViewType &x)
Get a Teuchos::ArrayView which views the host Kokkos::View of the input 1-D Kokkos::DualView.
void copyConvert(const OutputViewType &dst, const InputViewType &src)
Copy values from the 1-D Kokkos::View src, to the 1-D Kokkos::View dst, of the same length....
void packCrsMatrixWithOwningPIDs(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication.
void rightScaleLocalCrsMatrix(const LocalSparseMatrixType &A_lcl, const ScalingFactorsViewType &scalingFactors, const bool assumeSymmetric, const bool divide=true)
Right-scale a KokkosSparse::CrsMatrix.
std::unique_ptr< std::string > createPrefix(const int myRank, const char prefix[])
Create string prefix for each line of verbose output.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
static LocalMapType::local_ordinal_type getDiagCopyWithoutOffsets(const DiagType &D, const LocalMapType &rowMap, const LocalMapType &colMap, const CrsMatrixType &A)
Given a locally indexed, local sparse matrix, and corresponding local row and column Maps,...
void packCrsMatrixNew(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports, const Kokkos::DualView< size_t *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &numPacketsPerLID, const Kokkos::DualView< const LO *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication, for "new" DistObject inter...
void gathervPrint(std::ostream &out, const std::string &s, const Teuchos::Comm< int > &comm)
On Process 0 in the given communicator, print strings from each process in that communicator,...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
void sort2(const IT1 &first1, const IT1 &last1, const IT2 &first2, const bool stableSort=false)
Sort the first array, and apply the resulting permutation to the second array.
Teuchos_Ordinal Array_size_type
Size type for Teuchos Array objects.
size_t global_size_t
Global size_t object.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > createOneToOne(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &M)
Nonmember constructor for a contiguous Map with user-defined weights and a user-specified,...
void merge2(IT1 &indResultOut, IT2 &valResultOut, IT1 indBeg, IT1 indEnd, IT2 valBeg, IT2)
Merge values in place, additively, with the same index.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ ADD_ASSIGN
Accumulate new values into existing values (may not be supported in all classes).
@ INSERT
Insert new values that don't currently exist.
@ ZERO
Replace old values with zero.
Functor for the ABSMAX CombineMode of Import and Export operations.
Scalar operator()(const Scalar &x, const Scalar &y)
Return the maximum of the magnitudes (absolute values) of x and y.
Traits class for packing / unpacking data of type T.
Traits class for allocating a Kokkos::View<T*, D>.
Allocation information for a locally owned row in a CrsGraph or CrsMatrix.