1 #include<DD-AVX_internal.hpp>
2 using namespace ddavx_core;
9 inline reg
set_all(
const std::vector<double>& x,
const std::vector<int>& index,
const size_t i){
11 tmp = set(x[index[i+0]], x[index[i+1]], x[index[i+2]], x[index[i+3]], x[index[i+4]], x[index[i+5]], x[index[i+6]], x[index[i+7]]);
18 std::cerr <<
"error bad vector size" << std::endl;
22 std::cerr <<
"error bad matrix size" << std::endl;
28 #pragma omp parallel for schedule(guided) private(regs)
29 for(
int i=0; i<A.
get_row(); i++){
30 reg y_hi = regs.zeros;
31 reg y_lo = regs.zeros;
34 int ww = A.
row_ptr[i+1] - (SIMD_Length-1);
35 for(j = A.
row_ptr[i]; j < ww; j+=SIMD_Length){
40 reg Areg = load(A.
val[j]);
42 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
51 reg Areg = load(A.
val[j]);
53 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
61 reg Areg = load(A.
val[j]);
62 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
70 reg Areg = load(A.
val[j]);
71 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
79 reg Areg = load(A.
val[j]);
80 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
88 reg Areg = load(A.
val[j]);
90 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
95 reg x_hi = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x.
hi[A.
col_ind[j+1]], x.
hi[A.
col_ind[j+0]]);
96 reg x_lo = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x.
lo[A.
col_ind[j+1]], x.
lo[A.
col_ind[j+0]]);
98 reg Areg = load(A.
val[j]);
99 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
104 reg x_hi = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x.
hi[A.
col_ind[j+0]]);
105 reg x_lo = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x.
lo[A.
col_ind[j+0]]);
107 reg Areg = load(A.
val[j]);
108 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
112 y[i] = reduction(y_hi, y_lo);
118 if(x.size() != y.
size()){
119 std::cerr <<
"error bad vector size" << std::endl;
123 std::cerr <<
"error bad matrix size" << std::endl;
129 #pragma omp parallel for schedule(guided) private(regs)
130 for(
int i=0; i<A.
get_row(); i++){
131 reg y_hi = regs.zeros;
132 reg y_lo = regs.zeros;
135 int ww = A.
row_ptr[i+1] - (SIMD_Length-1);
136 for(j = A.
row_ptr[i]; j < ww; j+=SIMD_Length){
139 reg x_lo = regs.zeros;
141 reg Areg = load(A.
val[j]);
143 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
149 reg x_lo = regs.zeros;
151 reg Areg = load(A.
val[j]);
153 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
159 reg x_lo = regs.zeros;
161 reg Areg = load(A.
val[j]);
162 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
168 reg x_lo = regs.zeros;
170 reg Areg = load(A.
val[j]);
171 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
177 reg x_lo = regs.zeros;
179 reg Areg = load(A.
val[j]);
180 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
186 reg x_lo = regs.zeros;
188 reg Areg = load(A.
val[j]);
190 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
195 reg x_hi = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x[A.
col_ind[j+1]], x[A.
col_ind[j+0]]);
196 reg x_lo = regs.zeros;
198 reg Areg = load(A.
val[j]);
199 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
204 reg x_hi = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x[A.
col_ind[j+0]]);
205 reg x_lo = regs.zeros;
207 reg Areg = load(A.
val[j]);
208 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
212 y[i] = reduction(y_hi, y_lo);
218 if(x.
size() != y.size()){
219 std::cerr <<
"error bad vector size" << std::endl;
223 std::cerr <<
"error bad matrix size" << std::endl;
229 #pragma omp parallel for schedule(guided) private(regs)
230 for(
int i=0; i<A.
get_row(); i++){
231 reg y_hi = regs.zeros;
232 reg y_lo = regs.zeros;
235 int ww = A.
row_ptr[i+1] - (SIMD_Length-1);
236 for(j = A.
row_ptr[i]; j < ww; j+=SIMD_Length){
241 reg Areg = load(A.
val[j]);
243 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
251 reg Areg = load(A.
val[j]);
253 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
261 reg Areg = load(A.
val[j]);
262 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
270 reg Areg = load(A.
val[j]);
271 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
279 reg Areg = load(A.
val[j]);
280 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
288 reg Areg = load(A.
val[j]);
290 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
295 reg x_hi = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x.
hi[A.
col_ind[j+1]], x.
hi[A.
col_ind[j+0]]);
296 reg x_lo = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x.
lo[A.
col_ind[j+1]], x.
lo[A.
col_ind[j+0]]);
298 reg Areg = load(A.
val[j]);
299 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
305 reg x_hi = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x.
hi[A.
col_ind[j+0]]);
306 reg x_lo = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x.
lo[A.
col_ind[j+0]]);
308 reg Areg = load(A.
val[j]);
309 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
313 y.data()[i] = reduction(y_hi, y_lo);
319 if(x.size() != y.size()){
320 std::cerr <<
"error bad vector size" << std::endl;
324 std::cerr <<
"error bad matrix size" << std::endl;
328 #pragma omp parallel for schedule(guided)
329 for(
int i=0; i<A.
get_row(); i++){
333 #pragma omp parallel for schedule(guided)
334 for(
int i=0; i<A.
get_row(); i++){
343 inline reg
set_all(
const std::vector<double>& x,
const std::vector<int>& index,
const int i){
345 tmp = set(x[index[i+0]], x[index[i+1]], x[index[i+2]], x[index[i+3]]);
352 std::cerr <<
"error bad vector size" << std::endl;
356 std::cerr <<
"error bad matrix size" << std::endl;
362 #pragma omp parallel for schedule(guided) private(regs)
363 for(
int i=0; i<A.
get_row(); i++){
364 reg y_hi = regs.zeros;
365 reg y_lo = regs.zeros;
368 int ww = A.
row_ptr[i+1] - (SIMD_Length-1);
369 for(j = A.
row_ptr[i]; j < ww; j+=SIMD_Length){
374 reg Areg = load(A.
val[j]);
376 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
385 reg Areg = load(A.
val[j]);
387 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
395 reg Areg = load(A.
val[j]);
396 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
401 reg x_hi = set(0.0, 0.0, 0.0, x.
hi[A.
col_ind[j+0]]);
402 reg x_lo = set(0.0, 0.0, 0.0, x.
lo[A.
col_ind[j+0]]);
404 reg Areg = load(A.
val[j]);
405 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
409 y[i] = reduction(y_hi, y_lo);
415 if(x.size() != y.
size()){
416 std::cerr <<
"error bad vector size" << std::endl;
420 std::cerr <<
"error bad matrix size" << std::endl;
426 #pragma omp parallel for schedule(guided) private(regs)
427 for(
int i=0; i<A.
get_row(); i++){
428 reg y_hi = regs.zeros;
429 reg y_lo = regs.zeros;
432 int ww = A.
row_ptr[i+1] - (SIMD_Length-1);
433 for(j = A.
row_ptr[i]; j < ww; j+=SIMD_Length){
436 reg x_lo = regs.zeros;
438 reg Areg = load(A.
val[j]);
440 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
446 reg x_lo = regs.zeros;
448 reg Areg = load(A.
val[j]);
450 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
456 reg x_lo = regs.zeros;
458 reg Areg = load(A.
val[j]);
459 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
464 reg x_hi = set(0.0, 0.0, 0.0, x[A.
col_ind[j+0]]);
465 reg x_lo = regs.zeros;
467 reg Areg = load(A.
val[j]);
468 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
472 y[i] = reduction(y_hi, y_lo);
478 if(x.
size() != y.size()){
479 std::cerr <<
"error bad vector size" << std::endl;
483 std::cerr <<
"error bad matrix size" << std::endl;
489 #pragma omp parallel for schedule(guided) private(regs)
490 for(
int i=0; i<A.
get_row(); i++){
491 reg y_hi = regs.zeros;
492 reg y_lo = regs.zeros;
495 int ww = A.
row_ptr[i+1] - (SIMD_Length-1);
496 for(j = A.
row_ptr[i]; j < ww; j+=SIMD_Length){
501 reg Areg = load(A.
val[j]);
503 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
511 reg Areg = load(A.
val[j]);
513 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
521 reg Areg = load(A.
val[j]);
522 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
527 reg x_hi = set(0.0, 0.0, 0.0, x.
hi[A.
col_ind[j+0]]);
528 reg x_lo = set(0.0, 0.0, 0.0, x.
lo[A.
col_ind[j+0]]);
530 reg Areg = load(A.
val[j]);
531 Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
535 y.data()[i] = reduction(y_hi, y_lo);
541 if(x.size() != y.size()){
542 std::cerr <<
"error bad vector size" << std::endl;
546 std::cerr <<
"error bad matrix size" << std::endl;
550 #pragma omp parallel for schedule(guided)
551 for(
int i=0; i<A.
get_row(); i++){
555 #pragma omp parallel for schedule(guided)
556 for(
int i=0; i<A.
get_row(); i++){
double precision CRS format sparse matrix
std::vector< int > col_ind
std::vector< int > row_ptr
int get_row() const
get # of row
std::vector< double > val
Double precision vector class, This class is almost same as std::vector<double>
Double-double precision vector class.
void matvec(const d_real_SpMat &A, const dd_real_vector &x, dd_real_vector &y)
matvec: y = Ax
reg set_all(const std::vector< double > &x, const std::vector< int > &index, const int i)