DD-AVX  2.0.0
matrix_spmv_crs.cpp
Go to the documentation of this file.
1 #include<DD-AVX_internal.hpp>
2 using namespace ddavx_core;
3 
4 namespace dd_avx{
5 
6 #if defined USE_AVX512
7 
8  // 実験的にsetを関数化して書き直しの箇所を減らしてみます
9  inline reg set_all(const std::vector<double>& x, const std::vector<int>& index, const size_t i){
10  reg tmp;
11  tmp = set(x[index[i+0]], x[index[i+1]], x[index[i+2]], x[index[i+3]], x[index[i+4]], x[index[i+5]], x[index[i+6]], x[index[i+7]]);
12  return tmp;
13  }
14  //512
15  // D, DD, DD
16  void matvec(const d_real_SpMat& A, const dd_real_vector& x, dd_real_vector& y){
17  if(x.size() != y.size()){
18  std::cerr << "error bad vector size" << std::endl;
19  assert(1);
20  }
21  if(x.size() != A.get_row()){
22  std::cerr << "error bad matrix size" << std::endl;
23  assert(1);
24  }
25 
26  registers regs;
27 
28 #pragma omp parallel for schedule(guided) private(regs)
29  for(int i=0; i<A.get_row(); i++){
30  reg y_hi = regs.zeros;
31  reg y_lo = regs.zeros;
32  int j = 0;
33 
34  int ww = A.row_ptr[i+1] - (SIMD_Length-1);
35  for(j = A.row_ptr[i]; j < ww; j+=SIMD_Length){
36 
37  reg x_hi = set_all(x.hi, A.col_ind, j);
38  reg x_lo = set_all(x.lo, A.col_ind, j);
39 
40  reg Areg = load(A.val[j]);
41 
42  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
43  }
44 
45  // Fraction Processing (SIMD_Length=8)
46  // ここは関数化したいな..
47  if(j == A.row_ptr[i+1]-7){
48  reg x_hi = set(0.0, x.hi[A.col_ind[j+6]], x.hi[A.col_ind[j+5]], x.hi[A.col_ind[j+4]], x.hi[A.col_ind[j+3]], x.hi[A.col_ind[j+2]], x.hi[A.col_ind[j+1]], x.hi[A.col_ind[j+0]]);
49  reg x_lo = set(0.0, x.lo[A.col_ind[j+6]], x.lo[A.col_ind[j+5]], x.lo[A.col_ind[j+4]], x.lo[A.col_ind[j+3]], x.lo[A.col_ind[j+2]], x.lo[A.col_ind[j+1]], x.lo[A.col_ind[j+0]]);
50 
51  reg Areg = load(A.val[j]);
52 
53  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
54  j+=7;
55  }
56 
57  if(j == A.row_ptr[i+1]-6){
58  reg x_hi = set(0.0, 0.0, x.hi[A.col_ind[j+5]], x.hi[A.col_ind[j+4]], x.hi[A.col_ind[j+3]], x.hi[A.col_ind[j+2]], x.hi[A.col_ind[j+1]], x.hi[A.col_ind[j+0]]);
59  reg x_lo = set(0.0, 0.0, x.lo[A.col_ind[j+5]], x.lo[A.col_ind[j+4]], x.lo[A.col_ind[j+3]], x.lo[A.col_ind[j+2]], x.lo[A.col_ind[j+1]], x.lo[A.col_ind[j+0]]);
60 
61  reg Areg = load(A.val[j]);
62  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
63  j+=6;
64  }
65 
66  if(j == A.row_ptr[i+1]-5){
67  reg x_hi = set(0.0, 0.0, 0.0, x.hi[A.col_ind[j+4]], x.hi[A.col_ind[j+3]], x.hi[A.col_ind[j+2]], x.hi[A.col_ind[j+1]], x.hi[A.col_ind[j+0]]);
68  reg x_lo = set(0.0, 0.0, 0.0, x.lo[A.col_ind[j+4]], x.lo[A.col_ind[j+3]], x.lo[A.col_ind[j+2]], x.lo[A.col_ind[j+1]], x.lo[A.col_ind[j+0]]);
69 
70  reg Areg = load(A.val[j]);
71  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
72  j+=5;
73  }
74 
75  if(j == A.row_ptr[i+1]-4){
76  reg x_hi = set(0.0, 0.0, 0.0, 0.0, x.hi[A.col_ind[j+3]], x.hi[A.col_ind[j+2]], x.hi[A.col_ind[j+1]], x.hi[A.col_ind[j+0]]);
77  reg x_lo = set(0.0, 0.0, 0.0, 0.0, x.lo[A.col_ind[j+3]], x.lo[A.col_ind[j+2]], x.lo[A.col_ind[j+1]], x.lo[A.col_ind[j+0]]);
78 
79  reg Areg = load(A.val[j]);
80  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
81  j+=4;
82  }
83 
84  if(j == A.row_ptr[i+1]-3){
85  reg x_hi = set(0.0, 0.0, 0.0, 0.0, 0.0, x.hi[A.col_ind[j+2]], x.hi[A.col_ind[j+1]], x.hi[A.col_ind[j+0]]);
86  reg x_lo = set(0.0, 0.0, 0.0, 0.0, 0.0, x.lo[A.col_ind[j+2]], x.lo[A.col_ind[j+1]], x.lo[A.col_ind[j+0]]);
87 
88  reg Areg = load(A.val[j]);
89 
90  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
91  j+=3;
92  }
93 
94  if(j == A.row_ptr[i+1]-2){
95  reg x_hi = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x.hi[A.col_ind[j+1]], x.hi[A.col_ind[j+0]]);
96  reg x_lo = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x.lo[A.col_ind[j+1]], x.lo[A.col_ind[j+0]]);
97 
98  reg Areg = load(A.val[j]);
99  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
100  j+=2;
101  }
102 
103  if(j == A.row_ptr[i+1]-1){
104  reg x_hi = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x.hi[A.col_ind[j+0]]);
105  reg x_lo = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x.lo[A.col_ind[j+0]]);
106 
107  reg Areg = load(A.val[j]);
108  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
109  j+=1;
110  }
111 
112  y[i] = reduction(y_hi, y_lo);
113  }
114  }
115 
116  // D, D, DD
117  void matvec(const d_real_SpMat& A, const d_real_vector& x, dd_real_vector& y){
118  if(x.size() != y.size()){
119  std::cerr << "error bad vector size" << std::endl;
120  assert(1);
121  }
122  if(x.size() != A.get_row()){
123  std::cerr << "error bad matrix size" << std::endl;
124  assert(1);
125  }
126 
127  registers regs;
128 
129 #pragma omp parallel for schedule(guided) private(regs)
130  for(int i=0; i<A.get_row(); i++){
131  reg y_hi = regs.zeros;
132  reg y_lo = regs.zeros;
133  int j = 0;
134 
135  int ww = A.row_ptr[i+1] - (SIMD_Length-1);
136  for(j = A.row_ptr[i]; j < ww; j+=SIMD_Length){
137 
138  reg x_hi = set_all(x, A.col_ind, j);
139  reg x_lo = regs.zeros;
140 
141  reg Areg = load(A.val[j]);
142 
143  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
144  }
145 
146  //Fraction Processing
147  if(j == A.row_ptr[i+1]-7){
148  reg x_hi = set(0.0, x[A.col_ind[j+6]], x[A.col_ind[j+5]], x[A.col_ind[j+4]], x[A.col_ind[j+3]], x[A.col_ind[j+2]], x[A.col_ind[j+1]], x[A.col_ind[j+0]]);
149  reg x_lo = regs.zeros;
150 
151  reg Areg = load(A.val[j]);
152 
153  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
154  j+=7;
155  }
156 
157  if(j == A.row_ptr[i+1]-6){
158  reg x_hi = set(0.0, 0.0, x[A.col_ind[j+5]], x[A.col_ind[j+4]], x[A.col_ind[j+3]], x[A.col_ind[j+2]], x[A.col_ind[j+1]], x[A.col_ind[j+0]]);
159  reg x_lo = regs.zeros;
160 
161  reg Areg = load(A.val[j]);
162  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
163  j+=6;
164  }
165 
166  if(j == A.row_ptr[i+1]-5){
167  reg x_hi = set(0.0, 0.0, 0.0, x[A.col_ind[j+4]], x[A.col_ind[j+3]], x[A.col_ind[j+2]], x[A.col_ind[j+1]], x[A.col_ind[j+0]]);
168  reg x_lo = regs.zeros;
169 
170  reg Areg = load(A.val[j]);
171  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
172  j+=5;
173  }
174 
175  if(j == A.row_ptr[i+1]-4){
176  reg x_hi = set(0.0, 0.0, 0.0, 0.0, x[A.col_ind[j+3]], x[A.col_ind[j+2]], x[A.col_ind[j+1]], x[A.col_ind[j+0]]);
177  reg x_lo = regs.zeros;
178 
179  reg Areg = load(A.val[j]);
180  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
181  j+=4;
182  }
183 
184  if(j == A.row_ptr[i+1]-3){
185  reg x_hi = set(0.0, 0.0, 0.0, 0.0, 0.0, x[A.col_ind[j+2]], x[A.col_ind[j+1]], x[A.col_ind[j+0]]);
186  reg x_lo = regs.zeros;
187 
188  reg Areg = load(A.val[j]);
189 
190  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
191  j+=3;
192  }
193 
194  if(j == A.row_ptr[i+1]-2){
195  reg x_hi = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x[A.col_ind[j+1]], x[A.col_ind[j+0]]);
196  reg x_lo = regs.zeros;
197 
198  reg Areg = load(A.val[j]);
199  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
200  j+=2;
201  }
202 
203  if(j == A.row_ptr[i+1]-1){
204  reg x_hi = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x[A.col_ind[j+0]]);
205  reg x_lo = regs.zeros;
206 
207  reg Areg = load(A.val[j]);
208  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
209  j+=1;
210  }
211 
212  y[i] = reduction(y_hi, y_lo);
213  }
214  }
215 
216  // D, DD, D
217  void matvec(const d_real_SpMat& A, const dd_real_vector& x, d_real_vector& y){
218  if(x.size() != y.size()){
219  std::cerr << "error bad vector size" << std::endl;
220  assert(1);
221  }
222  if(x.size() != A.get_row()){
223  std::cerr << "error bad matrix size" << std::endl;
224  assert(1);
225  }
226 
227  registers regs;
228 
229 #pragma omp parallel for schedule(guided) private(regs)
230  for(int i=0; i<A.get_row(); i++){
231  reg y_hi = regs.zeros;
232  reg y_lo = regs.zeros;
233  int j = 0;
234 
235  int ww = A.row_ptr[i+1] - (SIMD_Length-1);
236  for(j = A.row_ptr[i]; j < ww; j+=SIMD_Length){
237 
238  reg x_hi = set_all(x.hi, A.col_ind, j);
239  reg x_lo = set_all(x.lo, A.col_ind, j);
240 
241  reg Areg = load(A.val[j]);
242 
243  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
244  }
245 
246  //Fraction Processing
247  if(j == A.row_ptr[i+1]-7){
248  reg x_hi = set(0.0, x.hi[A.col_ind[j+6]], x.hi[A.col_ind[j+5]], x.hi[A.col_ind[j+4]], x.hi[A.col_ind[j+3]], x.hi[A.col_ind[j+2]], x.hi[A.col_ind[j+1]], x.hi[A.col_ind[j+0]]);
249  reg x_lo = set(0.0, x.lo[A.col_ind[j+6]], x.lo[A.col_ind[j+5]], x.lo[A.col_ind[j+4]], x.lo[A.col_ind[j+3]], x.lo[A.col_ind[j+2]], x.lo[A.col_ind[j+1]], x.lo[A.col_ind[j+0]]);
250 
251  reg Areg = load(A.val[j]);
252 
253  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
254  j+=7;
255  }
256 
257  if(j == A.row_ptr[i+1]-6){
258  reg x_hi = set(0.0, 0.0, x.hi[A.col_ind[j+5]], x.hi[A.col_ind[j+4]], x.hi[A.col_ind[j+3]], x.hi[A.col_ind[j+2]], x.hi[A.col_ind[j+1]], x.hi[A.col_ind[j+0]]);
259  reg x_lo = set(0.0, 0.0, x.lo[A.col_ind[j+5]], x.lo[A.col_ind[j+4]], x.lo[A.col_ind[j+3]], x.lo[A.col_ind[j+2]], x.lo[A.col_ind[j+1]], x.lo[A.col_ind[j+0]]);
260 
261  reg Areg = load(A.val[j]);
262  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
263  j+=6;
264  }
265 
266  if(j == A.row_ptr[i+1]-5){
267  reg x_hi = set(0.0, 0.0, 0.0, x.hi[A.col_ind[j+4]], x.hi[A.col_ind[j+3]], x.hi[A.col_ind[j+2]], x.hi[A.col_ind[j+1]], x.hi[A.col_ind[j+0]]);
268  reg x_lo = set(0.0, 0.0, 0.0, x.lo[A.col_ind[j+4]], x.lo[A.col_ind[j+3]], x.lo[A.col_ind[j+2]], x.lo[A.col_ind[j+1]], x.lo[A.col_ind[j+0]]);
269 
270  reg Areg = load(A.val[j]);
271  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
272  j+=5;
273  }
274 
275  if(j == A.row_ptr[i+1]-4){
276  reg x_hi = set(0.0, 0.0, 0.0, 0.0, x.hi[A.col_ind[j+3]], x.hi[A.col_ind[j+2]], x.hi[A.col_ind[j+1]], x.hi[A.col_ind[j+0]]);
277  reg x_lo = set(0.0, 0.0, 0.0, 0.0, x.lo[A.col_ind[j+3]], x.lo[A.col_ind[j+2]], x.lo[A.col_ind[j+1]], x.lo[A.col_ind[j+0]]);
278 
279  reg Areg = load(A.val[j]);
280  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
281  j+=4;
282  }
283 
284  if(j == A.row_ptr[i+1]-3){
285  reg x_hi = set(0.0, 0.0, 0.0, 0.0, 0.0, x.hi[A.col_ind[j+2]], x.hi[A.col_ind[j+1]], x.hi[A.col_ind[j+0]]);
286  reg x_lo = set(0.0, 0.0, 0.0, 0.0, 0.0, x.lo[A.col_ind[j+2]], x.lo[A.col_ind[j+1]], x.lo[A.col_ind[j+0]]);
287 
288  reg Areg = load(A.val[j]);
289 
290  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
291  j+=3;
292  }
293 
294  if(j == A.row_ptr[i+1]-2){
295  reg x_hi = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x.hi[A.col_ind[j+1]], x.hi[A.col_ind[j+0]]);
296  reg x_lo = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x.lo[A.col_ind[j+1]], x.lo[A.col_ind[j+0]]);
297 
298  reg Areg = load(A.val[j]);
299  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
300  j+=2;
301  }
302 
303 
304  if(j == A.row_ptr[i+1]-1){
305  reg x_hi = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x.hi[A.col_ind[j+0]]);
306  reg x_lo = set(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, x.lo[A.col_ind[j+0]]);
307 
308  reg Areg = load(A.val[j]);
309  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
310  j+=1;
311  }
312 
313  y.data()[i] = reduction(y_hi, y_lo);
314  }
315  }
316 
317  // D, D, D
318  void matvec(const d_real_SpMat& A, const d_real_vector& x, d_real_vector& y){
319  if(x.size() != y.size()){
320  std::cerr << "error bad vector size" << std::endl;
321  assert(1);
322  }
323  if(x.size() != A.get_row()){
324  std::cerr << "error bad matrix size" << std::endl;
325  assert(1);
326  }
327 
328 #pragma omp parallel for schedule(guided)
329  for(int i=0; i<A.get_row(); i++){
330  y.data()[i] = 0;
331  }
332 
333 #pragma omp parallel for schedule(guided)
334  for(int i=0; i<A.get_row(); i++){
335  for(int j = A.row_ptr[i]; j < A.row_ptr[i+1]; j++){
336  y.data()[i] += A.val[j] * x[A.col_ind[j]];
337  }
338  }
339  }
340 
341 #else
342  // 実験的にsetを関数化して書き直しの箇所を減らしてみます
343  inline reg set_all(const std::vector<double>& x, const std::vector<int>& index, const int i){
344  reg tmp;
345  tmp = set(x[index[i+0]], x[index[i+1]], x[index[i+2]], x[index[i+3]]);
346  return tmp;
347  }
348 
349  // D, DD, DD
350  void matvec(const d_real_SpMat& A, const dd_real_vector& x, dd_real_vector& y){
351  if(x.size() != y.size()){
352  std::cerr << "error bad vector size" << std::endl;
353  assert(1);
354  }
355  if(x.size() != A.get_row()){
356  std::cerr << "error bad matrix size" << std::endl;
357  assert(1);
358  }
359 
360  registers regs;
361 
362 #pragma omp parallel for schedule(guided) private(regs)
363  for(int i=0; i<A.get_row(); i++){
364  reg y_hi = regs.zeros;
365  reg y_lo = regs.zeros;
366  int j = 0;
367 
368  int ww = A.row_ptr[i+1] - (SIMD_Length-1);
369  for(j = A.row_ptr[i]; j < ww; j+=SIMD_Length){
370 
371  reg x_hi = set_all(x.hi, A.col_ind, j);
372  reg x_lo = set_all(x.lo, A.col_ind, j);
373 
374  reg Areg = load(A.val[j]);
375 
376  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
377  }
378 
379  // Fraction Processing (SIMD_Length=4)
380  // ここは関数化したいな..
381  if(j == A.row_ptr[i+1]-3){
382  reg x_hi = set(0.0, x.hi[A.col_ind[j+2]], x.hi[A.col_ind[j+1]], x.hi[A.col_ind[j+0]]);
383  reg x_lo = set(0.0, x.lo[A.col_ind[j+2]], x.lo[A.col_ind[j+1]], x.lo[A.col_ind[j+0]]);
384 
385  reg Areg = load(A.val[j]);
386 
387  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
388  j+=3;
389  }
390 
391  if(j == A.row_ptr[i+1]-2){
392  reg x_hi = set(0.0, 0.0, x.hi[A.col_ind[j+1]], x.hi[A.col_ind[j+0]]);
393  reg x_lo = set(0.0, 0.0, x.lo[A.col_ind[j+1]], x.lo[A.col_ind[j+0]]);
394 
395  reg Areg = load(A.val[j]);
396  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
397  j+=2;
398  }
399 
400  if(j == A.row_ptr[i+1]-1){
401  reg x_hi = set(0.0, 0.0, 0.0, x.hi[A.col_ind[j+0]]);
402  reg x_lo = set(0.0, 0.0, 0.0, x.lo[A.col_ind[j+0]]);
403 
404  reg Areg = load(A.val[j]);
405  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
406  j+=1;
407  }
408 
409  y[i] = reduction(y_hi, y_lo);
410  }
411  }
412 
413  // D, D, DD
414  void matvec(const d_real_SpMat& A, const d_real_vector& x, dd_real_vector& y){
415  if(x.size() != y.size()){
416  std::cerr << "error bad vector size" << std::endl;
417  assert(1);
418  }
419  if(x.size() != A.get_row()){
420  std::cerr << "error bad matrix size" << std::endl;
421  assert(1);
422  }
423 
424  registers regs;
425 
426 #pragma omp parallel for schedule(guided) private(regs)
427  for(int i=0; i<A.get_row(); i++){
428  reg y_hi = regs.zeros;
429  reg y_lo = regs.zeros;
430  int j = 0;
431 
432  int ww = A.row_ptr[i+1] - (SIMD_Length-1);
433  for(j = A.row_ptr[i]; j < ww; j+=SIMD_Length){
434 
435  reg x_hi = set_all(x, A.col_ind, j);
436  reg x_lo = regs.zeros;
437 
438  reg Areg = load(A.val[j]);
439 
440  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
441  }
442 
443  //Fraction Processing
444  if(j == A.row_ptr[i+1]-3){
445  reg x_hi = set(0.0, x[A.col_ind[j+2]], x[A.col_ind[j+1]], x[A.col_ind[j+0]]);
446  reg x_lo = regs.zeros;
447 
448  reg Areg = load(A.val[j]);
449 
450  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
451  j+=3;
452  }
453 
454  if(j == A.row_ptr[i+1]-2){
455  reg x_hi = set(0.0, 0.0, x[A.col_ind[j+1]], x[A.col_ind[j+0]]);
456  reg x_lo = regs.zeros;
457 
458  reg Areg = load(A.val[j]);
459  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
460  j+=2;
461  }
462 
463  if(j == A.row_ptr[i+1]-1){
464  reg x_hi = set(0.0, 0.0, 0.0, x[A.col_ind[j+0]]);
465  reg x_lo = regs.zeros;
466 
467  reg Areg = load(A.val[j]);
468  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
469  j+=1;
470  }
471 
472  y[i] = reduction(y_hi, y_lo);
473  }
474  }
475 
476  // D, DD, D
477  void matvec(const d_real_SpMat& A, const dd_real_vector& x, d_real_vector& y){
478  if(x.size() != y.size()){
479  std::cerr << "error bad vector size" << std::endl;
480  assert(1);
481  }
482  if(x.size() != A.get_row()){
483  std::cerr << "error bad matrix size" << std::endl;
484  assert(1);
485  }
486 
487  registers regs;
488 
489 #pragma omp parallel for schedule(guided) private(regs)
490  for(int i=0; i<A.get_row(); i++){
491  reg y_hi = regs.zeros;
492  reg y_lo = regs.zeros;
493  int j = 0;
494 
495  int ww = A.row_ptr[i+1] - (SIMD_Length-1);
496  for(j = A.row_ptr[i]; j < ww; j+=SIMD_Length){
497 
498  reg x_hi = set_all(x.hi, A.col_ind, j);
499  reg x_lo = set_all(x.lo, A.col_ind, j);
500 
501  reg Areg = load(A.val[j]);
502 
503  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
504  }
505 
506  //Fraction Processing
507  if(j == A.row_ptr[i+1]-3){
508  reg x_hi = set(0.0, x.hi[A.col_ind[j+2]], x.hi[A.col_ind[j+1]], x.hi[A.col_ind[j+0]]);
509  reg x_lo = set(0.0, x.lo[A.col_ind[j+2]], x.lo[A.col_ind[j+1]], x.lo[A.col_ind[j+0]]);
510 
511  reg Areg = load(A.val[j]);
512 
513  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
514  j+=3;
515  }
516 
517  if(j == A.row_ptr[i+1]-2){
518  reg x_hi = set(0.0, 0.0, x.hi[A.col_ind[j+1]], x.hi[A.col_ind[j+0]]);
519  reg x_lo = set(0.0, 0.0, x.lo[A.col_ind[j+1]], x.lo[A.col_ind[j+0]]);
520 
521  reg Areg = load(A.val[j]);
522  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
523  j+=2;
524  }
525 
526  if(j == A.row_ptr[i+1]-1){
527  reg x_hi = set(0.0, 0.0, 0.0, x.hi[A.col_ind[j+0]]);
528  reg x_lo = set(0.0, 0.0, 0.0, x.lo[A.col_ind[j+0]]);
529 
530  reg Areg = load(A.val[j]);
531  Fmad(y_hi, y_lo, y_hi, y_lo, x_hi, x_lo, Areg, regs);
532  j+=1;
533  }
534 
535  y.data()[i] = reduction(y_hi, y_lo);
536  }
537  }
538 
539  // D, D, D
540  void matvec(const d_real_SpMat& A, const d_real_vector& x, d_real_vector& y){
541  if(x.size() != y.size()){
542  std::cerr << "error bad vector size" << std::endl;
543  assert(1);
544  }
545  if(x.size() != A.get_row()){
546  std::cerr << "error bad matrix size" << std::endl;
547  assert(1);
548  }
549 
550 #pragma omp parallel for schedule(guided)
551  for(int i=0; i<A.get_row(); i++){
552  y.data()[i] = 0;
553  }
554 
555 #pragma omp parallel for schedule(guided)
556  for(int i=0; i<A.get_row(); i++){
557  for(int j = A.row_ptr[i]; j < A.row_ptr[i+1]; j++){
558  y.data()[i] += A.val[j] * x[A.col_ind[j]];
559  }
560  }
561  }
562 #endif
563 }
double precision CRS format sparse matrix
std::vector< int > col_ind
std::vector< int > row_ptr
int get_row() const
get # of row
std::vector< double > val
Double precision vector class, This class is almost same as std::vector<double>
Double-double precision vector class.
std::vector< double > lo
int size() const
get size
std::vector< double > hi
void matvec(const d_real_SpMat &A, const dd_real_vector &x, dd_real_vector &y)
matvec: y = Ax
reg set_all(const std::vector< double > &x, const std::vector< int > &index, const int i)