DD-AVX  2.0.0
axpyz.cpp
Go to the documentation of this file.
1 #include<DD-AVX_internal.hpp>
2 using namespace ddavx_core;
3 
4 namespace dd_avx{
5 //alpha = DD, z = DD ///////////////////////////////////////////
6  void axpyz(const dd_real& alpha, const dd_real_vector& x, const dd_real_vector& y, dd_real_vector& z){
7  if(x.size() != y.size() && x.size() != z.size()){
8  std::cerr << "error bad vector size" << std::endl;
9  assert(1);
10  }
11  registers regs;
12 
13 #pragma omp parallel private(regs)
14  {
15  int i=0, is=0, ie=0;
16  get_isie(y.size(), is, ie);
17  reg alpha_hi = broadcast(alpha.x[0]);
18  reg alpha_lo = broadcast(alpha.x[1]);
19  for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
20 
21  reg x_hi = load(x.hi[i]);
22  reg x_lo = load(x.lo[i]);
23 
24  reg y_hi = load(y.hi[i]);
25  reg y_lo = load(y.lo[i]);
26 
27  reg z_hi = load(z.hi[i]);
28  reg z_lo = load(z.lo[i]);
29 
30  Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
31 
32  store(z.hi[i], z_hi);
33  store(z.lo[i], z_lo);
34  }
35  for(;i<ie;i++){
36  Fma(z.hi[i], z.lo[i], y.hi[i], y.lo[i], alpha.x[0], alpha.x[1], x.hi[i], x.lo[i]);
37  }
38  }
39  }
40 
41  void axpyz(const dd_real& alpha, const d_real_vector& x, const dd_real_vector& y, dd_real_vector& z){
42  if(x.size() != y.size() && x.size() != z.size()){
43  std::cerr << "error bad vector size" << std::endl;
44  assert(1);
45  }
46  registers regs;
47 
48 #pragma omp parallel private(regs)
49  {
50  int i=0, is=0, ie=0;
51  get_isie(y.size(), is, ie);
52  reg alpha_hi = broadcast(alpha.x[0]);
53  reg alpha_lo = broadcast(alpha.x[1]);
54  for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
55 
56  reg x_hi = load(x.data()[i]);
57  reg x_lo = regs.zeros;
58 
59  reg y_hi = load(y.hi[i]);
60  reg y_lo = load(y.lo[i]);
61 
62  reg z_hi = load(z.hi[i]);
63  reg z_lo = load(z.lo[i]);
64 
65  Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
66 
67  store(z.hi[i], z_hi);
68  store(z.lo[i], z_lo);
69  }
70  for(;i<ie;i++){
71  Fma(z.hi[i], z.lo[i], y.hi[i], y.lo[i], alpha.x[0], alpha.x[1], x.data()[i], 0.0);
72  }
73  }
74  }
75 
76  void axpyz(const dd_real& alpha, const dd_real_vector& x, const d_real_vector& y, dd_real_vector& z){
77  if(x.size() != y.size() && x.size() != z.size()){
78  std::cerr << "error bad vector size" << std::endl;
79  assert(1);
80  }
81  registers regs;
82 
83 #pragma omp parallel private(regs)
84  {
85  int i=0, is=0, ie=0;
86  get_isie(y.size(), is, ie);
87  reg alpha_hi = broadcast(alpha.x[0]);
88  reg alpha_lo = broadcast(alpha.x[1]);
89  for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
90 
91  reg x_hi = load(x.hi[i]);
92  reg x_lo = load(x.lo[i]);
93 
94  reg y_hi = load(y.data()[i]);
95  reg y_lo = regs.zeros;
96 
97  reg z_hi = load(z.hi[i]);
98  reg z_lo = load(z.lo[i]);
99 
100  Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
101 
102  store(z.hi[i], z_hi);
103  store(z.lo[i], z_lo);
104  }
105  for(;i<ie;i++){
106  Fma(z.hi[i], z.lo[i], y.data()[i], 0.0, alpha.x[0], alpha.x[1], x.hi[i], x.lo[i]);
107  }
108  }
109  }
110 
111  void axpyz(const dd_real& alpha, const d_real_vector& x, const d_real_vector& y, dd_real_vector& z){
112  if(x.size() != y.size() && x.size() != z.size()){
113  std::cerr << "error bad vector size" << std::endl;
114  assert(1);
115  }
116  registers regs;
117 
118 #pragma omp parallel private(regs)
119  {
120  int i=0, is=0, ie=0;
121  get_isie(y.size(), is, ie);
122  reg alpha_hi = broadcast(alpha.x[0]);
123  reg alpha_lo = broadcast(alpha.x[1]);
124  for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
125 
126  reg x_hi = load(x.data()[i]);
127  reg x_lo = regs.zeros;
128 
129  reg y_hi = load(y.data()[i]);
130  reg y_lo = regs.zeros;
131 
132  reg z_hi = load(z.hi[i]);
133  reg z_lo = load(z.lo[i]);
134 
135  Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_hi, regs);
136 
137  store(z.hi[i], z_hi);
138  store(z.lo[i], z_lo);
139  }
140  for(;i<ie;i++){
141  Fma(z.hi[i], z.lo[i], y.data()[i], 0.0, alpha.x[0], alpha.x[1], x.data()[i], 0.0);
142  }
143  }
144  }
145 //alpha = D, z = DD ///////////////////////////////////////////
146  void axpyz(const d_real& alpha, const dd_real_vector& x, const dd_real_vector& y, dd_real_vector& z){
147  if(x.size() != y.size() && x.size() != z.size()){
148  std::cerr << "error bad vector size" << std::endl;
149  assert(1);
150  }
151  registers regs;
152 
153 #pragma omp parallel private(regs)
154  {
155  int i=0, is=0, ie=0;
156  get_isie(y.size(), is, ie);
157  reg alpha_hi = broadcast(alpha);
158  reg alpha_lo = regs.zeros;
159  for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
160 
161  reg x_hi = load(x.hi[i]);
162  reg x_lo = load(x.lo[i]);
163 
164  reg y_hi = load(y.hi[i]);
165  reg y_lo = load(y.lo[i]);
166 
167  reg z_hi = load(z.hi[i]);
168  reg z_lo = load(z.lo[i]);
169 
170  Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
171 
172  store(z.hi[i], z_hi);
173  store(z.lo[i], z_lo);
174  }
175  for(;i<ie;i++){
176  Fma(z.hi[i], z.lo[i], y.hi[i], y.lo[i], alpha, 0.0, x.hi[i], x.lo[i]);
177  }
178  }
179  }
180 
181  void axpyz(const d_real& alpha, const d_real_vector& x, const dd_real_vector& y, dd_real_vector& z){
182  if(x.size() != y.size() && x.size() != z.size()){
183  std::cerr << "error bad vector size" << std::endl;
184  assert(1);
185  }
186  registers regs;
187 
188 #pragma omp parallel private(regs)
189  {
190  int i=0, is=0, ie=0;
191  get_isie(y.size(), is, ie);
192  reg alpha_hi = broadcast(alpha);
193  reg alpha_lo = regs.zeros;
194  for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
195 
196  reg x_hi = load(x.data()[i]);
197  reg x_lo = regs.zeros;
198 
199  reg y_hi = load(y.hi[i]);
200  reg y_lo = load(y.lo[i]);
201 
202  reg z_hi = load(z.hi[i]);
203  reg z_lo = load(z.lo[i]);
204 
205  Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
206 
207  store(z.hi[i], z_hi);
208  store(z.lo[i], z_lo);
209  }
210  for(;i<ie;i++){
211  Fma(z.hi[i], z.lo[i], y.hi[i], y.lo[i], alpha, 0.0, x.data()[i], 0.0);
212  }
213  }
214  }
215 
216  void axpyz(const d_real& alpha, const dd_real_vector& x, const d_real_vector& y, dd_real_vector& z){
217  if(x.size() != y.size() && x.size() != z.size()){
218  std::cerr << "error bad vector size" << std::endl;
219  assert(1);
220  }
221  registers regs;
222 
223 #pragma omp parallel private(regs)
224  {
225  int i=0, is=0, ie=0;
226  get_isie(y.size(), is, ie);
227  reg alpha_hi = broadcast(alpha);
228  reg alpha_lo = regs.zeros;
229  for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
230 
231  reg x_hi = load(x.hi[i]);
232  reg x_lo = load(x.lo[i]);
233 
234  reg y_hi = load(y.data()[i]);
235  reg y_lo = regs.zeros;
236 
237  reg z_hi = load(z.hi[i]);
238  reg z_lo = load(z.lo[i]);
239 
240  Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
241 
242  store(z.hi[i], z_hi);
243  store(z.lo[i], z_lo);
244  }
245  for(;i<ie;i++){
246  Fma(z.hi[i], z.lo[i], y.data()[i], 0.0, alpha, 0.0, x.hi[i], x.lo[i]);
247  }
248  }
249  }
250 
251  void axpyz(const d_real& alpha, const d_real_vector& x, const d_real_vector& y, dd_real_vector& z){
252  if(x.size() != y.size() && x.size() != z.size()){
253  std::cerr << "error bad vector size" << std::endl;
254  assert(1);
255  }
256  registers regs;
257 
258 #pragma omp parallel private(regs)
259  {
260  int i=0, is=0, ie=0;
261  get_isie(y.size(), is, ie);
262  reg alpha_hi = broadcast(alpha);
263  reg alpha_lo = regs.zeros;
264  for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
265 
266  reg x_hi = load(x.data()[i]);
267  reg x_lo = regs.zeros;
268 
269  reg y_hi = load(y.data()[i]);
270  reg y_lo = regs.zeros;
271 
272  reg z_hi = load(z.hi[i]);
273  reg z_lo = load(z.lo[i]);
274 
275  Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_hi, regs);
276 
277  store(z.hi[i], z_hi);
278  store(z.lo[i], z_lo);
279  }
280  for(;i<ie;i++){
281  Fma(z.hi[i], z.lo[i], y.data()[i], 0.0, alpha, 0.0, x.data()[i], 0.0);
282  }
283  }
284  }
285 
286 //alpha = DD, z = D ///////////////////////////////////////////
287  void axpyz(const dd_real& alpha, const dd_real_vector& x, const dd_real_vector& y, d_real_vector& z){
288  if(x.size() != y.size() && x.size() != z.size()){
289  std::cerr << "error bad vector size" << std::endl;
290  assert(1);
291  }
292  registers regs;
293 
294 #pragma omp parallel private(regs)
295  {
296  int i=0, is=0, ie=0;
297  get_isie(y.size(), is, ie);
298  reg alpha_hi = broadcast(alpha.x[0]);
299  reg alpha_lo = broadcast(alpha.x[1]);
300  for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
301 
302  reg x_hi = load(x.hi[i]);
303  reg x_lo = load(x.lo[i]);
304 
305  reg y_hi = load(y.hi[i]);
306  reg y_lo = load(y.lo[i]);
307 
308  reg z_hi = load(z.data()[i]);
309  reg z_lo = regs.zeros;
310 
311  Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
312 
313  store(z.data()[i], z_hi);
314  }
315  for(;i<ie;i++){
316  Fma(z.data()[i], y.hi[i], y.lo[i], alpha.x[0], alpha.x[1], x.hi[i], x.lo[i]);
317  }
318  }
319  }
320 
321  void axpyz(const dd_real& alpha, const d_real_vector& x, const dd_real_vector& y, d_real_vector& z){
322  if(x.size() != y.size() && x.size() != z.size()){
323  std::cerr << "error bad vector size" << std::endl;
324  assert(1);
325  }
326  registers regs;
327 
328 #pragma omp parallel private(regs)
329  {
330  int i=0, is=0, ie=0;
331  get_isie(y.size(), is, ie);
332  reg alpha_hi = broadcast(alpha.x[0]);
333  reg alpha_lo = broadcast(alpha.x[1]);
334  for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
335 
336  reg x_hi = load(x.data()[i]);
337  reg x_lo = regs.zeros;
338 
339  reg y_hi = load(y.hi[i]);
340  reg y_lo = load(y.lo[i]);
341 
342  reg z_hi = load(z.data()[i]);
343  reg z_lo = regs.zeros;
344 
345  Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
346 
347  store(z.data()[i], z_hi);
348  }
349  for(;i<ie;i++){
350  Fma(z.data()[i], y.hi[i], y.lo[i], alpha.x[0], alpha.x[1], x.data()[i], 0.0);
351  }
352  }
353  }
354 
355  void axpyz(const dd_real& alpha, const dd_real_vector& x, const d_real_vector& y, d_real_vector& z){
356  if(x.size() != y.size() && x.size() != z.size()){
357  std::cerr << "error bad vector size" << std::endl;
358  assert(1);
359  }
360  registers regs;
361 
362 #pragma omp parallel private(regs)
363  {
364  int i=0, is=0, ie=0;
365  get_isie(y.size(), is, ie);
366  reg alpha_hi = broadcast(alpha.x[0]);
367  reg alpha_lo = broadcast(alpha.x[1]);
368  for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
369 
370  reg x_hi = load(x.hi[i]);
371  reg x_lo = load(x.lo[i]);
372 
373  reg y_hi = load(y.data()[i]);
374  reg y_lo = regs.zeros;
375 
376  reg z_hi = load(z.data()[i]);
377  reg z_lo = regs.zeros;
378 
379  Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
380 
381  store(z.data()[i], z_hi);
382  }
383  for(;i<ie;i++){
384  Fma(z.data()[i], y.data()[i], 0.0, alpha.x[0], alpha.x[1], x.hi[i], x.lo[i]);
385  }
386  }
387  }
388 
389  void axpyz(const dd_real& alpha, const d_real_vector& x, const d_real_vector& y, d_real_vector& z){
390  if(x.size() != y.size() && x.size() != z.size()){
391  std::cerr << "error bad vector size" << std::endl;
392  assert(1);
393  }
394  registers regs;
395 
396 #pragma omp parallel private(regs)
397  {
398  int i=0, is=0, ie=0;
399  get_isie(y.size(), is, ie);
400  reg alpha_hi = broadcast(alpha.x[0]);
401  reg alpha_lo = broadcast(alpha.x[1]);
402  for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
403 
404  reg x_hi = load(x.data()[i]);
405  reg x_lo = regs.zeros;
406 
407  reg y_hi = load(y.data()[i]);
408  reg y_lo = regs.zeros;
409 
410  reg z_hi = load(z.data()[i]);
411  reg z_lo = regs.zeros;
412 
413  Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_hi, regs);
414 
415  store(z.data()[i], z_hi);
416  }
417  for(;i<ie;i++){
418  Fma(z.data()[i], y.data()[i], 0.0, alpha.x[0], alpha.x[1], x.data()[i], 0.0);
419  }
420  }
421  }
422 //alpha = D ///////////////////////////////////////////
423  void axpyz(const d_real& alpha, const dd_real_vector& x, const dd_real_vector& y, d_real_vector& z){
424  if(x.size() != y.size() && x.size() != z.size()){
425  std::cerr << "error bad vector size" << std::endl;
426  assert(1);
427  }
428  registers regs;
429 
430 #pragma omp parallel private(regs)
431  {
432  int i=0, is=0, ie=0;
433  get_isie(y.size(), is, ie);
434  reg alpha_hi = broadcast(alpha);
435  reg alpha_lo = regs.zeros;
436  for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
437 
438  reg x_hi = load(x.hi[i]);
439  reg x_lo = load(x.lo[i]);
440 
441  reg y_hi = load(y.hi[i]);
442  reg y_lo = load(y.lo[i]);
443 
444  reg z_hi = load(z.data()[i]);
445  reg z_lo = regs.zeros;
446 
447  Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
448 
449  store(z.data()[i], z_hi);
450  }
451  for(;i<ie;i++){
452  Fma(z.data()[i], y.hi[i], y.lo[i], alpha, 0.0, x.hi[i], x.lo[i]);
453  }
454  }
455  }
456 
457  void axpyz(const d_real& alpha, const d_real_vector& x, const dd_real_vector& y, d_real_vector& z){
458  if(x.size() != y.size() && x.size() != z.size()){
459  std::cerr << "error bad vector size" << std::endl;
460  assert(1);
461  }
462  registers regs;
463 
464 #pragma omp parallel private(regs)
465  {
466  int i=0, is=0, ie=0;
467  get_isie(y.size(), is, ie);
468  reg alpha_hi = broadcast(alpha);
469  reg alpha_lo = regs.zeros;
470  for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
471 
472  reg x_hi = load(x.data()[i]);
473  reg x_lo = regs.zeros;
474 
475  reg y_hi = load(y.hi[i]);
476  reg y_lo = load(y.lo[i]);
477 
478  reg z_hi = load(z.data()[i]);
479  reg z_lo = regs.zeros;
480 
481  Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
482 
483  store(z.data()[i], z_hi);
484  }
485  for(;i<ie;i++){
486  Fma(z.data()[i], y.hi[i], y.lo[i], alpha, 0.0, x.data()[i], 0.0);
487  }
488  }
489  }
490 
491  void axpyz(const d_real& alpha, const dd_real_vector& x, const d_real_vector& y, d_real_vector& z){
492  if(x.size() != y.size() && x.size() != z.size()){
493  std::cerr << "error bad vector size" << std::endl;
494  assert(1);
495  }
496  registers regs;
497 
498 #pragma omp parallel private(regs)
499  {
500  int i=0, is=0, ie=0;
501  get_isie(y.size(), is, ie);
502  reg alpha_hi = broadcast(alpha);
503  reg alpha_lo = regs.zeros;
504  for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
505 
506  reg x_hi = load(x.hi[i]);
507  reg x_lo = load(x.lo[i]);
508 
509  reg y_hi = load(y.data()[i]);
510  reg y_lo = regs.zeros;
511 
512  reg z_hi = load(z.data()[i]);
513  reg z_lo = regs.zeros;
514 
515  Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
516 
517  store(z.data()[i], z_hi);
518  }
519  for(;i<ie;i++){
520  Fma(z.data()[i], y.data()[i], 0.0, alpha, 0.0, x.hi[i], x.lo[i]);
521  }
522  }
523  }
524 
525  void axpyz(const d_real& alpha, const d_real_vector& x, const d_real_vector& y, d_real_vector& z){
526  if(x.size() != y.size() && x.size() != z.size()){
527  std::cerr << "error bad vector size" << std::endl;
528  assert(1);
529  }
530  registers regs;
531 
532 #pragma omp parallel private(regs)
533  {
534  int i=0, is=0, ie=0;
535  get_isie(y.size(), is, ie);
536  reg alpha_hi = broadcast(alpha);
537  reg alpha_lo = regs.zeros;
538  for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
539 
540  reg x_hi = load(x.data()[i]);
541  reg x_lo = regs.zeros;
542 
543  reg y_hi = load(y.data()[i]);
544  reg y_lo = regs.zeros;
545 
546  reg z_hi = load(z.data()[i]);
547  reg z_lo = regs.zeros;
548 
549  Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_hi, regs);
550 
551  store(z.data()[i], z_hi);
552  }
553  for(;i<ie;i++){
554  Fma(z.data()[i], y.data()[i], 0.0, alpha, 0.0, x.data()[i], 0.0);
555  }
556  }
557  }
558 }
double d_real
Definition: DD-AVX.hpp:22
Double precision vector class, This class is almost same as std::vector<double>
Double-double precision vector class.
std::vector< double > lo
int size() const
get size
std::vector< double > hi
void axpyz(const d_real &alpha, const d_real_vector &x, const d_real_vector &y, d_real_vector &z)
Definition: axpyz.cpp:525