1 #include<DD-AVX_internal.hpp>
2 using namespace ddavx_core;
8 std::cerr <<
"error bad vector size" << std::endl;
13 #pragma omp parallel private(regs)
16 get_isie(y.
size(), is, ie);
17 reg alpha_hi = broadcast(alpha.x[0]);
18 reg alpha_lo = broadcast(alpha.x[1]);
19 for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
21 reg x_hi = load(x.
hi[i]);
22 reg x_lo = load(x.
lo[i]);
24 reg y_hi = load(y.
hi[i]);
25 reg y_lo = load(y.
lo[i]);
27 reg z_hi = load(z.
hi[i]);
28 reg z_lo = load(z.
lo[i]);
30 Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
36 Fma(z.
hi[i], z.
lo[i], y.
hi[i], y.
lo[i], alpha.x[0], alpha.x[1], x.
hi[i], x.
lo[i]);
42 if(x.size() != y.
size() && x.size() != z.
size()){
43 std::cerr <<
"error bad vector size" << std::endl;
48 #pragma omp parallel private(regs)
51 get_isie(y.
size(), is, ie);
52 reg alpha_hi = broadcast(alpha.x[0]);
53 reg alpha_lo = broadcast(alpha.x[1]);
54 for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
56 reg x_hi = load(x.data()[i]);
57 reg x_lo = regs.zeros;
59 reg y_hi = load(y.
hi[i]);
60 reg y_lo = load(y.
lo[i]);
62 reg z_hi = load(z.
hi[i]);
63 reg z_lo = load(z.
lo[i]);
65 Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
71 Fma(z.
hi[i], z.
lo[i], y.
hi[i], y.
lo[i], alpha.x[0], alpha.x[1], x.data()[i], 0.0);
78 std::cerr <<
"error bad vector size" << std::endl;
83 #pragma omp parallel private(regs)
86 get_isie(y.size(), is, ie);
87 reg alpha_hi = broadcast(alpha.x[0]);
88 reg alpha_lo = broadcast(alpha.x[1]);
89 for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
91 reg x_hi = load(x.
hi[i]);
92 reg x_lo = load(x.
lo[i]);
94 reg y_hi = load(y.data()[i]);
95 reg y_lo = regs.zeros;
97 reg z_hi = load(z.
hi[i]);
98 reg z_lo = load(z.
lo[i]);
100 Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
102 store(z.
hi[i], z_hi);
103 store(z.
lo[i], z_lo);
106 Fma(z.
hi[i], z.
lo[i], y.data()[i], 0.0, alpha.x[0], alpha.x[1], x.
hi[i], x.
lo[i]);
112 if(x.size() != y.size() && x.size() != z.
size()){
113 std::cerr <<
"error bad vector size" << std::endl;
118 #pragma omp parallel private(regs)
121 get_isie(y.size(), is, ie);
122 reg alpha_hi = broadcast(alpha.x[0]);
123 reg alpha_lo = broadcast(alpha.x[1]);
124 for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
126 reg x_hi = load(x.data()[i]);
127 reg x_lo = regs.zeros;
129 reg y_hi = load(y.data()[i]);
130 reg y_lo = regs.zeros;
132 reg z_hi = load(z.
hi[i]);
133 reg z_lo = load(z.
lo[i]);
135 Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_hi, regs);
137 store(z.
hi[i], z_hi);
138 store(z.
lo[i], z_lo);
141 Fma(z.
hi[i], z.
lo[i], y.data()[i], 0.0, alpha.x[0], alpha.x[1], x.data()[i], 0.0);
148 std::cerr <<
"error bad vector size" << std::endl;
153 #pragma omp parallel private(regs)
156 get_isie(y.
size(), is, ie);
157 reg alpha_hi = broadcast(alpha);
158 reg alpha_lo = regs.zeros;
159 for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
161 reg x_hi = load(x.
hi[i]);
162 reg x_lo = load(x.
lo[i]);
164 reg y_hi = load(y.
hi[i]);
165 reg y_lo = load(y.
lo[i]);
167 reg z_hi = load(z.
hi[i]);
168 reg z_lo = load(z.
lo[i]);
170 Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
172 store(z.
hi[i], z_hi);
173 store(z.
lo[i], z_lo);
176 Fma(z.
hi[i], z.
lo[i], y.
hi[i], y.
lo[i], alpha, 0.0, x.
hi[i], x.
lo[i]);
182 if(x.size() != y.
size() && x.size() != z.
size()){
183 std::cerr <<
"error bad vector size" << std::endl;
188 #pragma omp parallel private(regs)
191 get_isie(y.
size(), is, ie);
192 reg alpha_hi = broadcast(alpha);
193 reg alpha_lo = regs.zeros;
194 for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
196 reg x_hi = load(x.data()[i]);
197 reg x_lo = regs.zeros;
199 reg y_hi = load(y.
hi[i]);
200 reg y_lo = load(y.
lo[i]);
202 reg z_hi = load(z.
hi[i]);
203 reg z_lo = load(z.
lo[i]);
205 Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
207 store(z.
hi[i], z_hi);
208 store(z.
lo[i], z_lo);
211 Fma(z.
hi[i], z.
lo[i], y.
hi[i], y.
lo[i], alpha, 0.0, x.data()[i], 0.0);
218 std::cerr <<
"error bad vector size" << std::endl;
223 #pragma omp parallel private(regs)
226 get_isie(y.size(), is, ie);
227 reg alpha_hi = broadcast(alpha);
228 reg alpha_lo = regs.zeros;
229 for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
231 reg x_hi = load(x.
hi[i]);
232 reg x_lo = load(x.
lo[i]);
234 reg y_hi = load(y.data()[i]);
235 reg y_lo = regs.zeros;
237 reg z_hi = load(z.
hi[i]);
238 reg z_lo = load(z.
lo[i]);
240 Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
242 store(z.
hi[i], z_hi);
243 store(z.
lo[i], z_lo);
246 Fma(z.
hi[i], z.
lo[i], y.data()[i], 0.0, alpha, 0.0, x.
hi[i], x.
lo[i]);
252 if(x.size() != y.size() && x.size() != z.
size()){
253 std::cerr <<
"error bad vector size" << std::endl;
258 #pragma omp parallel private(regs)
261 get_isie(y.size(), is, ie);
262 reg alpha_hi = broadcast(alpha);
263 reg alpha_lo = regs.zeros;
264 for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
266 reg x_hi = load(x.data()[i]);
267 reg x_lo = regs.zeros;
269 reg y_hi = load(y.data()[i]);
270 reg y_lo = regs.zeros;
272 reg z_hi = load(z.
hi[i]);
273 reg z_lo = load(z.
lo[i]);
275 Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_hi, regs);
277 store(z.
hi[i], z_hi);
278 store(z.
lo[i], z_lo);
281 Fma(z.
hi[i], z.
lo[i], y.data()[i], 0.0, alpha, 0.0, x.data()[i], 0.0);
289 std::cerr <<
"error bad vector size" << std::endl;
294 #pragma omp parallel private(regs)
297 get_isie(y.
size(), is, ie);
298 reg alpha_hi = broadcast(alpha.x[0]);
299 reg alpha_lo = broadcast(alpha.x[1]);
300 for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
302 reg x_hi = load(x.
hi[i]);
303 reg x_lo = load(x.
lo[i]);
305 reg y_hi = load(y.
hi[i]);
306 reg y_lo = load(y.
lo[i]);
308 reg z_hi = load(z.data()[i]);
309 reg z_lo = regs.zeros;
311 Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
313 store(z.data()[i], z_hi);
316 Fma(z.data()[i], y.
hi[i], y.
lo[i], alpha.x[0], alpha.x[1], x.
hi[i], x.
lo[i]);
322 if(x.size() != y.
size() && x.size() != z.size()){
323 std::cerr <<
"error bad vector size" << std::endl;
328 #pragma omp parallel private(regs)
331 get_isie(y.
size(), is, ie);
332 reg alpha_hi = broadcast(alpha.x[0]);
333 reg alpha_lo = broadcast(alpha.x[1]);
334 for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
336 reg x_hi = load(x.data()[i]);
337 reg x_lo = regs.zeros;
339 reg y_hi = load(y.
hi[i]);
340 reg y_lo = load(y.
lo[i]);
342 reg z_hi = load(z.data()[i]);
343 reg z_lo = regs.zeros;
345 Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
347 store(z.data()[i], z_hi);
350 Fma(z.data()[i], y.
hi[i], y.
lo[i], alpha.x[0], alpha.x[1], x.data()[i], 0.0);
356 if(x.
size() != y.size() && x.
size() != z.size()){
357 std::cerr <<
"error bad vector size" << std::endl;
362 #pragma omp parallel private(regs)
365 get_isie(y.size(), is, ie);
366 reg alpha_hi = broadcast(alpha.x[0]);
367 reg alpha_lo = broadcast(alpha.x[1]);
368 for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
370 reg x_hi = load(x.
hi[i]);
371 reg x_lo = load(x.
lo[i]);
373 reg y_hi = load(y.data()[i]);
374 reg y_lo = regs.zeros;
376 reg z_hi = load(z.data()[i]);
377 reg z_lo = regs.zeros;
379 Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
381 store(z.data()[i], z_hi);
384 Fma(z.data()[i], y.data()[i], 0.0, alpha.x[0], alpha.x[1], x.
hi[i], x.
lo[i]);
390 if(x.size() != y.size() && x.size() != z.size()){
391 std::cerr <<
"error bad vector size" << std::endl;
396 #pragma omp parallel private(regs)
399 get_isie(y.size(), is, ie);
400 reg alpha_hi = broadcast(alpha.x[0]);
401 reg alpha_lo = broadcast(alpha.x[1]);
402 for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
404 reg x_hi = load(x.data()[i]);
405 reg x_lo = regs.zeros;
407 reg y_hi = load(y.data()[i]);
408 reg y_lo = regs.zeros;
410 reg z_hi = load(z.data()[i]);
411 reg z_lo = regs.zeros;
413 Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_hi, regs);
415 store(z.data()[i], z_hi);
418 Fma(z.data()[i], y.data()[i], 0.0, alpha.x[0], alpha.x[1], x.data()[i], 0.0);
425 std::cerr <<
"error bad vector size" << std::endl;
430 #pragma omp parallel private(regs)
433 get_isie(y.
size(), is, ie);
434 reg alpha_hi = broadcast(alpha);
435 reg alpha_lo = regs.zeros;
436 for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
438 reg x_hi = load(x.
hi[i]);
439 reg x_lo = load(x.
lo[i]);
441 reg y_hi = load(y.
hi[i]);
442 reg y_lo = load(y.
lo[i]);
444 reg z_hi = load(z.data()[i]);
445 reg z_lo = regs.zeros;
447 Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
449 store(z.data()[i], z_hi);
452 Fma(z.data()[i], y.
hi[i], y.
lo[i], alpha, 0.0, x.
hi[i], x.
lo[i]);
458 if(x.size() != y.
size() && x.size() != z.size()){
459 std::cerr <<
"error bad vector size" << std::endl;
464 #pragma omp parallel private(regs)
467 get_isie(y.
size(), is, ie);
468 reg alpha_hi = broadcast(alpha);
469 reg alpha_lo = regs.zeros;
470 for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
472 reg x_hi = load(x.data()[i]);
473 reg x_lo = regs.zeros;
475 reg y_hi = load(y.
hi[i]);
476 reg y_lo = load(y.
lo[i]);
478 reg z_hi = load(z.data()[i]);
479 reg z_lo = regs.zeros;
481 Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
483 store(z.data()[i], z_hi);
486 Fma(z.data()[i], y.
hi[i], y.
lo[i], alpha, 0.0, x.data()[i], 0.0);
492 if(x.
size() != y.size() && x.
size() != z.size()){
493 std::cerr <<
"error bad vector size" << std::endl;
498 #pragma omp parallel private(regs)
501 get_isie(y.size(), is, ie);
502 reg alpha_hi = broadcast(alpha);
503 reg alpha_lo = regs.zeros;
504 for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
506 reg x_hi = load(x.
hi[i]);
507 reg x_lo = load(x.
lo[i]);
509 reg y_hi = load(y.data()[i]);
510 reg y_lo = regs.zeros;
512 reg z_hi = load(z.data()[i]);
513 reg z_lo = regs.zeros;
515 Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_lo, regs);
517 store(z.data()[i], z_hi);
520 Fma(z.data()[i], y.data()[i], 0.0, alpha, 0.0, x.
hi[i], x.
lo[i]);
526 if(x.size() != y.size() && x.size() != z.size()){
527 std::cerr <<
"error bad vector size" << std::endl;
532 #pragma omp parallel private(regs)
535 get_isie(y.size(), is, ie);
536 reg alpha_hi = broadcast(alpha);
537 reg alpha_lo = regs.zeros;
538 for(i = is; i < (ie-SIMD_Length+1); i += SIMD_Length){
540 reg x_hi = load(x.data()[i]);
541 reg x_lo = regs.zeros;
543 reg y_hi = load(y.data()[i]);
544 reg y_lo = regs.zeros;
546 reg z_hi = load(z.data()[i]);
547 reg z_lo = regs.zeros;
549 Fma(z_hi, z_lo, y_hi, y_lo, alpha_hi, alpha_lo, x_hi, x_hi, regs);
551 store(z.data()[i], z_hi);
554 Fma(z.data()[i], y.data()[i], 0.0, alpha, 0.0, x.data()[i], 0.0);
Double precision vector class. This class is almost the same as std::vector<double>.
Double-double precision vector class.
void axpyz(const d_real &alpha, const d_real_vector &x, const d_real_vector &y, d_real_vector &z)