Eigenvalue SoLvers for Petaflop-Applications (ELPA)
2019.05.002
|
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
Macros | |
#define | __forceinline __attribute__((always_inline)) static |
Functions | |
for (i=2;i< nb;i++) | |
if (nq==i) | |
_SSE_STORE (q, q1) | |
Variables | |
int | nb = *pnb |
int | nq = *pldq |
int | ldq = *pldq |
int | ldh = *pldh |
int | worked_on = 0 |
__SSE_DATATYPE | x1 = _SSE_LOAD(&q[ldq]) |
__SSE_DATATYPE | x2 = _SSE_LOAD(&q[ldq+offset]) |
__SSE_DATATYPE | x3 = _SSE_LOAD(&q[ldq+2*offset]) |
__SSE_DATATYPE | x4 = _SSE_LOAD(&q[ldq+3*offset]) |
__SSE_DATATYPE | x5 = _SSE_LOAD(&q[ldq+4*offset]) |
__SSE_DATATYPE | x6 = _SSE_LOAD(&q[ldq+5*offset]) |
__SSE_DATATYPE | h2 = _SSE_MUL(h1, vs) |
__SSE_DATATYPE | q1 = _SSE_LOAD(q) |
__SSE_DATATYPE | y1 = _SSE_ADD(q1, _SSE_MUL(x1, h1)) |
__SSE_DATATYPE | q2 = _SSE_LOAD(&q[offset]) |
__SSE_DATATYPE | y2 = _SSE_ADD(q2, _SSE_MUL(x2, h1)) |
__SSE_DATATYPE | q3 = _SSE_LOAD(&q[2*offset]) |
__SSE_DATATYPE | y3 = _SSE_ADD(q3, _SSE_MUL(x3, h1)) |
__SSE_DATATYPE | q4 = _SSE_LOAD(&q[3*offset]) |
__SSE_DATATYPE | y4 = _SSE_ADD(q4, _SSE_MUL(x4, h1)) |
__SSE_DATATYPE | q5 = _SSE_LOAD(&q[4*offset]) |
__SSE_DATATYPE | y5 = _SSE_ADD(q5, _SSE_MUL(x5, h1)) |
__SSE_DATATYPE | q6 = _SSE_LOAD(&q[5*offset]) |
__SSE_DATATYPE | y6 = _SSE_ADD(q6, _SSE_MUL(x6, h1)) |
h1 = _SSE_XOR(tau1, sign) | |
_SSE_STORE &[offset] | q |
#define __forceinline __attribute__((always_inline)) static |
for | ( | ) |
if (nq == i)
h1 = _SSE_XOR(tau1, sign) |
__SSE_DATATYPE h2 = _SSE_MUL(h1, vs) |
int ldh = *pldh |
int ldq = *pldq |
int nb = *pnb |
int nq = *pldq |
_SSE_STORE&[nb*ldq] q |
__SSE_DATATYPE q1 = _SSE_LOAD(q) |
__SSE_DATATYPE q2 = _SSE_LOAD(&q[offset]) |
__SSE_DATATYPE q3 = _SSE_LOAD(&q[2*offset]) |
__SSE_DATATYPE q4 = _SSE_LOAD(&q[3*offset]) |
__SSE_DATATYPE q5 = _SSE_LOAD(&q[4*offset]) |
q6 = _SSE_LOAD(&q[5*offset]) |
worked_on = 0 |
Unrolled kernel that computes 12 rows of Q simultaneously in double precision (DOUBLE_PRECISION_REAL) or 24 rows of Q simultaneously in single precision (SINGLE_PRECISION_REAL): a matrix-vector product with two Householder vectors plus a rank-2 update is performed.
Unrolled kernel that computes 10 rows of Q simultaneously in double precision (DOUBLE_PRECISION_REAL) or 20 rows of Q simultaneously in single precision (SINGLE_PRECISION_REAL): a matrix-vector product with two Householder vectors plus a rank-2 update is performed.
Unrolled kernel that computes 8 rows of Q simultaneously in double precision (DOUBLE_PRECISION_REAL) or 16 rows of Q simultaneously in single precision (SINGLE_PRECISION_REAL): a matrix-vector product with two Householder vectors plus a rank-2 update is performed.
Unrolled kernel that computes 6 rows of Q simultaneously in double precision (DOUBLE_PRECISION_REAL) or 12 rows of Q simultaneously in single precision (SINGLE_PRECISION_REAL): a matrix-vector product with two Householder vectors plus a rank-2 update is performed.
Unrolled kernel that computes 4 rows of Q simultaneously in double precision (DOUBLE_PRECISION_REAL) or 8 rows of Q simultaneously in single precision (SINGLE_PRECISION_REAL): a matrix-vector product with two Householder vectors plus a rank-2 update is performed.
Unrolled kernel that computes 2 rows of Q simultaneously in double precision (DOUBLE_PRECISION_REAL) or 4 rows of Q simultaneously in single precision (SINGLE_PRECISION_REAL): a matrix-vector product with two Householder vectors plus a rank-2 update is performed.