#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>

Macros
#define	__forceinline __attribute__((always_inline)) static

Functions
	for (i=2;i< nb;i++)

	if (nq==i)

	_SSE_STORE (q, q1)

Variables
int	nb = *pnb

int	nq = *pldq

int	ldq = *pldq

int	ldh = *pldh

int	worked_on = 0

__SSE_DATATYPE	x1 = _SSE_LOAD(&q[ldq])

__SSE_DATATYPE	x2 = _SSE_LOAD(&q[ldq+offset])

__SSE_DATATYPE	x3 = _SSE_LOAD(&q[ldq+2*offset])

__SSE_DATATYPE	x4 = _SSE_LOAD(&q[ldq+3*offset])

__SSE_DATATYPE	x5 = _SSE_LOAD(&q[ldq+4*offset])

__SSE_DATATYPE	x6 = _SSE_LOAD(&q[ldq+5*offset])

__SSE_DATATYPE	h2 = _SSE_MUL(h1, vs)

__SSE_DATATYPE	q1 = _SSE_LOAD(q)

__SSE_DATATYPE	y1 = _SSE_ADD(q1, _SSE_MUL(x1, h1))

__SSE_DATATYPE	q2 = _SSE_LOAD(&q[offset])

__SSE_DATATYPE	y2 = _SSE_ADD(q2, _SSE_MUL(x2, h1))

__SSE_DATATYPE	q3 = _SSE_LOAD(&q[2*offset])

__SSE_DATATYPE	y3 = _SSE_ADD(q3, _SSE_MUL(x3, h1))

__SSE_DATATYPE	q4 = _SSE_LOAD(&q[3*offset])

__SSE_DATATYPE	y4 = _SSE_ADD(q4, _SSE_MUL(x4, h1))

__SSE_DATATYPE	q5 = _SSE_LOAD(&q[4*offset])

__SSE_DATATYPE	y5 = _SSE_ADD(q5, _SSE_MUL(x5, h1))

__SSE_DATATYPE	q6 = _SSE_LOAD(&q[5*offset])

__SSE_DATATYPE	y6 = _SSE_ADD(q6, _SSE_MUL(x6, h1))

	h1 = _SSE_XOR(tau1, sign)

_SSE_STORE &[offset]	q

Macro Definition Documentation

◆ __forceinline

#define __forceinline __attribute__((always_inline)) static

Function Documentation

◆ _SSE_STORE()

_SSE_STORE	(	q	,
		q1
	)

◆ for()

for ( i=2; i< nb; i++ )

◆ if()

if ( nq == i )

Variable Documentation

◆ ldh

int ldh = *pldh

◆ ldq

int ldq = *pldq

◆ q2

__SSE_DATATYPE q2 = _SSE_LOAD(&q[offset])

◆ q3

__SSE_DATATYPE q3 = _SSE_LOAD(&q[2*offset])

◆ q4

__SSE_DATATYPE q4 = _SSE_LOAD(&q[3*offset])

◆ q5

__SSE_DATATYPE q5 = _SSE_LOAD(&q[4*offset])

◆ worked_on

int worked_on = 0

Unrolled kernel that computes 12 rows of Q simultaneously (when DOUBLE_PRECISION_REAL is defined) or 24 rows of Q simultaneously (when SINGLE_PRECISION_REAL is defined): a matrix-vector product with two Householder vectors plus a rank-2 update is performed.

Unrolled kernel that computes 10 rows of Q simultaneously (when DOUBLE_PRECISION_REAL is defined) or 20 rows of Q simultaneously (when SINGLE_PRECISION_REAL is defined): a matrix-vector product with two Householder vectors plus a rank-2 update is performed.

Unrolled kernel that computes 8 rows of Q simultaneously (when DOUBLE_PRECISION_REAL is defined) or 16 rows of Q simultaneously (when SINGLE_PRECISION_REAL is defined): a matrix-vector product with two Householder vectors plus a rank-2 update is performed.

Unrolled kernel that computes 6 rows of Q simultaneously (when DOUBLE_PRECISION_REAL is defined) or 12 rows of Q simultaneously (when SINGLE_PRECISION_REAL is defined): a matrix-vector product with two Householder vectors plus a rank-2 update is performed.

Unrolled kernel that computes 4 rows of Q simultaneously (when DOUBLE_PRECISION_REAL is defined) or 8 rows of Q simultaneously (when SINGLE_PRECISION_REAL is defined): a matrix-vector product with two Householder vectors plus a rank-2 update is performed.

Unrolled kernel that computes 2 rows of Q simultaneously (when DOUBLE_PRECISION_REAL is defined) or 4 rows of Q simultaneously (when SINGLE_PRECISION_REAL is defined): a matrix-vector product with two Householder vectors plus a rank-2 update is performed.