void matmul2(int dim, float *a, float *b, float *c) { float dot0, dot1, dot2, dot3, a0, a1, a2, a3, b0, b1, b2, b3; float dot4, dot5, dot6, dot7, a4, a5, a6, a7, b4, b5, b6, b7; long I, J, i, j, k; /* ...transpose b into tb like in matmul1...; */ for (I = 0; I < dim*dim; I += dim) for (j = J = 0; j < dim; ++j, J += dim) { dot0 = dot1 = dot2 = dot3 = dot4 = dot5 = dot6 = dot7 = 0.0; for (k = 0; k < dim; k += 8) { a0 = a[I + k + 0]; b0 = tb[J + k + 0]; a1 = a[I + k + 1]; b1 = tb[J + k + 1]; a2 = a[I + k + 2]; b2 = tb[J + k + 2]; a3 = a[I + k + 3]; b3 = tb[J + k + 3]; a4 = a[I + k + 4]; b4 = tb[J + k + 4]; a5 = a[I + k + 5]; b5 = tb[J + k + 5]; a6 = a[I + k + 6]; b6 = tb[J + k + 6]; a7 = a[I + k + 7]; b7 = tb[J + k + 7]; dot0 += a0 * b0; dot1 += a1 * b1; dot2 += a2 * b2; dot3 += a3 * b3; dot4 += a4 * b4; dot5 += a5 * b5; dot6 += a6 * b6; dot7 += a7 * b7; } c[I + j] = dot0 + dot1 + dot2 + dot3 + dot4 + dot5 + dot6 + dot7; } }