Page 1 of 1

Technical Details

Posted: Sat May 23, 2020 6:55 pm
by katsura
  • memmove()/bzero() measurements represent how well the OS (C library) optimized memmove()/bzero() functions for different CPU architectures.
  • rep movsb/stosb measurements represent how well the ERMS (Enhanced REP MOVSB/STOSB) feature is implemented on different CPU architectures. Intel only.
  • temporal load/store measurements represent how well memory loads/stores (reads/writes) perform when going through the memory cache.
  • non-temporal (aka streaming) load/store measurements represent how well memory loads/stores (reads/writes) perform without going through the memory cache.
The temporal and non-temporal load/store measurements are optimized to use the load/store instructions available on each instruction-set level: pre-SSE4.1, SSE4.1, and AVX2.

pre-SSE4.1:

Code: Select all

// temporal block read
//
// Walks `src` with ordinary (cache-allocating) 16-byte SSE loads, 64 bytes per
// loop iteration. The loaded values are discarded.
//
//   dst  not accessed; returned unchanged
//   src  start of the region to read; must be 16-byte aligned (movaps)
//   bs   number of bytes to read; consumed in 64-byte steps, so a size that is
//        not a multiple of 64 is rounded up to the next multiple
//
// Returns dst.
void *tbread(void *dst, const void *src, size_t bs)
{
    __m128 x0, x1, x2, x3;

    // The asm loop tests its counter only after the first 64 bytes have been
    // loaded, so an empty request has to be rejected before entering it.
    if (bs == 0) {
        return dst;
    }

    __asm__ __volatile__ (
        "1:\n"
        // temporal load. cache pollution.
        "movaps (%0), %2\n"
        "movaps 0x10(%0), %3\n"
        "movaps 0x20(%0), %4\n"
        "movaps 0x30(%0), %5\n"
        "addq $0x40, %0\n"
        "subq $0x40, %1\n"
        // loop while the remaining count is still above zero (no borrow, not zero)
        "ja 1b\n"
        : "+r"(src) /* %0 */, "+r"(bs) /* %1 */,
          "=x"(x0) /* %2 */, "=x"(x1) /* %3 */, "=x"(x2) /* %4 */, "=x"(x3) /* %5 */ /* output */
        : /* input */
        : "memory", "cc" /* clobbered: asm reads memory not listed as an operand; subq sets flags */
    );
    return dst;
}

// temporal block zero fill
//
// Fills `dst` with zero bytes using ordinary (cache-allocating) 16-byte SSE
// stores, 64 bytes per loop iteration.
//
//   dst  start of the region to clear; must be 16-byte aligned (movaps)
//   src  ignored
//   bs   number of bytes to clear; consumed in 64-byte steps, so a size that is
//        not a multiple of 64 is rounded up to the next multiple
//
// Returns dst.
void *tbzero(void *dst, const void *src, size_t bs)
{
    (void)src;

    void *p = dst;
    __m128 x0;

    // The asm loop tests its counter only after the first 64 bytes have been
    // stored, so an empty request has to be rejected before entering it.
    if (bs == 0) {
        return dst;
    }

    __asm__ __volatile__ (
        "xorps %2, %2\n"
        "1:\n"
        // temporal store. cache pollution.
        "movaps %2, (%0)\n"
        "movaps %2, 0x10(%0)\n"
        "movaps %2, 0x20(%0)\n"
        "movaps %2, 0x30(%0)\n"
        "addq $0x40, %0\n"
        "subq $0x40, %1\n"
        // loop while the remaining count is still above zero (no borrow, not zero)
        "ja 1b\n"
        : "+r"(p) /* %0 */, "+r"(bs) /* %1 */, "=x"(x0) /* %2 */ /* output */
        : /* input */
        : "memory", "cc" /* clobbered */
    );
    return dst;
}

// non-temporal block read
//
// Baseline variant for CPUs without SSE4.1: no non-temporal load is available
// at this level, so this routine issues the same temporal movaps loads as the
// temporal reader, 64 bytes per loop iteration. The loaded values are
// discarded.
//
//   dst  not accessed; returned unchanged
//   src  start of the region to read; must be 16-byte aligned (movaps)
//   bs   number of bytes to read; consumed in 64-byte steps, so a size that is
//        not a multiple of 64 is rounded up to the next multiple
//
// Returns dst.
void *ntbread(void *dst, const void *src, size_t bs)
{
    __m128 x0, x1, x2, x3;

    // The asm loop tests its counter only after the first 64 bytes have been
    // loaded, so an empty request has to be rejected before entering it.
    if (bs == 0) {
        return dst;
    }

    __asm__ __volatile__ (
        "1:\n"
        // non-temporal load is not available in SSE2. using movaps (temporal load).
        // NOTE: temporal load. cache pollution.
        "movaps (%0), %2\n"
        "movaps 0x10(%0), %3\n"
        "movaps 0x20(%0), %4\n"
        "movaps 0x30(%0), %5\n"
        "addq $0x40, %0\n"
        "subq $0x40, %1\n"
        // loop while the remaining count is still above zero (no borrow, not zero)
        "ja 1b\n"
        : "+r"(src) /* %0 */, "+r"(bs) /* %1 */,
          "=x"(x0) /* %2 */, "=x"(x1) /* %3 */, "=x"(x2) /* %4 */, "=x"(x3) /* %5 */ /* output */
        : /* input */
        : "memory", "cc" /* clobbered: asm reads memory not listed as an operand; subq sets flags */
    );
    return dst;
}

// non-temporal block zero fill
//
// Fills `dst` with zero bytes using movntdq (SSE2) streaming stores, 64 bytes
// per loop iteration.
//
//   dst  start of the region to clear; must be 16-byte aligned (movntdq)
//   src  ignored
//   bs   number of bytes to clear; consumed in 64-byte steps, so a size that is
//        not a multiple of 64 is rounded up to the next multiple
//
// Returns dst.
void *ntbzero(void *dst, const void *src, size_t bs)
{
    (void)src;

    void *p = dst;
    __m128 x0;

    // The asm loop tests its counter only after the first 64 bytes have been
    // stored, so an empty request has to be rejected before entering it.
    if (bs == 0) {
        return dst;
    }

    __asm__ __volatile__ (
        "xorps %2, %2\n"
        "1:\n"
        // movntdq (SSE2): Non-temporal store of double quadword from an XMM register into memory
        // non-temporal store. no cache pollution.
        "movntdq %2, (%0)\n"
        "movntdq %2, 0x10(%0)\n"
        "movntdq %2, 0x20(%0)\n"
        "movntdq %2, 0x30(%0)\n"
        "addq $0x40, %0\n"
        "subq $0x40, %1\n"
        // loop while the remaining count is still above zero (no borrow, not zero)
        "ja 1b\n"
        : "+r"(p) /* %0 */, "+r"(bs) /* %1 */, "=x"(x0) /* %2 */ /* output */
        : /* input */
        : "memory", "cc" /* clobbered */
    );
    return dst;
}
SSE4.1:

Code: Select all

// non-temporal block read (non-temporal load is available on SSE4.1 and up)
//
// Walks `src` with movntdqa loads, 64 bytes per loop iteration. The loaded
// values are discarded.
//
// movntdqa (SSE4.1): Move double quadword from m128 to xmm1 using non-temporal
// hint if WC memory type.
//
//   dst  not accessed; returned unchanged
//   src  start of the region to read; must be 16-byte aligned (movntdqa)
//   bs   number of bytes to read; consumed in 64-byte steps, so a size that is
//        not a multiple of 64 is rounded up to the next multiple
//
// Returns dst.
void *ntbread_sse41(void *dst, const void *src, size_t bs)
{
    __m128 x0, x1, x2, x3;

    // The asm loop tests its counter only after the first 64 bytes have been
    // loaded, so an empty request has to be rejected before entering it.
    if (bs == 0) {
        return dst;
    }

    __asm__ __volatile__ (
        "1:\n"
        // non-temporal load. no cache pollution.
        "movntdqa (%0), %2\n"
        "movntdqa 0x10(%0), %3\n"
        "movntdqa 0x20(%0), %4\n"
        "movntdqa 0x30(%0), %5\n"
        "addq $0x40, %0\n"
        "subq $0x40, %1\n"
        // loop while the remaining count is still above zero (no borrow, not zero)
        "ja 1b\n"
        : "+r"(src) /* %0 */, "+r"(bs) /* %1 */,
          "=x"(x0) /* %2 */, "=x"(x1) /* %3 */, "=x"(x2) /* %4 */, "=x"(x3) /* %5 */ /* output */
        : /* input */
        : "memory", "cc" /* clobbered: asm reads memory not listed as an operand; subq sets flags */
    );
    return dst;
}

// non-temporal block zero fill
//
// Fills `dst` with zero bytes using movntdq streaming stores, 64 bytes per
// loop iteration. Only the SSE2 store instruction is used here; the SSE4.1
// movntdqa instruction is a load and does not appear in this routine.
//
// movntdq (SSE2): Move packed integer values in xmm1 to m128 using
// non-temporal hint.
//
//   dst  start of the region to clear; must be 16-byte aligned (movntdq)
//   src  ignored
//   bs   number of bytes to clear; consumed in 64-byte steps, so a size that is
//        not a multiple of 64 is rounded up to the next multiple
//
// Returns dst.
void *ntbzero_sse41(void *dst, const void *src, size_t bs)
{
    (void)src;

    void *p = dst;
    __m128 x0;

    // The asm loop tests its counter only after the first 64 bytes have been
    // stored, so an empty request has to be rejected before entering it.
    if (bs == 0) {
        return dst;
    }

    __asm__ __volatile__ (
        "xorps %2, %2\n"
        "1:\n"
        // non-temporal store. no cache pollution.
        "movntdq %2, (%0)\n"
        "movntdq %2, 0x10(%0)\n"
        "movntdq %2, 0x20(%0)\n"
        "movntdq %2, 0x30(%0)\n"
        "addq $0x40, %0\n"
        "subq $0x40, %1\n"
        // loop while the remaining count is still above zero (no borrow, not zero)
        "ja 1b\n"
        : "+r"(p) /* %0 */, "+r"(bs) /* %1 */, "=x"(x0) /* %2 */ /* output */
        : /* input */
        : "memory", "cc" /* clobbered */
    );
    return dst;
}
AVX2:

Code: Select all

// temporal block read
void *tbread_avx2(void *dst, const void *src, size_t bs)
{
    __m256 y0, y1;
    __asm__ __volatile__ (
        "BREAD1_%=:\n"
        // temporal load. cache pollution.
        "vmovaps (%0), %2\n"
        "vmovaps 0x20(%0), %3\n"
        "addq $0x40, %0\n"
        "subq $0x40, %1\n"
        "ja BREAD1_%=\n"
        : "=r"(src) /* %0 */, "=r"(bs) /* %1 */, "=x"(y0) /* %2 */, "=x"(y1) /* %3 */ /* output */
        : "0"(src) /* %4 */, "1"(bs) /* %5 */ /* input */
        : /* clobbered */
    );
    return dst;
}

// temporal block zero fill
void *tbzero_avx2(void *dst, const void *src, size_t bs)
{
#pragma unused(src)
    void *p = dst;
    __m128 x0;
    __m256 y0;
    __asm__ __volatile__ (
        //
        // vmovaps (AVX):    Move aligned packed single-precision floating-point values from ymm2/mem to ymm1.
        //
        "xorps %3, %3\n"
        "vbroadcastss %3, %2\n"
        "BZERO1_%=:\n"
        // temporal store. cache pollution.
        "vmovaps %2, (%0)\n"
        "vmovaps %2, 0x20(%0)\n"
        "addq $0x40, %0\n"
        "subq $0x40, %1\n"
        "ja BZERO1_%=\n"
        : "=r"(p) /* %0 */, "=r"(bs) /* %1 */, "=x"(y0) /* %2 */, "=x"(x0) /* %3 */ /* output */
        : "0"(p) /* %4 */, "1"(bs) /* %5 */ /* input */
        : "memory" /* clobbered */
    );
    return dst;
}

// non-temporal block read (non-temporal 256-bit load is available on AVX2 and up)
void *ntbread_avx2(void *dst, const void *src, size_t bs)
{
    __m256 y0, y1;
    __asm__ __volatile__ (
        "BREAD1_%=:\n"
        // non-temporal load. no cache pollution.
        "vmovntdqa (%0), %2\n"
        "vmovntdqa 0x20(%0), %3\n"
        "addq $0x40, %0\n"
        "subq $0x40, %1\n"
        "ja BREAD1_%=\n"
        : "=r"(src) /* %0 */, "=r"(bs) /* %1 */, "=x"(y0) /* %2 */, "=x"(y1) /* %3 */ /* output */
        : "0"(src) /* %4 */, "1"(bs) /* %5 */ /* input */
        : /* clobbered */
    );
    return dst;
}

// non-temporal block zero fill
void *ntbzero_avx2(void *dst, const void *src, size_t bs)
{
#pragma unused(src)
    void *p = dst;
    __m128 x0;
    __m256 y0;
    __asm__ __volatile__ (
        //
        // vmovntdq (AVX):   Move packed integer values in ymm1 to m256 using non-temporal hint.
        //
        "xorps %3, %3\n"
        "vbroadcastss %3, %2\n"
        "BZERO1_%=:\n"
        // non-temporal store. no cache pollution.
        "vmovntdq %2, (%0)\n"
        "vmovntdq %2, 0x20(%0)\n"
        "addq $0x40, %0\n"
        "subq $0x40, %1\n"
        "ja BZERO1_%=\n"
        : "=r"(p) /* %0 */, "=r"(bs) /* %1 */, "=x"(y0) /* %2 */, "=x"(x0) /* %3 */ /* output */
        : "0"(p) /* %4 */, "1"(bs) /* %5 */ /* input */
        : "memory" /* clobbered */
    );
    return dst;
}
Apple Silicon (arm64):

Code: Select all

// temporal block read
void *tbread(void *dst, const void *src, size_t bs)
{
    __asm__ __volatile__ (
        "BREAD1_%=:\n"
        "ldp    q0, q1, [%0]\n"
        "ldp    q2, q3, [%0, #0x20]\n"
        "add    %0, %0, #0x40\n"
        "subs   %1, %1, #0x40\n"
        "b.hi   BREAD1_%=\n"
        : "=r"(src) /* %0 */, "=r"(bs) /* %1 */ /* output */
        : "0"(src) /* %0 */, "1"(bs) /* %1 */ /* input */
        : "v0", "v1", "v2", "v3" /* clobbered */
    );
    return dst;
}

// non-temporal block read
void *ntbread(void *dst, const void *src, size_t bs)
{
    __asm__ __volatile__ (
        "BREAD1_%=:\n"
        "ldnp   q0, q1, [%0]\n"
        "ldnp   q2, q3, [%0, #0x20]\n"
        "add    %0, %0, #0x40\n"
        "subs   %1, %1, #0x40\n"
        "b.hi   BREAD1_%=\n"
        : "=r"(src) /* %0 */, "=r"(bs) /* %1 */ /* output */
        : "0"(src) /* %0 */, "1"(bs) /* %1 */ /* input */
        : "v0", "v1", "v2", "v3" /* clobbered */
    );
    return dst;
}

// temporal block write
void *tbzero(void *dst, const void *src, size_t bs)
{
#pragma unused(src)
    void *p = dst;
    __asm__ __volatile__ (
        // STP: Store Pair of Registers.
        "eor.16b v0, v0, v0\n"
        "BZERO1_%=:\n"
        // temporal store. cache pollution.
        "stp     q0, q0, [%0]\n"
        "stp     q0, q0, [%0, #0x20]\n"
        "add     %0, %0, #0x40\n"
        "subs    %1, %1, #0x40\n"
        "b.hi    BZERO1_%=\n"
        : "=r"(p) /* %0 */, "=r"(bs) /* %1 */ /* output */
        : "0"(p) /* %0 */, "1"(bs) /* %1 */ /* input */
        : "v0", "memory" /* clobbered */
    );
    return dst;
}

// non-temporal block write
void *ntbzero(void *dst, const void *src, size_t bs)
{
#pragma unused(src)
    void *p = dst;
    __asm__ __volatile__ (
        // STNP: Store Pair of Registers, with non-temporal hint.
        "eor.16b v0, v0, v0\n"
        "BZERO1_%=:\n"
        // non-temporal store. no cache pollution.
        "stnp    q0, q0, [%0]\n"
        "stnp    q0, q0, [%0, #0x20]\n"
        "add     %0, %0, #0x40\n"
        "subs    %1, %1, #0x40\n"
        "b.hi    BZERO1_%=\n"
        : "=r"(p) /* %0 */, "=r"(bs) /* %1 */ /* output */
        : "0"(p) /* %0 */, "1"(bs) /* %1 */ /* input */
        : "v0", "memory" /* clobbered */
    );
    return dst;
}