 
            Tested on AMD Seattle. Passes all crc32c unit tests. ~4x performance increase versus sctp.
Signed-off-by: Yazen Ghannam yazen.ghannam@linaro.org --- configure.ac | 1 + m4/ax_arm.m4 | 18 +++++++++++++--- src/arch/arm.c | 2 ++ src/arch/arm.h | 1 + src/common/Makefile.am | 11 ++++++++-- src/common/crc32c.cc | 6 ++++++ src/common/crc32c_arm64.c | 48 ++++++++++++++++++++++++++++++++++++++++++ src/common/crc32c_arm64.h | 25 ++++++++++++++++++++++ src/test/common/test_crc32c.cc | 9 ++++++++ 9 files changed, 116 insertions(+), 5 deletions(-) create mode 100644 src/common/crc32c_arm64.c create mode 100644 src/common/crc32c_arm64.h
diff --git a/configure.ac b/configure.ac index d836b02..60e4feb 100644 --- a/configure.ac +++ b/configure.ac @@ -575,6 +575,7 @@ AC_LANG_POP([C++]) # Find supported SIMD / NEON / SSE extensions supported by the compiler AX_ARM_FEATURES() AM_CONDITIONAL(HAVE_NEON, [ test "x$ax_cv_support_neon_ext" = "xyes"]) +AM_CONDITIONAL(HAVE_ARMV8_CRC, [ test "x$ax_cv_support_crc_ext" = "xyes"]) AX_INTEL_FEATURES() AM_CONDITIONAL(HAVE_SSSE3, [ test "x$ax_cv_support_ssse3_ext" = "xyes"]) AM_CONDITIONAL(HAVE_SSE4_PCLMUL, [ test "x$ax_cv_support_pclmuldq_ext" = "xyes"]) diff --git a/m4/ax_arm.m4 b/m4/ax_arm.m4 index 2ccc9a9..4ef8081 100644 --- a/m4/ax_arm.m4 +++ b/m4/ax_arm.m4 @@ -13,13 +13,25 @@ AC_DEFUN([AX_ARM_FEATURES], fi ;; aarch64*) + AX_CHECK_COMPILE_FLAG(-march=armv8-a, ax_cv_support_armv8=yes, []) + if test x"$ax_cv_support_armv8" = x"yes"; then + ARM_ARCH_FLAGS="-march=armv8-a" + ARM_DEFINE_FLAGS="-DARCH_AARCH64" + fi AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd, ax_cv_support_neon_ext=yes, []) if test x"$ax_cv_support_neon_ext" = x"yes"; then - ARM_NEON_FLAGS="-march=armv8-a+simd -DARCH_AARCH64 -DARM_NEON" - AC_SUBST(ARM_NEON_FLAGS) - ARM_FLAGS="$ARM_FLAGS $ARM_NEON_FLAGS" + ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+simd" + ARM_DEFINE_FLAGS="$ARM_DEFINE_FLAGS -DARM_NEON" AC_DEFINE(HAVE_NEON,,[Support NEON instructions]) fi + AX_CHECK_COMPILE_FLAG(-march=armv8-a+crc, ax_cv_support_crc_ext=yes, []) + if test x"$ax_cv_support_crc_ext" = x"yes"; then + ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+crc" + AC_DEFINE(HAVE_ARMV8_CRC,,[Support ARMv8 CRC instructions]) + fi + ARM_NEON_FLAGS="$ARM_ARCH_FLAGS $ARM_DEFINE_FLAGS" + AC_SUBST(ARM_NEON_FLAGS) + ARM_FLAGS="$ARM_FLAGS $ARM_NEON_FLAGS" ;; esac
diff --git a/src/arch/arm.c b/src/arch/arm.c index 93d079a..2ea97eb 100644 --- a/src/arch/arm.c +++ b/src/arch/arm.c @@ -2,6 +2,7 @@
/* flags we export */ int ceph_arch_neon = 0; +int ceph_arch_arm64_crc32 = 0;
#include <stdio.h>
@@ -47,6 +48,7 @@ int ceph_arch_arm_probe(void) ceph_arch_neon = (get_hwcap() & HWCAP_NEON) == HWCAP_NEON; #elif __aarch64__ && __linux__ ceph_arch_neon = (get_hwcap() & HWCAP_ASIMD) == HWCAP_ASIMD; + ceph_arch_arm64_crc32 = (get_hwcap() & HWCAP_CRC32) == HWCAP_CRC32; #else if (0) get_hwcap(); // make compiler shut up diff --git a/src/arch/arm.h b/src/arch/arm.h index f613438..4ac716a 100644 --- a/src/arch/arm.h +++ b/src/arch/arm.h @@ -6,6 +6,7 @@ extern "C" { #endif
extern int ceph_arch_neon; /* true if we have ARM NEON or ASIMD abilities */ +extern int ceph_arch_arm64_crc32; /* true if we have Aarch64 CRC32/CRC32C abilities */
extern int ceph_arch_arm_probe(void);
diff --git a/src/common/Makefile.am b/src/common/Makefile.am index 2888194..4c36216 100644 --- a/src/common/Makefile.am +++ b/src/common/Makefile.am @@ -103,12 +103,18 @@ libcommon_crc_la_SOURCES = \ common/sctp_crc32.c \ common/crc32c.cc \ common/crc32c_intel_baseline.c \ - common/crc32c_intel_fast.c + common/crc32c_intel_fast.c \ + common/crc32c_arm64.c
if WITH_GOOD_YASM_ELF64 libcommon_crc_la_SOURCES += common/crc32c_intel_fast_asm.S common/crc32c_intel_fast_zero_asm.S libcommon_crc_la_LIBTOOLFLAGS = --tag=CC endif + +if HAVE_ARMV8_CRC +libcommon_crc_la_CFLAGS = $(AM_CFLAGS) $(ARM_FLAGS) +endif + LIBCOMMON_DEPS += libcommon_crc.la noinst_LTLIBRARIES += libcommon_crc.la
@@ -116,7 +122,8 @@ noinst_HEADERS += \ common/bloom_filter.hpp \ common/sctp_crc32.h \ common/crc32c_intel_baseline.h \ - common/crc32c_intel_fast.h + common/crc32c_intel_fast.h \ + common/crc32c_arm64.h
# important; libmsg before libauth! diff --git a/src/common/crc32c.cc b/src/common/crc32c.cc index e2e81a4..540d431 100644 --- a/src/common/crc32c.cc +++ b/src/common/crc32c.cc @@ -5,9 +5,11 @@
#include "arch/probe.h" #include "arch/intel.h" +#include "arch/arm.h" #include "common/sctp_crc32.h" #include "common/crc32c_intel_baseline.h" #include "common/crc32c_intel_fast.h" +#include "common/crc32c_arm64.h"
/* * choose best implementation based on the CPU architecture. @@ -24,6 +26,10 @@ ceph_crc32c_func_t ceph_choose_crc32(void) return ceph_crc32c_intel_fast; }
+ if (ceph_arch_arm64_crc32){ + return ceph_crc32c_arm64; + } + // default return ceph_crc32c_sctp; } diff --git a/src/common/crc32c_arm64.c b/src/common/crc32c_arm64.c new file mode 100644 index 0000000..6a754fb --- /dev/null +++ b/src/common/crc32c_arm64.c @@ -0,0 +1,48 @@ +#include "acconfig.h" +#include "include/int_types.h" +#include "common/crc32c_arm64.h" + +#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) + +uint32_t ceph_crc32c_arm64(uint32_t crc, unsigned char const *buffer, unsigned len) +{ + int64_t length = len; + + if (!buffer) { + + while ((length -= sizeof(uint64_t)) >= 0) + CRC32CX(crc, 0); + + /* The following is more efficient than the straight loop */ + if (length & sizeof(uint32_t)) + CRC32CW(crc, 0); + + if (length & sizeof(uint16_t)) + CRC32CH(crc, 0); + + if (length & sizeof(uint8_t)) + CRC32CB(crc, 0); + } else { + while ((length -= sizeof(uint64_t)) >= 0) { + CRC32CX(crc, *(uint64_t *)buffer); + buffer += sizeof(uint64_t); + } + + /* The following is more efficient than the straight loop */ + if (length & sizeof(uint32_t)) { + CRC32CW(crc, *(uint32_t *)buffer); + buffer += sizeof(uint32_t); + } + if (length & sizeof(uint16_t)) { + CRC32CH(crc, *(uint16_t *)buffer); + buffer += sizeof(uint16_t); + } + if (length & sizeof(uint8_t)) + CRC32CB(crc, *buffer); + } + return crc; +} + diff --git a/src/common/crc32c_arm64.h b/src/common/crc32c_arm64.h new file mode 100644 index 0000000..100b5e4 --- /dev/null +++ b/src/common/crc32c_arm64.h @@ -0,0 +1,25 @@ +#ifndef CEPH_COMMON_CRC32C_ARM64_H +#define CEPH_COMMON_CRC32C_ARM64_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __aarch64__ + +extern uint32_t ceph_crc32c_arm64(uint32_t crc, unsigned char const *buffer, unsigned len); + +#else + +static inline uint32_t ceph_crc32c_arm64(uint32_t crc, unsigned char const *buffer, unsigned len) +{ + return 0; +} + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/test/common/test_crc32c.cc b/src/test/common/test_crc32c.cc index b4297c6..1a9b2e2 100644 --- a/src/test/common/test_crc32c.cc +++ b/src/test/common/test_crc32c.cc @@ -13,6 +13,7 @@
#include "common/sctp_crc32.h" #include "common/crc32c_intel_baseline.h" +#include "common/crc32c_arm64.h"
TEST(Crc32c, Small) { const char *a = "foo bar baz"; @@ -80,6 +81,14 @@ TEST(Crc32c, Performance) { std::cout << "intel baseline = " << rate << " MB/sec" << std::endl; ASSERT_EQ(261108528u, val); } + { + utime_t start = ceph_clock_now(NULL); + unsigned val = ceph_crc32c_arm64(0, (unsigned char *)a, len); + utime_t end = ceph_clock_now(NULL); + float rate = (float)len / (float)(1024*1024) / (float)(end - start); + std::cout << "arm64 baseline = " << rate << " MB/sec" << std::endl; + ASSERT_EQ(261108528u, val); + }
}
 
            Hi Yazen,
I have a few comments below.
Cheers,
 
            On Thu, Jan 22, 2015 at 4:58 AM, Steve Capper steve.capper@linaro.org wrote:
Hi Yazen,
I have a few comments below.
Cheers,
Steve
On Wed, Jan 21, 2015 at 02:17:47PM -0600, Yazen Ghannam wrote:
Tested on AMD Seattle. Passes all crc32c unit tests. ~4x performance increase versus sctp.
Was that perf result from the unit test?
Yes, this was from the unit test.
Also describe the reason for the patch in the commit log; we have an optional CRC instruction that can be used instead of the Ceph in-built table lookup. This patch uses the CRC instruction where available...
Unit tests are added and they show a ...
Probably also mention that this based off Ed Nevill's Hadoop patch.
Will do.
Signed-off-by: Yazen Ghannam yazen.ghannam@linaro.org
configure.ac | 1 + m4/ax_arm.m4 | 18 +++++++++++++--- src/arch/arm.c | 2 ++ src/arch/arm.h | 1 + src/common/Makefile.am | 11 ++++++++-- src/common/crc32c.cc | 6 ++++++ src/common/crc32c_arm64.c | 48 ++++++++++++++++++++++++++++++++++++++++++ src/common/crc32c_arm64.h | 25 ++++++++++++++++++++++ src/test/common/test_crc32c.cc | 9 ++++++++ 9 files changed, 116 insertions(+), 5 deletions(-) create mode 100644 src/common/crc32c_arm64.c create mode 100644 src/common/crc32c_arm64.h
diff --git a/configure.ac b/configure.ac index d836b02..60e4feb 100644 --- a/configure.ac +++ b/configure.ac @@ -575,6 +575,7 @@ AC_LANG_POP([C++]) # Find supported SIMD / NEON / SSE extensions supported by the compiler AX_ARM_FEATURES() AM_CONDITIONAL(HAVE_NEON, [ test "x$ax_cv_support_neon_ext" = "xyes"]) +AM_CONDITIONAL(HAVE_ARMV8_CRC, [ test "x$ax_cv_support_crc_ext" = "xyes"]) AX_INTEL_FEATURES() AM_CONDITIONAL(HAVE_SSSE3, [ test "x$ax_cv_support_ssse3_ext" = "xyes"]) AM_CONDITIONAL(HAVE_SSE4_PCLMUL, [ test "x$ax_cv_support_pclmuldq_ext" = "xyes"]) diff --git a/m4/ax_arm.m4 b/m4/ax_arm.m4 index 2ccc9a9..4ef8081 100644 --- a/m4/ax_arm.m4 +++ b/m4/ax_arm.m4 @@ -13,13 +13,25 @@ AC_DEFUN([AX_ARM_FEATURES], fi ;; aarch64*)
AX_CHECK_COMPILE_FLAG(-march=armv8-a, ax_cv_support_armv8=yes, [])
if test x"$ax_cv_support_armv8" = x"yes"; then
ARM_ARCH_FLAGS="-march=armv8-a"
ARM_DEFINE_FLAGS="-DARCH_AARCH64"
fi AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd, ax_cv_support_neon_ext=yes, []) if test x"$ax_cv_support_neon_ext" = x"yes"; then
ARM_NEON_FLAGS="-march=armv8-a+simd -DARCH_AARCH64 -DARM_NEON"
AC_SUBST(ARM_NEON_FLAGS)
ARM_FLAGS="$ARM_FLAGS $ARM_NEON_FLAGS"
ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+simd"
ARM_DEFINE_FLAGS="$ARM_DEFINE_FLAGS -DARM_NEON" AC_DEFINE(HAVE_NEON,,[Support NEON instructions]) fi
AX_CHECK_COMPILE_FLAG(-march=armv8-a+crc, ax_cv_support_crc_ext=yes, [])
if test x"$ax_cv_support_crc_ext" = x"yes"; then
ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+crc"
AC_DEFINE(HAVE_ARMV8_CRC,,[Support ARMv8 CRC instructions])
fi
ARM_NEON_FLAGS="$ARM_ARCH_FLAGS $ARM_DEFINE_FLAGS"
AC_SUBST(ARM_NEON_FLAGS)
ARM_FLAGS="$ARM_FLAGS $ARM_NEON_FLAGS"So if the compiler supports CRC, then ARM_FLAGS will be set with CRC compile options. This will also affect the jerasure codegen (which doesn't check for the CRC HWCAP). Although this is unlikely to cause problems, I would feel more comfortable with something like: ARM_CRC_FLAGS That way you disambiguate between NEON (for jerasure) and CRC.
It looks to me that jerasure only uses ARM_NEON_FLAGS. What I'll do is have minimal flags for each feature (e.g. NEON and CRC) and put all the available flags in ARM_FLAGS.
;;esac
diff --git a/src/arch/arm.c b/src/arch/arm.c index 93d079a..2ea97eb 100644 --- a/src/arch/arm.c +++ b/src/arch/arm.c @@ -2,6 +2,7 @@
/* flags we export */ int ceph_arch_neon = 0; +int ceph_arch_arm64_crc32 = 0;
<bikeshedding-comment> Can we rename this variable: ceph_arch_aarch64_crc32 ? </bikeshedding-comment>
Will do.
#include <stdio.h>
@@ -47,6 +48,7 @@ int ceph_arch_arm_probe(void) ceph_arch_neon = (get_hwcap() & HWCAP_NEON) == HWCAP_NEON; #elif __aarch64__ && __linux__ ceph_arch_neon = (get_hwcap() & HWCAP_ASIMD) == HWCAP_ASIMD;
ceph_arch_arm64_crc32 = (get_hwcap() & HWCAP_CRC32) == HWCAP_CRC32;#else if (0) get_hwcap(); // make compiler shut up diff --git a/src/arch/arm.h b/src/arch/arm.h index f613438..4ac716a 100644 --- a/src/arch/arm.h +++ b/src/arch/arm.h @@ -6,6 +6,7 @@ extern "C" { #endif
extern int ceph_arch_neon; /* true if we have ARM NEON or ASIMD abilities */ +extern int ceph_arch_arm64_crc32; /* true if we have Aarch64 CRC32/CRC32C abilities */
"AArch64", we have two capital A's.
Ack.
extern int ceph_arch_arm_probe(void);
diff --git a/src/common/Makefile.am b/src/common/Makefile.am index 2888194..4c36216 100644 --- a/src/common/Makefile.am +++ b/src/common/Makefile.am @@ -103,12 +103,18 @@ libcommon_crc_la_SOURCES = \ common/sctp_crc32.c \ common/crc32c.cc \ common/crc32c_intel_baseline.c \
common/crc32c_intel_fast.c
common/crc32c_intel_fast.c \
common/crc32c_arm64.cI would recommend crc32c_aarch64.c here too.
Ack.
if WITH_GOOD_YASM_ELF64 libcommon_crc_la_SOURCES += common/crc32c_intel_fast_asm.S common/crc32c_intel_fast_zero_asm.S libcommon_crc_la_LIBTOOLFLAGS = --tag=CC endif
+if HAVE_ARMV8_CRC +libcommon_crc_la_CFLAGS = $(AM_CFLAGS) $(ARM_FLAGS) +endif
The problem with this is that all the files in the libcommon_crc_la_SOURCES will be compiled with this flag. We want to limit the scope to just the ARM CRC file.
I don't understand the issue here. Can you please explain? There is a check for HAVE_ARMV8_CRC which is only set if these flags are valid. This seems to work fine on aarch64 and x86.
LIBCOMMON_DEPS += libcommon_crc.la noinst_LTLIBRARIES += libcommon_crc.la
@@ -116,7 +122,8 @@ noinst_HEADERS += \ common/bloom_filter.hpp \ common/sctp_crc32.h \ common/crc32c_intel_baseline.h \
common/crc32c_intel_fast.h
common/crc32c_intel_fast.h \
common/crc32c_arm64.h# important; libmsg before libauth! diff --git a/src/common/crc32c.cc b/src/common/crc32c.cc index e2e81a4..540d431 100644 --- a/src/common/crc32c.cc +++ b/src/common/crc32c.cc @@ -5,9 +5,11 @@
#include "arch/probe.h" #include "arch/intel.h" +#include "arch/arm.h" #include "common/sctp_crc32.h" #include "common/crc32c_intel_baseline.h" #include "common/crc32c_intel_fast.h" +#include "common/crc32c_arm64.h"
Does pulling these ARM header files in affect how this compiles for other architectures? Could you please give this a quick build on x86?
Builds fine on x86. I followed the same format as the intel files where arch-specific stuff is #ifdef'd out. See next comment.
/*
- choose best implementation based on the CPU architecture.
@@ -24,6 +26,10 @@ ceph_crc32c_func_t ceph_choose_crc32(void) return ceph_crc32c_intel_fast; }
- if (ceph_arch_arm64_crc32){
- return ceph_crc32c_arm64;
- }
- // default return ceph_crc32c_sctp;
} diff --git a/src/common/crc32c_arm64.c b/src/common/crc32c_arm64.c new file mode 100644 index 0000000..6a754fb --- /dev/null +++ b/src/common/crc32c_arm64.c @@ -0,0 +1,48 @@ +#include "acconfig.h" +#include "include/int_types.h" +#include "common/crc32c_arm64.h"
+#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
This won't likely build for other architectures. I would recommend something like:
#ifdef __aarch64__ CRC routines #else empty functions #endif /* __arch64__ */
Yes, you are correct. I will add a check here. The empty function is defined in the header file.
+uint32_t ceph_crc32c_arm64(uint32_t crc, unsigned char const *buffer, unsigned len) +{
int64_t length = len;
if (!buffer) {
while ((length -= sizeof(uint64_t)) >= 0)
CRC32CX(crc, 0);
/* The following is more efficient than the straight loop */
if (length & sizeof(uint32_t))
CRC32CW(crc, 0);
if (length & sizeof(uint16_t))
CRC32CH(crc, 0);
if (length & sizeof(uint8_t))
CRC32CB(crc, 0);
} else {
while ((length -= sizeof(uint64_t)) >= 0) {
CRC32CX(crc, *(uint64_t *)buffer);
buffer += sizeof(uint64_t);
}
/* The following is more efficient than the straight loop */
if (length & sizeof(uint32_t)) {
CRC32CW(crc, *(uint32_t *)buffer);
buffer += sizeof(uint32_t);
}
if (length & sizeof(uint16_t)) {
CRC32CH(crc, *(uint16_t *)buffer);
buffer += sizeof(uint16_t);
}
if (length & sizeof(uint8_t))
CRC32CB(crc, *buffer);
}
return crc;+}
diff --git a/src/common/crc32c_arm64.h b/src/common/crc32c_arm64.h new file mode 100644 index 0000000..100b5e4 --- /dev/null +++ b/src/common/crc32c_arm64.h @@ -0,0 +1,25 @@ +#ifndef CEPH_COMMON_CRC32C_ARM64_H +#define CEPH_COMMON_CRC32C_ARM64_H
+#ifdef __cplusplus +extern "C" { +#endif
+#ifdef __aarch64__
+extern uint32_t ceph_crc32c_arm64(uint32_t crc, unsigned char const *buffer, unsigned len);
+#else
+static inline uint32_t ceph_crc32c_arm64(uint32_t crc, unsigned char const *buffer, unsigned len) +{
return 0;+}
+#endif
+#ifdef __cplusplus +} +#endif
+#endif diff --git a/src/test/common/test_crc32c.cc b/src/test/common/test_crc32c.cc index b4297c6..1a9b2e2 100644 --- a/src/test/common/test_crc32c.cc +++ b/src/test/common/test_crc32c.cc @@ -13,6 +13,7 @@
#include "common/sctp_crc32.h" #include "common/crc32c_intel_baseline.h" +#include "common/crc32c_arm64.h"
TEST(Crc32c, Small) { const char *a = "foo bar baz"; @@ -80,6 +81,14 @@ TEST(Crc32c, Performance) { std::cout << "intel baseline = " << rate << " MB/sec" << std::endl; ASSERT_EQ(261108528u, val); }
- {
- utime_t start = ceph_clock_now(NULL);
- unsigned val = ceph_crc32c_arm64(0, (unsigned char *)a, len);
- utime_t end = ceph_clock_now(NULL);
- float rate = (float)len / (float)(1024*1024) / (float)(end - start);
- std::cout << "arm64 baseline = " << rate << " MB/sec" << std::endl;
- ASSERT_EQ(261108528u, val);
- }
What about x86? :-)
Unfortunately, I don't have an x86 system with SSE4, but I'll try to find one and test this for reference. Also, I'm adding a check here to remove the test if HAVE_ARMV8_CRC is not defined.
}
-- 2.2.0

