This is looking good, just a few minor comments below.
-- 
Steve
On Fri, Jan 23, 2015 at 09:13:42AM -0600, Yazen Ghannam wrote:
> ARMv8 defines a set of optional CRC32/CRC32C instructions.
> This patch defines an optimized function that uses these
> instructions when available rather than table-based lookup.
> Optimized function based on a Hadoop patch by Ed Nevill.
> 
> Autotools updated to check for compiler support.
> Optimized function is selected at runtime based on HWCAP_CRC32.
> Added crc32c "performance" unit test and arch unit test.
> 
> Tested on AMD Seattle.
> Passes all crc32c unit tests.
> Unit test shows ~4x performance increase versus sctp.
> 
> Signed-off-by: Yazen Ghannam 
yazen.ghannam@linaro.org
> ---
>  configure.ac                   |  1 +
>  m4/ax_arm.m4                   | 18 +++++++++++--
>  src/arch/arm.c                 |  2 ++
>  src/arch/arm.h                 |  1 +
>  src/common/Makefile.am         | 10 +++++++-
>  src/common/crc32c.cc           |  6 +++++
>  src/common/crc32c_aarch64.c    | 58 ++++++++++++++++++++++++++++++++++++++++++
>  src/common/crc32c_aarch64.h    | 25 ++++++++++++++++++
>  src/test/common/test_crc32c.cc | 11 ++++++++
>  src/test/test_arch.cc          |  3 +++
>  10 files changed, 132 insertions(+), 3 deletions(-)
>  create mode 100644 src/common/crc32c_aarch64.c
>  create mode 100644 src/common/crc32c_aarch64.h
> 
> diff --git a/configure.ac b/configure.ac
> index d836b02..60e4feb 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -575,6 +575,7 @@ AC_LANG_POP([C++])
>  # Find supported SIMD / NEON / SSE extensions supported by the compiler
>  AX_ARM_FEATURES()
>  AM_CONDITIONAL(HAVE_NEON, [ test "x$ax_cv_support_neon_ext" = "xyes"])
> +AM_CONDITIONAL(HAVE_ARMV8_CRC, [ test "x$ax_cv_support_crc_ext" = "xyes"])
>  AX_INTEL_FEATURES()
>  AM_CONDITIONAL(HAVE_SSSE3, [ test "x$ax_cv_support_ssse3_ext" = "xyes"])
>  AM_CONDITIONAL(HAVE_SSE4_PCLMUL, [ test "x$ax_cv_support_pclmuldq_ext" = "xyes"])
> diff --git a/m4/ax_arm.m4 b/m4/ax_arm.m4
> index 2ccc9a9..37ea0aa 100644
> --- a/m4/ax_arm.m4
> +++ b/m4/ax_arm.m4
> @@ -13,13 +13,27 @@ AC_DEFUN([AX_ARM_FEATURES],
>        fi
>      ;;
>      aarch64*)
> +      AX_CHECK_COMPILE_FLAG(-march=armv8-a, ax_cv_support_armv8=yes, [])
> +      if test x"$ax_cv_support_armv8" = x"yes"; then
> +        ARM_ARCH_FLAGS="-march=armv8-a"
> +        ARM_DEFINE_FLAGS="-DARCH_AARCH64"
> +      fi
>        AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd, ax_cv_support_neon_ext=yes, [])
>        if test x"$ax_cv_support_neon_ext" = x"yes"; then
> +        ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+simd"
> +        ARM_DEFINE_FLAGS="$ARM_DEFINE_FLAGS -DARM_NEON"
>          ARM_NEON_FLAGS="-march=armv8-a+simd -DARCH_AARCH64 -DARM_NEON"
> -        AC_SUBST(ARM_NEON_FLAGS)
> -        ARM_FLAGS="$ARM_FLAGS $ARM_NEON_FLAGS"
>          AC_DEFINE(HAVE_NEON,,[Support NEON instructions])
> +        AC_SUBST(ARM_NEON_FLAGS)
> +      fi
> +      AX_CHECK_COMPILE_FLAG(-march=armv8-a+crc, ax_cv_support_crc_ext=yes, [])
> +      if test x"$ax_cv_support_crc_ext" = x"yes"; then
> +        ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+crc"
> +        ARM_CRC_FLAGS="-march=armv8-a+crc -DARCH_AARCH64"
> +        AC_DEFINE(HAVE_ARMV8_CRC,,[Support ARMv8 CRC instructions])
> +        AC_SUBST(ARM_CRC_FLAGS)
>        fi
> +        ARM_FLAGS="$ARM_ARCH_FLAGS $ARM_DEFINE_FLAGS"
>      ;;
>    esac
>  
> diff --git a/src/arch/arm.c b/src/arch/arm.c
> index 93d079a..5a47e33 100644
> --- a/src/arch/arm.c
> +++ b/src/arch/arm.c
> @@ -2,6 +2,7 @@
>  
>  /* flags we export */
>  int ceph_arch_neon = 0;
> +int ceph_arch_aarch64_crc32 = 0;
>  
>  #include <stdio.h>
>  
> @@ -47,6 +48,7 @@ int ceph_arch_arm_probe(void)
>  	ceph_arch_neon = (get_hwcap() & HWCAP_NEON) == HWCAP_NEON;
>  #elif __aarch64__ && __linux__
>  	ceph_arch_neon = (get_hwcap() & HWCAP_ASIMD) == HWCAP_ASIMD;
> +	ceph_arch_aarch64_crc32 = (get_hwcap() & HWCAP_CRC32) == HWCAP_CRC32;
>  #else
>  	if (0)
>  		get_hwcap();  // make compiler shut up
> diff --git a/src/arch/arm.h b/src/arch/arm.h
> index f613438..1659b2e 100644
> --- a/src/arch/arm.h
> +++ b/src/arch/arm.h
> @@ -6,6 +6,7 @@ extern "C" {
>  #endif
>  
>  extern int ceph_arch_neon;  /* true if we have ARM NEON or ASIMD abilities */
> +extern int ceph_arch_aarch64_crc32;  /* true if we have AArch64 CRC32/CRC32C abilities */
>  
>  extern int ceph_arch_arm_probe(void);
>  
> diff --git a/src/common/Makefile.am b/src/common/Makefile.am
> index 2888194..37d1404 100644
> --- a/src/common/Makefile.am
> +++ b/src/common/Makefile.am
> @@ -112,11 +112,19 @@ endif
>  LIBCOMMON_DEPS += libcommon_crc.la
>  noinst_LTLIBRARIES += libcommon_crc.la
>  
> +if HAVE_ARMV8_CRC
> +libcommon_crc_aarch64_la_SOURCES = common/crc32c_aarch64.c
> +libcommon_crc_aarch64_la_CFLAGS = $(AM_CFLAGS) $(ARM_CRC_FLAGS)
> +LIBCOMMON_DEPS += libcommon_crc_aarch64.la
> +noinst_LTLIBRARIES += libcommon_crc_aarch64.la
> +endif
> +
>  noinst_HEADERS += \
>  	common/bloom_filter.hpp \
>  	common/sctp_crc32.h \
>  	common/crc32c_intel_baseline.h \
> -	common/crc32c_intel_fast.h
> +	common/crc32c_intel_fast.h \
> +	common/crc32c_aarch64.h
>  
>  
>  # important; libmsg before libauth!
> diff --git a/src/common/crc32c.cc b/src/common/crc32c.cc
> index e2e81a4..45432f5 100644
> --- a/src/common/crc32c.cc
> +++ b/src/common/crc32c.cc
> @@ -5,9 +5,11 @@
>  
>  #include "arch/probe.h"
>  #include "arch/intel.h"
> +#include "arch/arm.h"
>  #include "common/sctp_crc32.h"
>  #include "common/crc32c_intel_baseline.h"
>  #include "common/crc32c_intel_fast.h"
> +#include "common/crc32c_aarch64.h"
>  
>  /*
>   * choose best implementation based on the CPU architecture.
> @@ -24,6 +26,10 @@ ceph_crc32c_func_t ceph_choose_crc32(void)
>      return ceph_crc32c_intel_fast;
>    }
>  
> +  if (ceph_arch_aarch64_crc32){
> +    return ceph_crc32c_aarch64;
> +  }
> +
>    // default
>    return ceph_crc32c_sctp;
>  }
> diff --git a/src/common/crc32c_aarch64.c b/src/common/crc32c_aarch64.c
> new file mode 100644
> index 0000000..a7f8341
> --- /dev/null
> +++ b/src/common/crc32c_aarch64.c
> @@ -0,0 +1,58 @@
> +#include "acconfig.h"
> +#include "include/int_types.h"
> +#include "common/crc32c_aarch64.h"
> +
> +#ifdef HAVE_ARMV8_CRC
We can remove this #ifdef block now as this C file is only compiled
when HAVE_ARMV8_CRC is defined (i.e. it's a depenancy for
libcommon_crc_aarch64).
The #ifdef block in the header file should be retained.
> +
> +#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
> +#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
> +#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
> +#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
> +
> +uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len)
> +{
> +	int64_t length = len;
> +
> +	if (!buffer) {
> +
> +		while ((length -= sizeof(uint64_t)) >= 0)
> +			CRC32CX(crc, 0);
> +
> +		/* The following is more efficient than the straight loop */
> +		if (length & sizeof(uint32_t))
> +			CRC32CW(crc, 0);
> +
> +		if (length & sizeof(uint16_t))
> +			CRC32CH(crc, 0);
> +
> +		if (length & sizeof(uint8_t))
> +			CRC32CB(crc, 0);
> +	} else {
> +		while ((length -= sizeof(uint64_t)) >= 0) {
> +			CRC32CX(crc, *(uint64_t *)buffer);
> +			buffer += sizeof(uint64_t);
> +		}
> +
> +		/* The following is more efficient than the straight loop */
> +		if (length & sizeof(uint32_t)) {
> +			CRC32CW(crc, *(uint32_t *)buffer);
> +			buffer += sizeof(uint32_t);
> +		}
> +		if (length & sizeof(uint16_t)) {
> +			CRC32CH(crc, *(uint16_t *)buffer);
> +			buffer += sizeof(uint16_t);
> +		}
> +		if (length & sizeof(uint8_t))
> +			CRC32CB(crc, *buffer);
> +	}
> +	return crc;
> +}
> +
> +#else
> +
> +uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len)
> +{
> +	return 0;
> +}
> +
> +#endif
> diff --git a/src/common/crc32c_aarch64.h b/src/common/crc32c_aarch64.h
> new file mode 100644
> index 0000000..8088578
> --- /dev/null
> +++ b/src/common/crc32c_aarch64.h
> @@ -0,0 +1,25 @@
> +#ifndef CEPH_COMMON_CRC32C_AARCH64_H
> +#define CEPH_COMMON_CRC32C_AARCH64_H
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#ifdef HAVE_ARMV8_CRC
> +
> +extern uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len);
> +
> +#else
> +
> +static inline uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len)
> +{
> +	return 0;
> +}
> +
> +#endif
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif
> diff --git a/src/test/common/test_crc32c.cc b/src/test/common/test_crc32c.cc
> index b4297c6..2e0ecf8 100644
> --- a/src/test/common/test_crc32c.cc
> +++ b/src/test/common/test_crc32c.cc
> @@ -13,6 +13,7 @@
>  
>  #include "common/sctp_crc32.h"
>  #include "common/crc32c_intel_baseline.h"
> +#include "common/crc32c_aarch64.h"
>  
>  TEST(Crc32c, Small) {
>    const char *a = "foo bar baz";
> @@ -80,6 +81,16 @@ TEST(Crc32c, Performance) {
>      std::cout << "intel baseline = " << rate << " MB/sec" << std::endl;
>      ASSERT_EQ(261108528u, val);
>    }
> +#ifdef HAVE_ARMV8_CRC
> +  {
> +    utime_t start = ceph_clock_now(NULL);
> +    unsigned val = ceph_crc32c_aarch64(0, (unsigned char *)a, len);
> +    utime_t end = ceph_clock_now(NULL);
> +    float rate = (float)len / (float)(1024*1024) / (float)(end - start);
> +    std::cout << "aarch64 baseline = " << rate << " MB/sec" << std::endl;
> +    ASSERT_EQ(261108528u, val);
> +  }
> +#endif
I didn't spot this before sorry, but you need to check for 
ceph_arch_aarch64_crc32 (i.e. does the CPU have the extension?)
before running ceph_crc32c_aarch64. It may be possible to replace the
#ifdef with "if (ceph_arch_aarch64_crc32)".
>  
>  }
>  
> diff --git a/src/test/test_arch.cc b/src/test/test_arch.cc
> index b129262..27cdd1d 100644
> --- a/src/test/test_arch.cc
> +++ b/src/test/test_arch.cc
> @@ -50,6 +50,9 @@ TEST(Arch, all)
>    expected = (strstr(flags, " neon ") || strstr(flags, " asimd ")) ? 1 : 0;
>    EXPECT_EQ(expected, ceph_arch_neon);
>  
> +  expected = strstr(flags, " crc32 ") ? 1 : 0;
> +  EXPECT_EQ(expected, ceph_arch_aarch64_crc32);
> +
The test_arch.cc file looks very dangerous to me, it's strstr scanning
for CPU flags from /proc/cpuinfo with no regard for architecture.
(It is quite likely another chip in future introduces the crc32 CPU flag).
I see it's trying to sanity check that the HW_CAPS logic matches up
with the cpuflags. Could you please surround the crc32 check with:
#ifdef __arch64__
#endif /* __arch64__ */
Just to pin it down to ARM.
>    expected = strstr(flags, " pclmulqdq ") ? 1 : 0;
>    EXPECT_EQ(expected, ceph_arch_intel_pclmul);
>  
> -- 
> 2.2.1
>