Re: Vectorised copy

5 Sep 2011


      On Sat, Sep 3, 2011 at 4:54 AM, Ulrich Weigand
<Ulrich.Weigand@de.ibm.com> wrote:
...
Michael Hope <michael.hope@linaro.org> wrote:
...
/* Destinations: three file-scope pointers.  Nothing in this (snipped)
   excerpt assigns them, so where they point is not visible here. */
int *a;
int *b;
int *c;
/* Sources: three file-scope const arrays of 320 ints each. */
const int ad[320];
const int bd[320];
const int cd[320];
/* Copy 320 ints from each source array to the matching destination
   pointer, interleaving the three copies in a single loop. */
void fill()
{
  for (int i = 0; i < 320; i++)
    {
      /* Three independent element copies per iteration.  NOTE(review):
         the reply in this thread guesses that the compiler may be unable
         to prove a, b and c do not overlap -- unconfirmed. */
      a[i] = ad[i];
      b[i] = bd[i];
      c[i] = cd[i];
    }
}
[snip]
...
Can we always use the second form?  What optimisation is preventing it?
Without having looked into this in detail, my guess would be
it depends on whether the compiler is able to prove that the
memory pointed to by a, b, and c is distinct (instead of having
a potential overlap if those are pointers into the same array).
Does it help if you make a, b, and c function arguments to fill,
and mark them restrict?
Yip, I had a go with that originally.  Here are the variants:
(1) - local source, local destination:
/* Variant (1): destinations are arrays defined in this translation unit,
   each holding 320 ints. */
int a[320];
int b[320];
int c[320];
/* Sources: three file-scope const arrays of 320 ints each. */
const int ad[320];
const int bd[320];
const int cd[320];
/* Copy every element of ad, bd and cd into a, b and c respectively,
   doing all three copies inside one loop. */
void fill()
{
  for (int i = 0; i < 320; i++)
    {
      /* Source and destination are distinct array objects of equal size. */
      a[i] = ad[i];
      b[i] = bd[i];
      c[i] = cd[i];
    }
}
gives the best code:
@ Compiler output for variant (1).  The literal pool at .L5 is not part of
@ this excerpt, so which array each loaded word refers to is not visible.
fill:
@ Save the three extra registers used below.
    push	{r4, r5, r6}
@ Fetch six words from the literal pool (presumably array addresses).
    ldr	r6, .L5
    ldr	r5, .L5+4
    ldr	r4, .L5+8
@ r3 = r6 - 1280 (1280 = 320 ints * 4 bytes); r6 is the end bound for r3.
    sub	r3, r6, #1280
    ldr	r0, .L5+12
    ldr	r1, .L5+16
    ldr	r2, .L5+20
@ Loop body: three 16-byte load/store pairs.  Every base register is
@ post-incremented ('!'), so no address is recomputed inside the loop.
.L2:
    vldmia	r0!, {d16-d17}
    vldmia	r5!, {d18-d19}
@ Data loaded via r5 is stored via r4; data loaded via r0 is stored via r1.
    vstmia	r4!, {d18-d19}
    vstmia	r1!, {d16-d17}
@ Third copy: load via r2, store via r3.
    vldmia	r2!, {d16-d17}
    vstmia	r3!, {d16-d17}
@ Repeat until r3 has advanced the full 1280 bytes.
    cmp	r3, r6
    bne	.L2
    pop	{r4, r5, r6}
    bx	lr
(2) - extern destination, local source with -fno-section-anchors to
make the code more readable:
/* Variant (2): destinations are declared extern, so their definitions
   live outside this translation unit. */
extern int a[320];
extern int b[320];
extern int c[320];
/* Sources: three const arrays of 320 ints defined in this file. */
const int ad[320];
const int bd[320];
const int cd[320];
/* Copy every element of ad, bd and cd into the extern arrays a, b and c,
   doing all three copies inside one loop. */
void fill()
{
  for (int i = 0; i < 320; i++)
    {
      /* Same loop body as the other variants; only the linkage of the
         destination arrays differs. */
      a[i] = ad[i];
      b[i] = bd[i];
      c[i] = cd[i];
    }
}
@ Compiler output for variant (2).  The literal pool at .L5 is not part of
@ this excerpt, so which array each loaded word refers to is not visible.
fill:
    ldr	r2, .L5
    push	{r4, r5, r6, r7, r8}
    ldr	r0, .L5+4
@ r3 is the running pointer; r8 = start + 1280 bytes is its end bound.
    mov	r3, r2
    add	r8, r2, #1280
    ldr	r7, .L5+8
    ldr	r6, .L5+12
@ ip, r7 and r6 become byte offsets relative to r3's start value
@ (ip = r0 - r3, r7 -= r3, r6 -= r3).
    rsb	ip, r3, r0
    ldr	r1, .L5+16
    ldr	r2, .L5+20
    subs	r7, r7, r3
    subs	r6, r6, r3
.L2:
@ Store addresses are rebuilt every iteration as r3 + fixed offset.
    add	r5, ip, r3
    adds	r4, r7, r3
@ Two of the loads use post-incremented base registers.
    vldmia	r2!, {d16-d17}
    vldmia	r1!, {d18-d19}
    adds	r0, r6, r3
@ q9 is d18-d19 and q8 is d16-d17; stores use the rebuilt addresses.
    vst1.32	{q9}, [r5]
    vst1.32	{q8}, [r4]
@ Third load uses r3 without writeback; r3 is advanced by 16 separately.
    vldmia	r3, {d16-d17}
    adds	r3, r3, #16
    cmp	r3, r8
    vst1.32	{q8}, [r0]
@ Loop until r3 reaches the end bound in r8.
    bne	.L2
    pop	{r4, r5, r6, r7, r8}
    bx	lr
(3) destination as arguments, restrict:
/* Variant (3): the three destinations are passed in as __restrict-qualified
   pointers.  ad, bd and cd are not declared in this snippet -- presumably
   the same const int[320] arrays as in the earlier variants; confirm.
   Each destination must have room for 320 ints, since the loop writes
   indices 0..319. */
void fill3(int * __restrict a, int * __restrict b, int * __restrict c)
{
  for (int i = 0; i < 320; i++)
    {
      /* Three independent element copies per iteration. */
      a[i] = ad[i];
      b[i] = bd[i];
      c[i] = cd[i];
    }
}
@ Compiler output for variant (3).  The literal pool at .L23 is not part of
@ this excerpt.  r0-r2 are used before being written, so they carry the
@ incoming values (presumably the a, b, c arguments).
fill3:
    push	{r4, r5, r6, r7, r8}
    ldr	r6, .L23
    ldr	r5, .L23+4
    ldr	r4, .L23+8
@ r3 is the running pointer, starting at the first literal-pool word.
    mov	r3, r6
@ Turn r0, r1, r2 into byte offsets relative to r3's start value.
    subs	r0, r0, r3
@ r6 = start + 1280 bytes (320 ints * 4) is the end bound for r3.
    add	r6, r6, #1280
    subs	r1, r1, r3
    subs	r2, r2, r3
.L21:
@ Store addresses are rebuilt every iteration as r3 + fixed offset.
    add	r8, r3, r0
    add	ip, r3, r1
@ Two of the loads use post-incremented base registers.
    vldmia	r4!, {d16-d17}
    vldmia	r5!, {d18-d19}
    adds	r7, r3, r2
@ q9 is d18-d19 and q8 is d16-d17; stores use the rebuilt addresses.
    vst1.32	{q9}, [r8]
    vst1.32	{q8}, [ip]
@ Third load uses r3 without writeback; r3 is advanced by 16 separately.
    vldmia	r3, {d16-d17}
    adds	r3, r3, #16
    cmp	r3, r6
    vst1.32	{q8}, [r7]
@ Loop until r3 reaches the end bound in r6.
    bne	.L21
    pop	{r4, r5, r6, r7, r8}
    bx	lr
(4) destination as aligned structs:
/* Variant (4): wrap the 320-int buffer in a struct whose type carries an
   explicit 128-byte alignment attribute. */
struct blob
{
  int v[320];
} __attribute__((aligned(128)));
/* Copy ad, bd and cd into the v arrays of the three __restrict-qualified
   struct blob arguments.  ad, bd and cd are not declared in this snippet --
   presumably the same const int[320] arrays as in the earlier variants;
   confirm. */
void fill(struct blob * __restrict a, struct blob * __restrict b,
struct blob * __restrict c)
{
  for (int i = 0; i < 320; i++)
    {
      /* Same three element copies, now through the struct member. */
      a->v[i] = ad[i];
      b->v[i] = bd[i];
      c->v[i] = cd[i];
    }
}
@ Compiler output for variant (4).  The literal pool at .L5 is not part of
@ this excerpt.  r0-r2 are used before being written, so they carry the
@ incoming values (presumably the a, b, c arguments).
fill:
    push	{r4, r5, r6}
@ r6 = r2 + 1280 bytes (320 ints * 4) is the end bound for r2.
    add	r6, r2, #1280
@ Fetch three words from the literal pool (presumably the source arrays).
    ldr	r3, .L5
    ldr	r4, .L5+4
    ldr	r5, .L5+8
@ Loop body: three 16-byte load/store pairs, all with post-incremented
@ base registers ('!'); no address is recomputed inside the loop.
.L2:
    vldmia	r3!, {d16-d17}
    vstmia	r0!, {d16-d17}
    vldmia	r4!, {d16-d17}
    vstmia	r1!, {d16-d17}
    vldmia	r5!, {d16-d17}
    vstmia	r2!, {d16-d17}
@ Repeat until r2 has advanced the full 1280 bytes.
    cmp	r2, r6
    bne	.L2
    pop	{r4, r5, r6}
    bx	lr
Version (3) seems to rejigger the destination pointers.  I assume this
is a side effect of not knowing whether the target is aligned?
-- Michael

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

Re: Vectorised copy