SlideShare uma empresa Scribd logo
1 de 60
Baixar para ler offline
ENGINEERS AND DEVICES
WORKING TOGETHER
ENGINEERS
AND DEVICES
WORKING
TOGETHER
●
○
○
○
●
●
K (Android 4.4): Dalvik + JIT compiler
L (Android 5.0): ART + AOT compiler
M (Android 6.0): ART + AOT compiler
N (Android 7.0): ART + JIT/AOT compiler
O (Android 8.0): ART + JIT/AOT compiler + vectorization
●
●
●
●
●
●
ENGINEERS
AND DEVICES
WORKING
TOGETHER
A SIMD instruction performs a single
operation on multiple operands in parallel
ARM: NEON Technology (128-bit)
Intel: SSE* (128-bit)
AVX* (256-bit, 512-bit)
MIPS: MSA (128-bit)
All modern general-purpose CPUs support small-scale SIMD
instructions (typically between 64-bit and 512-bit)
4x32-bit operations
●
○
○
○
●
○
○
○
● Many vectorizing compilers were developed by
supercomputer vendors
● Intel introduced first vectorizing compiler for SSE in 1999
● Since the Android O release, the optimizing compiler of
ART has joined the family of vectorizing compilers
www.aartbik.com
ENGINEERS
AND DEVICES
WORKING
TOGETHER
for (int i = 0; i < 256; i++) { for (int i = 0; i < 256; i += 4) {
a[i] = b[i] + 1; -> a[i:i+3] = b[i:i+3] + [1,1,1,1];
} }
Ronny Reader
Abby Author Wendy Writer
Perry Presenter Vinny Viewer Molly Maker Casey Creator
VectorOperation
VectorMemOp VectorBinOp
VectorAdd VectorSub VectorLoad VectorStore
….
….
has alignment
has vector length
has packed data type
A class hierarchy of general vector operations that is sufficiently
powerful to represent SIMD operations common to all architectures
t = [1,1,1,1];
for (int i = 0; i < 256; i += 4) { -> for (int i = 0; i < 256; i += 8) {
a[i:i+3] = b[i:i+3] + [1,1,1,1]; a[i :i+3] = b[i :i+3] + t;
} a[i+4:i+7] = b[i+4:i+7] + t;
}
t = [1,1,1,1];
for (int i = 0; i < 256; i += 8) { ->
a[i:i+3] = b[i:i+3] + t;
a[i+4:i+7] = b[i+4:i+7] + t;
}
movi v0.4s, #0x1, lsl #0
mov w3, #0xc
mov w0, #0x0
Loop: cmp w0, #0x100 (256)
b.hs Exit
add w4, w0, #0x4 (4)
add w0, w3, w0, lsl #2
add w5, w3, w4, lsl #2
ldr q1, [x2, x0]
add v1.4s, v1.4s, v0.4s
str q1, [x1, x0]
ldr q1, [x2, x5]
add v1.4s, v1.4s, v0.4s
str q1, [x1, x5]
add w0, w4, #0x4 (4)
ldrh w16, [tr] ; suspend check
cbz w16, Loop
VecReplicateScalar(x)
ARM64 x86-64 MIPS64
dup v0.4s, w2 movdq xmm0, rdx fill.w w0, a2
pshufd xmm0, xmm0, 0
/**
* Cross-fade byte arrays x1 and x2 into byte array x_out.
*/
private static void avg(byte[] x_out, byte[] x1, byte[] x2) {
// Compute minimum length of the three byte arrays.
int min = Math.min(x_out.length, Math.min(x1.length, x2.length));
// Morph with rounding halving add (unsigned).
for (int i = 0; i < min; i++) {
x_out[i] = (byte) (((x1[i] & 0xff) + (x2[i] & 0xff) + 1) >> 1);
}
}
SEQUENTIAL (ARMv8 AArch64)
L:cmp w5, w0
b.hs Exit
add w4, w2, #0xc (12)
add w6, w3, #0xc (12)
ldrsb w4, [x4, x5]
ldrsb w6, [x6, x5]
and w4, w4, #0xff
and w6, w6, #0xff
add w4, w4, w6
add w6, w1, #0xc (12)
add w4, w4, #0x1 (1)
asr w4, w4, #1
strb w4, [x6, x5]
add w5, w5, #0x1 (1)
ldrh w16, [tr] ; suspend check
cbz w16, L
SIMD (ARMv8 AArch64 + NEON Technology)
L:cmp w5, w4
b.hs Exit
add w16, w2, w5
ldur q0, [x16, #12]
add w16, w3, w5
ldur q1, [x16, #12]
urhadd v0.16b, v0.16b, v1.16b
add w16, w1, w5
stur q0, [x16, #12]
add w5, w5, #0x10 (16)
ldrh w16, [tr] ; suspend check
cbz w16, L
Runs about 10x faster!
Sequential performance SIMD performance (NEON 128-bit)
≈20fps ≈60fps
ENGINEERS
AND DEVICES
WORKING
TOGETHER
ENGINEERS AND DEVICES
WORKING TOGETHER
Java code Autovectorization result
void mul_add(int[] a,
int[] b) {
for (int i = 0;
i < 512;
i++) {
a[i] += a[i] * b[i];
}
}
●
○
●
○
○
ENGINEERS AND DEVICES
WORKING TOGETHER
Java code Autovectorization result
void mul_add(int[] a,
int[] b) {
for (int i = 0;
i < 512;
i++) {
a[i] += a[i] * b[i];
}
}
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, #0xc
add x16, x16, x0, lsl #2
ld1 {v0.2s}, [x16]
add w16, w2, #0xc
add x16, x16, x0, lsl #2
ld1 {v1.2s}, [x16]
mul v1.2s, v0.2s, v1.2s
add v0.2s, v0.2s, v1.2s
add w16, w1, #0xc
add x16, x16, x0, lsl #2
st1 {v0.2s}, [x16]
add w0, w0, #0x2
ldrh w16, [tr]
cbz w16, L
●
○
●
○
○
●
○
○
●
○
ENGINEERS AND DEVICES
WORKING TOGETHER
Before After (68% perf boost)
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, #0xc
add x16, x16, x0, lsl #2
ld1 {v0.2s}, [x16]
add w16, w2, #0xc
add x16, x16, x0, lsl #2
ld1 {v1.2s}, [x16]
mul v1.2s, v0.2s, v1.2s
add v0.2s, v0.2s, v1.2s
add w16, w1, #0xc
add x16, x16, x0, lsl #2
st1 {v0.2s}, [x16]
add w0, w0, #0x2
ldrh w16, [tr]
cbz w16, L
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, #0xc
add x16, x16, x0, lsl #2
ld1 {v0.4s}, [x16]
add w16, w2, #0xc
add x16, x16, x0, lsl #2
ld1 {v1.4s}, [x16]
mul v1.4s, v0.4s, v1.4s
add v0.4s, v0.4s, v1.4s
add w16, w1, #0xc
add x16, x16, x0, lsl #2
st1 {v0.4s}, [x16]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
●
ENGINEERS AND DEVICES
WORKING TOGETHER
Before After (68% perf boost)
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, #0xc
add x16, x16, x0, lsl #2
ld1 {v0.2s}, [x16]
add w16, w2, #0xc
add x16, x16, x0, lsl #2
ld1 {v1.2s}, [x16]
mul v1.2s, v0.2s, v1.2s
add v0.2s, v0.2s, v1.2s
add w16, w1, #0xc
add x16, x16, x0, lsl #2
st1 {v0.2s}, [x16]
add w0, w0, #0x2
ldrh w16, [tr]
cbz w16, L
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, #0xc
add x16, x16, x0, lsl #2
ld1 {v0.4s}, [x16]
add w16, w2, #0xc
add x16, x16, x0, lsl #2
ld1 {v1.4s}, [x16]
mul v1.4s, v0.4s, v1.4s
add v0.4s, v0.4s, v1.4s
add w16, w1, #0xc
add x16, x16, x0, lsl #2
st1 {v0.4s}, [x16]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
●
ENGINEERS AND DEVICES
WORKING TOGETHER
Before After (68% perf boost)
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, #0xc
add x16, x16, x0, lsl #2
ld1 {v0.2s}, [x16]
add w16, w2, #0xc
add x16, x16, x0, lsl #2
ld1 {v1.2s}, [x16]
mul v1.2s, v0.2s, v1.2s
add v0.2s, v0.2s, v1.2s
add w16, w1, #0xc
add x16, x16, x0, lsl #2
st1 {v0.2s}, [x16]
add w0, w0, #0x2
ldrh w16, [tr]
cbz w16, L
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, #0xc
add x16, x16, x0, lsl #2
ld1 {v0.4s}, [x16]
add w16, w2, #0xc
add x16, x16, x0, lsl #2
ld1 {v1.4s}, [x16]
mul v1.4s, v0.4s, v1.4s
add v0.4s, v0.4s, v1.4s
add w16, w1, #0xc
add x16, x16, x0, lsl #2
st1 {v0.4s}, [x16]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
●
●
○
●
○
●
○
●
ENGINEERS AND DEVICES
WORKING TOGETHER
Before After (11% perf boost)
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, #0xc
add x16, x16, x0, lsl #2
ld1 {v0.4s}, [x16]
add w16, w2, #0xc
add x16, x16, x0, lsl #2
ld1 {v1.4s}, [x16]
mul v1.4s, v0.4s, v1.4s
add v0.4s, v0.4s, v1.4s
add w16, w1, #0xc
add x16, x16, x0, lsl #2
st1 {v0.4s}, [x16]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, #0xc
add x16, x16, x0, lsl #2
ld1 {v0.4s}, [x16]
add w16, w2, #0xc
add x16, x16, x0, lsl #2
ld1 {v1.4s}, [x16]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
add w16, w1, #0xc
add x16, x16, x0, lsl #2
st1 {v2.4s}, [x16]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
●
○
ENGINEERS AND DEVICES
WORKING TOGETHER
Before After (11% perf boost)
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, #0xc
add x16, x16, x0, lsl #2
ld1 {v0.4s}, [x16]
add w16, w2, #0xc
add x16, x16, x0, lsl #2
ld1 {v1.4s}, [x16]
mul v1.4s, v0.4s, v1.4s
add v0.4s, v0.4s, v1.4s
add w16, w1, #0xc
add x16, x16, x0, lsl #2
st1 {v0.4s}, [x16]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, #0xc
add x16, x16, x0, lsl #2
ld1 {v0.4s}, [x16]
add w16, w2, #0xc
add x16, x16, x0, lsl #2
ld1 {v1.4s}, [x16]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
add w16, w1, #0xc
add x16, x16, x0, lsl #2
st1 {v2.4s}, [x16]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
●
○
ENGINEERS AND DEVICES
WORKING TOGETHER
Before After (11% perf boost)
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, #0xc
add x16, x16, x0, lsl #2
ld1 {v0.4s}, [x16]
add w16, w2, #0xc
add x16, x16, x0, lsl #2
ld1 {v1.4s}, [x16]
mul v1.4s, v0.4s, v1.4s
add v0.4s, v0.4s, v1.4s
add w16, w1, #0xc
add x16, x16, x0, lsl #2
st1 {v0.4s}, [x16]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, #0xc
add x16, x16, x0, lsl #2
ld1 {v0.4s}, [x16]
add w16, w2, #0xc
add x16, x16, x0, lsl #2
ld1 {v1.4s}, [x16]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
add w16, w1, #0xc
add x16, x16, x0, lsl #2
st1 {v2.4s}, [x16]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
●
○
●
○
○
○
ENGINEERS AND DEVICES
WORKING TOGETHER
Before After (23% perf boost)
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, #0xc
add x16, x16, x0, lsl #2
ld1 {v0.4s}, [x16]
add w16, w2, #0xc
add x16, x16, x0, lsl #2
ld1 {v1.4s}, [x16]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
add w16, w1, #0xc
add x16, x16, x0, lsl #2
st1 {v2.4s}, [x16]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, w0, lsl #2
ldur q0, [x16, #12]
add w16, w2, w0, lsl #2
ldur q1, [x16, #12]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
add w16, w1, w0, lsl #2
stur q2, [x16, #12]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
●
○
●
○
●
○
○
○
○
ENGINEERS AND DEVICES
WORKING TOGETHER
Before After (23% perf boost)
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, #0xc
add x16, x16, x0, lsl #2
ld1 {v0.4s}, [x16]
add w16, w2, #0xc
add x16, x16, x0, lsl #2
ld1 {v1.4s}, [x16]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
add w16, w1, #0xc
add x16, x16, x0, lsl #2
st1 {v2.4s}, [x16]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, w0, lsl #2
ldur q0, [x16, #12]
add w16, w2, w0, lsl #2
ldur q1, [x16, #12]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
add w16, w1, w0, lsl #2
stur q2, [x16, #12]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
●
○
●
○
●
○
○
○
○
ENGINEERS AND DEVICES
WORKING TOGETHER
Before After (23% perf boost)
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, #0xc
add x16, x16, x0, lsl #2
ld1 {v0.4s}, [x16]
add w16, w2, #0xc
add x16, x16, x0, lsl #2
ld1 {v1.4s}, [x16]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
add w16, w1, #0xc
add x16, x16, x0, lsl #2
st1 {v2.4s}, [x16]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, w0, lsl #2
ldur q0, [x16, #12]
add w16, w2, w0, lsl #2
ldur q1, [x16, #12]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
add w16, w1, w0, lsl #2
stur q2, [x16, #12]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
●
○
●
○
●
○
○
○
○
●
○
○
●
ENGINEERS AND DEVICES
WORKING TOGETHER
Before After (10% perf boost)
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, w0, lsl #2
ldur q0, [x16, #12]
add w16, w2, w0, lsl #2
ldur q1, [x16, #12]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
add w16, w1, w0, lsl #2
stur q2, [x16, #12]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
mov w3, #0xc
L:
cmp w0, #0x200
b.hs Exit
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
●
○
●
○
○
●
ENGINEERS AND DEVICES
WORKING TOGETHER
Before After (10% perf boost)
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, w0, lsl #2
ldur q0, [x16, #12]
add w16, w2, w0, lsl #2
ldur q1, [x16, #12]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
add w16, w1, w0, lsl #2
stur q2, [x16, #12]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
mov w3, #0xc
L:
cmp w0, #0x200
b.hs Exit
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
●
○
●
○
○
●
ENGINEERS AND DEVICES
WORKING TOGETHER
Before After (10% perf boost)
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, w0, lsl #2
ldur q0, [x16, #12]
add w16, w2, w0, lsl #2
ldur q1, [x16, #12]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
add w16, w1, w0, lsl #2
stur q2, [x16, #12]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
mov w3, #0xc
L:
cmp w0, #0x200
b.hs Exit
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
●
○
●
○
○
●
●
ENGINEERS AND DEVICES
WORKING TOGETHER
ENGINEERS AND DEVICES
WORKING TOGETHER
Before After (2.5% perf boost)
L:
cmp w0, #0x200
b.hs Exit
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
L:
cmp w0, #0x200
b.hs Exit
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
●
ENGINEERS AND DEVICES
WORKING TOGETHER
Before After (2.5% perf boost)
L:
cmp w0, #0x200
b.hs Exit
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
L:
cmp w0, #0x200
b.hs Exit
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
●
ENGINEERS AND DEVICES
WORKING TOGETHER
Before After (2.5% perf boost)
L:
cmp w0, #0x200
b.hs Exit
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
L:
cmp w0, #0x200
b.hs Exit
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
●
●
○
○
ENGINEERS AND DEVICES
WORKING TOGETHER
Before After (12% perf boost)
L:
cmp w0, #0x200
b.hs Exit
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
L:
cmp w0, #0x200
b.hs Exit
add w4, w0, #0x4
add w0, w3, w0, lsl #2
add w5, w3, w4, lsl #2
ldr q0, [x1, x0]
ldr q1, [x2, x0]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x0]
ldr q0, [x1, x5]
ldr q1, [x2, x5]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x5]
add w0, w4, #0x4
ldrh w16, [tr]
cbz w16, L
●
ENGINEERS AND DEVICES
WORKING TOGETHER
Before After (12% perf boost)
L:
cmp w0, #0x200
b.hs Exit
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
L:
cmp w0, #0x200
b.hs Exit
add w4, w0, #0x4
add w0, w3, w0, lsl #2
add w5, w3, w4, lsl #2
ldr q0, [x1, x0]
ldr q1, [x2, x0]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x0]
ldr q0, [x1, x5]
ldr q1, [x2, x5]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x5]
add w0, w4, #0x4
ldrh w16, [tr]
cbz w16, L
●
ENGINEERS AND DEVICES
WORKING TOGETHER
Before After (12% perf boost)
L:
cmp w0, #0x200
b.hs Exit
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
L:
cmp w0, #0x200
b.hs Exit
add w4, w0, #0x4
add w0, w3, w0, lsl #2
add w5, w3, w4, lsl #2
ldr q0, [x1, x0]
ldr q1, [x2, x0]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x0]
ldr q0, [x1, x5]
ldr q1, [x2, x5]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x5]
add w0, w4, #0x4
ldrh w16, [tr]
cbz w16, L
●
●
●
○
●
○
○
●
○
ENGINEERS AND DEVICES
WORKING TOGETHER
Before After (12% perf boost)
L:
cmp w0, #0x200
b.hs Exit
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
add w4, w3, w0, lsl #2
ldr q0, [x1, x4]
ldr q1, [x2, x4]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x4]
add w0, w0, #0x4
ldrh w16, [tr]
cbz w16, L
L:
cmp w0, #0x200
b.hs Exit
add w4, w0, #0x4
add w0, w3, w0, lsl #2
add w5, w3, w4, lsl #2
ldr q0, [x1, x0]
ldr q1, [x2, x0]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x0]
ldr q0, [x1, x5]
ldr q1, [x2, x5]
mov v2.16b, v0.16b
mla v2.4s, v0.4s, v1.4s
str q2, [x1, x5]
add w0, w4, #0x4
ldrh w16, [tr]
cbz w16, L
●
●
●
○
●
○
○
●
○
●
ENGINEERS AND DEVICES
WORKING TOGETHER
for (int i = 0; i < LENGTH; i++) {
c[i] = (byte)(a[i] + b[i]);
}
i87 Add [i80,i79]
i102 IntermediateAddressIndex [i87,i98,i3]
i99 IntermediateAddressIndex [i80,i98,i3]
d89 VecLoad [l35,i102]
d84 VecLoad [l35,i99]
d83 VecLoad [l29,i99]
d88 VecLoad [l29,i102]
d85 VecAdd [d83,d84]
d90 VecAdd [d88,d89]
d86 VecStore [l27,i99,d85]
d91 VecStore [l27,i102,d90]
i92 Add [i87,i79]
v78 Goto
●
○
○
●
ENGINEERS AND DEVICES
WORKING TOGETHER
(gdb) x/64u 0xefc0b000
0xefc0b000: 0 28 192 18 0 0 0 0
0xefc0b008: 0 0 4 0 100 101 102 103
0xefc0b010: 104 105 106 107 108 109 110 111
0xefc0b018: 112 113 114 115 116 117 118 119
0xefc0b020: 120 121 122 123 124 125 126 127
0xefc0b028: 128 129 130 131 132 133 134 135
0xefc0b030: 136 137 138 139 140 141 142 143
0xefc0b038: 144 145 146 147 148 149 150 151
Java Code
static final int LENGTH = 1024 * 256; // 256K elements, 0x40000
static byte [] a = new byte[LENGTH];
static byte [] b = new byte[LENGTH];
static byte [] c = new byte[LENGTH];
Object Header
data[0]
ENGINEERS AND DEVICES
WORKING TOGETHER
(gdb) x/64u 0xefc0b000
0xefc0b000: 0 28 192 18 0 0 0 0
0xefc0b008: 0 0 4 0 100 101 102 103
0xefc0b010: 104 105 106 107 108 109 110 111
0xefc0b018: 112 113 114 115 116 117 118 119
0xefc0b020: 120 121 122 123 124 125 126 127
0xefc0b028: 128 129 130 131 132 133 134 135
0xefc0b030: 136 137 138 139 140 141 142 143
0xefc0b038: 144 145 146 147 148 149 150 151
One VecLoad / VecStore
Java Code
static final int LENGTH = 1024 * 256; // 256K elements, 0x40000
static byte [] a = new byte[LENGTH];
static byte [] b = new byte[LENGTH];
static byte [] c = new byte[LENGTH];
Object Header
ENGINEERS AND DEVICES
WORKING TOGETHER
●
○
●
○
○
○
●
0xefc0b000: 0 28 192 18 0 0 0 0
0xefc0b008: 0 0 4 0 100 101 102 103
0xefc0b010: 104 105 106 107 108 109 110 111
0xefc0b018: 112 113 114 115 116 117 118 119
0xefc0b020: 120 121 122 123 124 125 126 127
0xefc0b028: 128 129 130 131 132 133 134 135
0xefc0b030: 136 137 138 139 140 141 142 143
0xefc0b038: 144 145 146 147 148 149 150 151
SIMD from here->
Avoid SIMD from here
ENGINEERS
AND DEVICES
WORKING
TOGETHER
ENGINEERS AND DEVICES
WORKING TOGETHER
●
○
●
●
○
○
ENGINEERS AND DEVICES
WORKING TOGETHER
●
○
○
●
●
●
●
○
ENGINEERS AND DEVICES
WORKING TOGETHER
●
●
○
○
○
○
○
○
○
●
○
○
○
Analyzable and flexible CHECKED!
Embeddable CHECKED!
Stable and reproducible CHECKED!
Recognized CHECKED!
ENGINEERS AND DEVICES
WORKING TOGETHER
●
●
○
○
○
●
○
○
○
ENGINEERS AND DEVICES
WORKING TOGETHER
●
ENGINEERS AND DEVICES
WORKING TOGETHER
●
ENGINEERS AND DEVICES
WORKING TOGETHER
●
ENGINEERS AND DEVICES
WORKING TOGETHER
●
ENGINEERS AND DEVICES
WORKING TOGETHER
●
○
●
○
●
○
●
○ LDR q1, [x16] + LDR q2, [x16, #16] -> LDP q1, q2, [x16]
●
○
ENGINEERS AND DEVICES
WORKING TOGETHER
●
●
○
●
○
○
Java Scalar version Initial SIMD Version
void mul_add(int[] a,
int[] b,
int[] c) {
for (int i=0; i<512; i++) {
a[i] += a[i] * b[i];
}
}
L:
cmp w0, #0x200
b.hs Exit
add w4, w1, #0xc
ldr w6, [x4, x0, lsl #2]
add w5, w2, #0xc
ldr w5, [x5, x0, lsl #2]
madd w5, w6, w5, w6
str w5, [x4, x0, lsl #2]
add w0, w0, #0x1
ldrh w16, [tr]
cbz w16, L
L:
cmp w0, #0x200
b.hs Exit
add w16, w1, #0xc
add x16, x16, x0, lsl #2
ld1 {v0.2s}, [x16]
add w16, w2, #0xc
add x16, x16, x0, lsl #2
ld1 {v1.2s}, [x16]
mul v1.2s, v0.2s, v1.2s
add v0.2s, v0.2s, v1.2s
add w16, w1, #0xc
add x16, x16, x0, lsl #2
st1 {v0.2s}, [x16]
add w0, w0, #0x2
ldrh w16, [tr]
cbz w16, L

Mais conteúdo relacionado

Semelhante a Automatic Vectorization in ART (Android RunTime) - SFO17-216

Pragmatic Optimization in Modern Programming - Mastering Compiler Optimizations
Pragmatic Optimization in Modern Programming - Mastering Compiler OptimizationsPragmatic Optimization in Modern Programming - Mastering Compiler Optimizations
Pragmatic Optimization in Modern Programming - Mastering Compiler OptimizationsMarina Kolpakova
 
C++ Code as Seen by a Hypercritical Reviewer
C++ Code as Seen by a Hypercritical ReviewerC++ Code as Seen by a Hypercritical Reviewer
C++ Code as Seen by a Hypercritical ReviewerAndrey Karpov
 
The forgotten art of assembly
The forgotten art of assemblyThe forgotten art of assembly
The forgotten art of assemblyMarian Marinov
 
Javascript engine performance
Javascript engine performanceJavascript engine performance
Javascript engine performanceDuoyi Wu
 
Entendiendo redes convolucionales paso a paso
Entendiendo  redes convolucionales paso a pasoEntendiendo  redes convolucionales paso a paso
Entendiendo redes convolucionales paso a pasomoisesweb
 
Vector Codegen in the RISC-V Backend
Vector Codegen in the RISC-V BackendVector Codegen in the RISC-V Backend
Vector Codegen in the RISC-V BackendIgalia
 
Windows Debugging with WinDbg
Windows Debugging with WinDbgWindows Debugging with WinDbg
Windows Debugging with WinDbgArno Huetter
 
5 - Advanced SVE.pdf
5 - Advanced SVE.pdf5 - Advanced SVE.pdf
5 - Advanced SVE.pdfJunZhao68
 
Arm tools and roadmap for SVE compiler support
Arm tools and roadmap for SVE compiler supportArm tools and roadmap for SVE compiler support
Arm tools and roadmap for SVE compiler supportLinaro
 
Potapenko, vyukov forewarned is forearmed. a san and tsan
Potapenko, vyukov   forewarned is forearmed. a san and tsanPotapenko, vyukov   forewarned is forearmed. a san and tsan
Potapenko, vyukov forewarned is forearmed. a san and tsanDefconRussia
 
Implement an MPI program to perform matrix-matrix multiplication AB .pdf
Implement an MPI program to perform matrix-matrix multiplication AB .pdfImplement an MPI program to perform matrix-matrix multiplication AB .pdf
Implement an MPI program to perform matrix-matrix multiplication AB .pdfmeerobertsonheyde608
 
Abstracting Vector Architectures in Library Generators: Case Study Convolutio...
Abstracting Vector Architectures in Library Generators: Case Study Convolutio...Abstracting Vector Architectures in Library Generators: Case Study Convolutio...
Abstracting Vector Architectures in Library Generators: Case Study Convolutio...ETH Zurich
 
Please convert the following C code to assembly Y86int j,k; .....pdf
Please convert the following C code to assembly Y86int j,k; .....pdfPlease convert the following C code to assembly Y86int j,k; .....pdf
Please convert the following C code to assembly Y86int j,k; .....pdffoottraders
 
EMBEDDED SYSTEMS 4&5
EMBEDDED SYSTEMS 4&5EMBEDDED SYSTEMS 4&5
EMBEDDED SYSTEMS 4&5PRADEEP
 

Semelhante a Automatic Vectorization in ART (Android RunTime) - SFO17-216 (20)

Pragmatic Optimization in Modern Programming - Mastering Compiler Optimizations
Pragmatic Optimization in Modern Programming - Mastering Compiler OptimizationsPragmatic Optimization in Modern Programming - Mastering Compiler Optimizations
Pragmatic Optimization in Modern Programming - Mastering Compiler Optimizations
 
C++ Code as Seen by a Hypercritical Reviewer
C++ Code as Seen by a Hypercritical ReviewerC++ Code as Seen by a Hypercritical Reviewer
C++ Code as Seen by a Hypercritical Reviewer
 
The forgotten art of assembly
The forgotten art of assemblyThe forgotten art of assembly
The forgotten art of assembly
 
Debugging TV Frame 0x02
Debugging TV Frame 0x02Debugging TV Frame 0x02
Debugging TV Frame 0x02
 
Javascript engine performance
Javascript engine performanceJavascript engine performance
Javascript engine performance
 
Entendiendo redes convolucionales paso a paso
Entendiendo  redes convolucionales paso a pasoEntendiendo  redes convolucionales paso a paso
Entendiendo redes convolucionales paso a paso
 
Vector Codegen in the RISC-V Backend
Vector Codegen in the RISC-V BackendVector Codegen in the RISC-V Backend
Vector Codegen in the RISC-V Backend
 
Windows Debugging with WinDbg
Windows Debugging with WinDbgWindows Debugging with WinDbg
Windows Debugging with WinDbg
 
Debugging TV Frame 0x05
Debugging TV Frame 0x05Debugging TV Frame 0x05
Debugging TV Frame 0x05
 
RISC-V Zce Extension
RISC-V Zce ExtensionRISC-V Zce Extension
RISC-V Zce Extension
 
8086 labmanual
8086 labmanual8086 labmanual
8086 labmanual
 
8086 microprocessor lab manual
8086 microprocessor lab manual8086 microprocessor lab manual
8086 microprocessor lab manual
 
5 - Advanced SVE.pdf
5 - Advanced SVE.pdf5 - Advanced SVE.pdf
5 - Advanced SVE.pdf
 
Arm tools and roadmap for SVE compiler support
Arm tools and roadmap for SVE compiler supportArm tools and roadmap for SVE compiler support
Arm tools and roadmap for SVE compiler support
 
Rapport
RapportRapport
Rapport
 
Potapenko, vyukov forewarned is forearmed. a san and tsan
Potapenko, vyukov   forewarned is forearmed. a san and tsanPotapenko, vyukov   forewarned is forearmed. a san and tsan
Potapenko, vyukov forewarned is forearmed. a san and tsan
 
Implement an MPI program to perform matrix-matrix multiplication AB .pdf
Implement an MPI program to perform matrix-matrix multiplication AB .pdfImplement an MPI program to perform matrix-matrix multiplication AB .pdf
Implement an MPI program to perform matrix-matrix multiplication AB .pdf
 
Abstracting Vector Architectures in Library Generators: Case Study Convolutio...
Abstracting Vector Architectures in Library Generators: Case Study Convolutio...Abstracting Vector Architectures in Library Generators: Case Study Convolutio...
Abstracting Vector Architectures in Library Generators: Case Study Convolutio...
 
Please convert the following C code to assembly Y86int j,k; .....pdf
Please convert the following C code to assembly Y86int j,k; .....pdfPlease convert the following C code to assembly Y86int j,k; .....pdf
Please convert the following C code to assembly Y86int j,k; .....pdf
 
EMBEDDED SYSTEMS 4&5
EMBEDDED SYSTEMS 4&5EMBEDDED SYSTEMS 4&5
EMBEDDED SYSTEMS 4&5
 

Mais de Linaro

Deep Learning Neural Network Acceleration at the Edge - Andrea Gallo
Deep Learning Neural Network Acceleration at the Edge - Andrea GalloDeep Learning Neural Network Acceleration at the Edge - Andrea Gallo
Deep Learning Neural Network Acceleration at the Edge - Andrea GalloLinaro
 
Arm Architecture HPC Workshop Santa Clara 2018 - Kanta Vekaria
Arm Architecture HPC Workshop Santa Clara 2018 - Kanta VekariaArm Architecture HPC Workshop Santa Clara 2018 - Kanta Vekaria
Arm Architecture HPC Workshop Santa Clara 2018 - Kanta VekariaLinaro
 
Huawei’s requirements for the ARM based HPC solution readiness - Joshua Mora
Huawei’s requirements for the ARM based HPC solution readiness - Joshua MoraHuawei’s requirements for the ARM based HPC solution readiness - Joshua Mora
Huawei’s requirements for the ARM based HPC solution readiness - Joshua MoraLinaro
 
Bud17 113: distribution ci using qemu and open qa
Bud17 113: distribution ci using qemu and open qaBud17 113: distribution ci using qemu and open qa
Bud17 113: distribution ci using qemu and open qaLinaro
 
OpenHPC Automation with Ansible - Renato Golin - Linaro Arm HPC Workshop 2018
OpenHPC Automation with Ansible - Renato Golin - Linaro Arm HPC Workshop 2018OpenHPC Automation with Ansible - Renato Golin - Linaro Arm HPC Workshop 2018
OpenHPC Automation with Ansible - Renato Golin - Linaro Arm HPC Workshop 2018Linaro
 
HPC network stack on ARM - Linaro HPC Workshop 2018
HPC network stack on ARM - Linaro HPC Workshop 2018HPC network stack on ARM - Linaro HPC Workshop 2018
HPC network stack on ARM - Linaro HPC Workshop 2018Linaro
 
It just keeps getting better - SUSE enablement for Arm - Linaro HPC Workshop ...
It just keeps getting better - SUSE enablement for Arm - Linaro HPC Workshop ...It just keeps getting better - SUSE enablement for Arm - Linaro HPC Workshop ...
It just keeps getting better - SUSE enablement for Arm - Linaro HPC Workshop ...Linaro
 
Intelligent Interconnect Architecture to Enable Next Generation HPC - Linaro ...
Intelligent Interconnect Architecture to Enable Next Generation HPC - Linaro ...Intelligent Interconnect Architecture to Enable Next Generation HPC - Linaro ...
Intelligent Interconnect Architecture to Enable Next Generation HPC - Linaro ...Linaro
 
Yutaka Ishikawa - Post-K and Arm HPC Ecosystem - Linaro Arm HPC Workshop Sant...
Yutaka Ishikawa - Post-K and Arm HPC Ecosystem - Linaro Arm HPC Workshop Sant...Yutaka Ishikawa - Post-K and Arm HPC Ecosystem - Linaro Arm HPC Workshop Sant...
Yutaka Ishikawa - Post-K and Arm HPC Ecosystem - Linaro Arm HPC Workshop Sant...Linaro
 
Andrew J Younge - Vanguard Astra - Petascale Arm Platform for U.S. DOE/ASC Su...
Andrew J Younge - Vanguard Astra - Petascale Arm Platform for U.S. DOE/ASC Su...Andrew J Younge - Vanguard Astra - Petascale Arm Platform for U.S. DOE/ASC Su...
Andrew J Younge - Vanguard Astra - Petascale Arm Platform for U.S. DOE/ASC Su...Linaro
 
HKG18-501 - EAS on Common Kernel 4.14 and getting (much) closer to mainline
HKG18-501 - EAS on Common Kernel 4.14 and getting (much) closer to mainlineHKG18-501 - EAS on Common Kernel 4.14 and getting (much) closer to mainline
HKG18-501 - EAS on Common Kernel 4.14 and getting (much) closer to mainlineLinaro
 
HKG18-100K1 - George Grey: Opening Keynote
HKG18-100K1 - George Grey: Opening KeynoteHKG18-100K1 - George Grey: Opening Keynote
HKG18-100K1 - George Grey: Opening KeynoteLinaro
 
HKG18-318 - OpenAMP Workshop
HKG18-318 - OpenAMP WorkshopHKG18-318 - OpenAMP Workshop
HKG18-318 - OpenAMP WorkshopLinaro
 
HKG18-501 - EAS on Common Kernel 4.14 and getting (much) closer to mainline
HKG18-501 - EAS on Common Kernel 4.14 and getting (much) closer to mainlineHKG18-501 - EAS on Common Kernel 4.14 and getting (much) closer to mainline
HKG18-501 - EAS on Common Kernel 4.14 and getting (much) closer to mainlineLinaro
 
HKG18-315 - Why the ecosystem is a wonderful thing, warts and all
HKG18-315 - Why the ecosystem is a wonderful thing, warts and allHKG18-315 - Why the ecosystem is a wonderful thing, warts and all
HKG18-315 - Why the ecosystem is a wonderful thing, warts and allLinaro
 
HKG18- 115 - Partitioning ARM Systems with the Jailhouse Hypervisor
HKG18- 115 - Partitioning ARM Systems with the Jailhouse HypervisorHKG18- 115 - Partitioning ARM Systems with the Jailhouse Hypervisor
HKG18- 115 - Partitioning ARM Systems with the Jailhouse HypervisorLinaro
 
HKG18-TR08 - Upstreaming SVE in QEMU
HKG18-TR08 - Upstreaming SVE in QEMUHKG18-TR08 - Upstreaming SVE in QEMU
HKG18-TR08 - Upstreaming SVE in QEMULinaro
 
HKG18-113- Secure Data Path work with i.MX8M
HKG18-113- Secure Data Path work with i.MX8MHKG18-113- Secure Data Path work with i.MX8M
HKG18-113- Secure Data Path work with i.MX8MLinaro
 
HKG18-120 - Devicetree Schema Documentation and Validation
HKG18-120 - Devicetree Schema Documentation and Validation HKG18-120 - Devicetree Schema Documentation and Validation
HKG18-120 - Devicetree Schema Documentation and Validation Linaro
 
HKG18-223 - Trusted FirmwareM: Trusted boot
HKG18-223 - Trusted FirmwareM: Trusted bootHKG18-223 - Trusted FirmwareM: Trusted boot
HKG18-223 - Trusted FirmwareM: Trusted bootLinaro
 

Mais de Linaro (20)

Deep Learning Neural Network Acceleration at the Edge - Andrea Gallo
Deep Learning Neural Network Acceleration at the Edge - Andrea GalloDeep Learning Neural Network Acceleration at the Edge - Andrea Gallo
Deep Learning Neural Network Acceleration at the Edge - Andrea Gallo
 
Arm Architecture HPC Workshop Santa Clara 2018 - Kanta Vekaria
Arm Architecture HPC Workshop Santa Clara 2018 - Kanta VekariaArm Architecture HPC Workshop Santa Clara 2018 - Kanta Vekaria
Arm Architecture HPC Workshop Santa Clara 2018 - Kanta Vekaria
 
Huawei’s requirements for the ARM based HPC solution readiness - Joshua Mora
Huawei’s requirements for the ARM based HPC solution readiness - Joshua MoraHuawei’s requirements for the ARM based HPC solution readiness - Joshua Mora
Huawei’s requirements for the ARM based HPC solution readiness - Joshua Mora
 
Bud17 113: distribution ci using qemu and open qa
Bud17 113: distribution ci using qemu and open qaBud17 113: distribution ci using qemu and open qa
Bud17 113: distribution ci using qemu and open qa
 
OpenHPC Automation with Ansible - Renato Golin - Linaro Arm HPC Workshop 2018
OpenHPC Automation with Ansible - Renato Golin - Linaro Arm HPC Workshop 2018OpenHPC Automation with Ansible - Renato Golin - Linaro Arm HPC Workshop 2018
OpenHPC Automation with Ansible - Renato Golin - Linaro Arm HPC Workshop 2018
 
HPC network stack on ARM - Linaro HPC Workshop 2018
HPC network stack on ARM - Linaro HPC Workshop 2018HPC network stack on ARM - Linaro HPC Workshop 2018
HPC network stack on ARM - Linaro HPC Workshop 2018
 
It just keeps getting better - SUSE enablement for Arm - Linaro HPC Workshop ...
It just keeps getting better - SUSE enablement for Arm - Linaro HPC Workshop ...It just keeps getting better - SUSE enablement for Arm - Linaro HPC Workshop ...
It just keeps getting better - SUSE enablement for Arm - Linaro HPC Workshop ...
 
Intelligent Interconnect Architecture to Enable Next Generation HPC - Linaro ...
Intelligent Interconnect Architecture to Enable Next Generation HPC - Linaro ...Intelligent Interconnect Architecture to Enable Next Generation HPC - Linaro ...
Intelligent Interconnect Architecture to Enable Next Generation HPC - Linaro ...
 
Yutaka Ishikawa - Post-K and Arm HPC Ecosystem - Linaro Arm HPC Workshop Sant...
Yutaka Ishikawa - Post-K and Arm HPC Ecosystem - Linaro Arm HPC Workshop Sant...Yutaka Ishikawa - Post-K and Arm HPC Ecosystem - Linaro Arm HPC Workshop Sant...
Yutaka Ishikawa - Post-K and Arm HPC Ecosystem - Linaro Arm HPC Workshop Sant...
 
Andrew J Younge - Vanguard Astra - Petascale Arm Platform for U.S. DOE/ASC Su...
Andrew J Younge - Vanguard Astra - Petascale Arm Platform for U.S. DOE/ASC Su...Andrew J Younge - Vanguard Astra - Petascale Arm Platform for U.S. DOE/ASC Su...
Andrew J Younge - Vanguard Astra - Petascale Arm Platform for U.S. DOE/ASC Su...
 
HKG18-501 - EAS on Common Kernel 4.14 and getting (much) closer to mainline
HKG18-501 - EAS on Common Kernel 4.14 and getting (much) closer to mainlineHKG18-501 - EAS on Common Kernel 4.14 and getting (much) closer to mainline
HKG18-501 - EAS on Common Kernel 4.14 and getting (much) closer to mainline
 
HKG18-100K1 - George Grey: Opening Keynote
HKG18-100K1 - George Grey: Opening KeynoteHKG18-100K1 - George Grey: Opening Keynote
HKG18-100K1 - George Grey: Opening Keynote
 
HKG18-318 - OpenAMP Workshop
HKG18-318 - OpenAMP WorkshopHKG18-318 - OpenAMP Workshop
HKG18-318 - OpenAMP Workshop
 
HKG18-501 - EAS on Common Kernel 4.14 and getting (much) closer to mainline
HKG18-501 - EAS on Common Kernel 4.14 and getting (much) closer to mainlineHKG18-501 - EAS on Common Kernel 4.14 and getting (much) closer to mainline
HKG18-501 - EAS on Common Kernel 4.14 and getting (much) closer to mainline
 
HKG18-315 - Why the ecosystem is a wonderful thing, warts and all
HKG18-315 - Why the ecosystem is a wonderful thing, warts and allHKG18-315 - Why the ecosystem is a wonderful thing, warts and all
HKG18-315 - Why the ecosystem is a wonderful thing, warts and all
 
HKG18- 115 - Partitioning ARM Systems with the Jailhouse Hypervisor
HKG18- 115 - Partitioning ARM Systems with the Jailhouse HypervisorHKG18- 115 - Partitioning ARM Systems with the Jailhouse Hypervisor
HKG18- 115 - Partitioning ARM Systems with the Jailhouse Hypervisor
 
HKG18-TR08 - Upstreaming SVE in QEMU
HKG18-TR08 - Upstreaming SVE in QEMUHKG18-TR08 - Upstreaming SVE in QEMU
HKG18-TR08 - Upstreaming SVE in QEMU
 
HKG18-113- Secure Data Path work with i.MX8M
HKG18-113- Secure Data Path work with i.MX8MHKG18-113- Secure Data Path work with i.MX8M
HKG18-113- Secure Data Path work with i.MX8M
 
HKG18-120 - Devicetree Schema Documentation and Validation
HKG18-120 - Devicetree Schema Documentation and Validation HKG18-120 - Devicetree Schema Documentation and Validation
HKG18-120 - Devicetree Schema Documentation and Validation
 
HKG18-223 - Trusted FirmwareM: Trusted boot
HKG18-223 - Trusted FirmwareM: Trusted bootHKG18-223 - Trusted FirmwareM: Trusted boot
HKG18-223 - Trusted FirmwareM: Trusted boot
 

Último

Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...
Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...
Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...Neo4j
 
Factors to Consider When Choosing Accounts Payable Services Providers.pptx
Factors to Consider When Choosing Accounts Payable Services Providers.pptxFactors to Consider When Choosing Accounts Payable Services Providers.pptx
Factors to Consider When Choosing Accounts Payable Services Providers.pptxKatpro Technologies
 
08448380779 Call Girls In Greater Kailash - I Women Seeking Men
08448380779 Call Girls In Greater Kailash - I Women Seeking Men08448380779 Call Girls In Greater Kailash - I Women Seeking Men
08448380779 Call Girls In Greater Kailash - I Women Seeking MenDelhi Call girls
 
08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking Men08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking MenDelhi Call girls
 
TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law DevelopmentsTrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law DevelopmentsTrustArc
 
Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024The Digital Insurer
 
Driving Behavioral Change for Information Management through Data-Driven Gree...
Driving Behavioral Change for Information Management through Data-Driven Gree...Driving Behavioral Change for Information Management through Data-Driven Gree...
Driving Behavioral Change for Information Management through Data-Driven Gree...Enterprise Knowledge
 
WhatsApp 9892124323 ✓Call Girls In Kalyan ( Mumbai ) secure service
WhatsApp 9892124323 ✓Call Girls In Kalyan ( Mumbai ) secure serviceWhatsApp 9892124323 ✓Call Girls In Kalyan ( Mumbai ) secure service
WhatsApp 9892124323 ✓Call Girls In Kalyan ( Mumbai ) secure servicePooja Nehwal
 
Breaking the Kubernetes Kill Chain: Host Path Mount
Breaking the Kubernetes Kill Chain: Host Path MountBreaking the Kubernetes Kill Chain: Host Path Mount
Breaking the Kubernetes Kill Chain: Host Path MountPuma Security, LLC
 
The Codex of Business Writing Software for Real-World Solutions 2.pptx
The Codex of Business Writing Software for Real-World Solutions 2.pptxThe Codex of Business Writing Software for Real-World Solutions 2.pptx
The Codex of Business Writing Software for Real-World Solutions 2.pptxMalak Abu Hammad
 
How to convert PDF to text with Nanonets
How to convert PDF to text with NanonetsHow to convert PDF to text with Nanonets
How to convert PDF to text with Nanonetsnaman860154
 
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Igalia
 
Workshop - Best of Both Worlds_ Combine KG and Vector search for enhanced R...
Workshop - Best of Both Worlds_ Combine  KG and Vector search for  enhanced R...Workshop - Best of Both Worlds_ Combine  KG and Vector search for  enhanced R...
Workshop - Best of Both Worlds_ Combine KG and Vector search for enhanced R...Neo4j
 
Developing An App To Navigate The Roads of Brazil
Developing An App To Navigate The Roads of BrazilDeveloping An App To Navigate The Roads of Brazil
Developing An App To Navigate The Roads of BrazilV3cube
 
Handwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsHandwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsMaria Levchenko
 
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdfThe Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdfEnterprise Knowledge
 
A Call to Action for Generative AI in 2024
A Call to Action for Generative AI in 2024A Call to Action for Generative AI in 2024
A Call to Action for Generative AI in 2024Results
 
Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024The Digital Insurer
 
2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...Martijn de Jong
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerThousandEyes
 

Último (20)

Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...
Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...
Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...
 
Factors to Consider When Choosing Accounts Payable Services Providers.pptx
Factors to Consider When Choosing Accounts Payable Services Providers.pptxFactors to Consider When Choosing Accounts Payable Services Providers.pptx
Factors to Consider When Choosing Accounts Payable Services Providers.pptx
 
08448380779 Call Girls In Greater Kailash - I Women Seeking Men
08448380779 Call Girls In Greater Kailash - I Women Seeking Men08448380779 Call Girls In Greater Kailash - I Women Seeking Men
08448380779 Call Girls In Greater Kailash - I Women Seeking Men
 
08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking Men08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking Men
 
TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law DevelopmentsTrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
 
Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024
 
Driving Behavioral Change for Information Management through Data-Driven Gree...
Driving Behavioral Change for Information Management through Data-Driven Gree...Driving Behavioral Change for Information Management through Data-Driven Gree...
Driving Behavioral Change for Information Management through Data-Driven Gree...
 
WhatsApp 9892124323 ✓Call Girls In Kalyan ( Mumbai ) secure service
WhatsApp 9892124323 ✓Call Girls In Kalyan ( Mumbai ) secure serviceWhatsApp 9892124323 ✓Call Girls In Kalyan ( Mumbai ) secure service
WhatsApp 9892124323 ✓Call Girls In Kalyan ( Mumbai ) secure service
 
Breaking the Kubernetes Kill Chain: Host Path Mount
Breaking the Kubernetes Kill Chain: Host Path MountBreaking the Kubernetes Kill Chain: Host Path Mount
Breaking the Kubernetes Kill Chain: Host Path Mount
 
The Codex of Business Writing Software for Real-World Solutions 2.pptx
The Codex of Business Writing Software for Real-World Solutions 2.pptxThe Codex of Business Writing Software for Real-World Solutions 2.pptx
The Codex of Business Writing Software for Real-World Solutions 2.pptx
 
How to convert PDF to text with Nanonets
How to convert PDF to text with NanonetsHow to convert PDF to text with Nanonets
How to convert PDF to text with Nanonets
 
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
 
Workshop - Best of Both Worlds_ Combine KG and Vector search for enhanced R...
Workshop - Best of Both Worlds_ Combine  KG and Vector search for  enhanced R...Workshop - Best of Both Worlds_ Combine  KG and Vector search for  enhanced R...
Workshop - Best of Both Worlds_ Combine KG and Vector search for enhanced R...
 
Developing An App To Navigate The Roads of Brazil
Developing An App To Navigate The Roads of BrazilDeveloping An App To Navigate The Roads of Brazil
Developing An App To Navigate The Roads of Brazil
 
Handwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsHandwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed texts
 
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdfThe Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
 
A Call to Action for Generative AI in 2024
A Call to Action for Generative AI in 2024A Call to Action for Generative AI in 2024
A Call to Action for Generative AI in 2024
 
Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024
 
2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected Worker
 

Automatic Vectorization in ART (Android RunTime) - SFO17-216

  • 1.
  • 5. K (Android 4.4): Dalvik + JIT compiler L (Android 5.0): ART + AOT compiler M (Android 6.0): ART + AOT compiler N (Android 7.0): ART + JIT/AOT compiler O (Android 8.0): ART + JIT/AOT compiler + vectorization
  • 8. A SIMD instruction performs a single operation on multiple operands in parallel ARM: NEON Technology (128-bit) Intel: SSE* (128-bit) AVX* (256-bit, 512-bit) MIPS: MSA (128-bit) All modern general-purpose CPUs support small-scale SIMD instructions (typically between 64-bit and 512-bit) 4x32-bit operations
  • 10. ● Many vectorizing compilers were developed by supercomputer vendors ● Intel introduced the first vectorizing compiler for SSE in 1999 ● Since the Android O release, the optimizing compiler of ART has joined the family of vectorizing compilers www.aartbik.com
  • 12. for (int i = 0; i < 256; i++) { for (int i = 0; i < 256; i += 4) { a[i] = b[i] + 1; -> a[i:i+3] = b[i:i+3] + [1,1,1,1]; } }
  • 13. Ronny Reader Abby Author Wendy Writer Perry Presenter Vinny Viewer Molly Maker Casey Creator VectorOperation VectorMemOp VectorBinOp VectorAdd VectorSub VectorLoad VectorStore …. …. has alignment has vector length has packed data type A class hierarchy of general vector operations that is sufficiently powerful to represent SIMD operations common to all architectures
  • 14. t = [1,1,1,1]; for (int i = 0; i < 256; i += 4) { -> for (int i = 0; i < 256; i += 8) { a[i:i+3] = b[i:i+3] + [1,1,1,1]; a[i :i+3] = b[i :i+3] + t; } a[i+4:i+7] = b[i+4:i+7] + t; }
  • 15.
  • 16. t = [1,1,1,1]; for (int i = 0; i < 256; i += 8) { -> a[i:i+3] = b[i:i+3] + t; a[i+4:i+7] = b[i+4:i+7] + t; } movi v0.4s, #0x1, lsl #0 mov w3, #0xc mov w0, #0x0 Loop: cmp w0, #0x100 (256) b.hs Exit add w4, w0, #0x4 (4) add w0, w3, w0, lsl #2 add w5, w3, w4, lsl #2 ldr q1, [x2, x0] add v1.4s, v1.4s, v0.4s str q1, [x1, x0] ldr q1, [x2, x5] add v1.4s, v1.4s, v0.4s str q1, [x1, x5] add w0, w4, #0x4 (4) ldrh w16, [tr] ; suspend check cbz w16, Loop
  • 17. VecReplicateScalar(x) ARM64 x86-64 MIPS64 dup v0.4s, w2 movdq xmm0, rdx fill.w w0, a2 pshufd xmm0, xmm0, 0
  • 18. /** * Cross-fade byte arrays x1 and x2 into byte array x_out. */ private static void avg(byte[] x_out, byte[] x1, byte[] x2) { // Compute minimum length of the three byte arrays. int min = Math.min(x_out.length, Math.min(x1.length, x2.length)); // Morph with rounding halving add (unsigned). for (int i = 0; i < min; i++) { x_out[i] = (byte) (((x1[i] & 0xff) + (x2[i] & 0xff) + 1) >> 1); } }
  • 19. SEQUENTIAL (ARMv8 AArch64) L:cmp w5, w0 b.hs Exit add w4, w2, #0xc (12) add w6, w3, #0xc (12) ldrsb w4, [x4, x5] ldrsb w6, [x6, x5] and w4, w4, #0xff and w6, w6, #0xff add w4, w4, w6 add w6, w1, #0xc (12) add w4, w4, #0x1 (1) asr w4, w4, #1 strb w4, [x6, x5] add w5, w5, #0x1 (1) ldrh w16, [tr] ; suspend check cbz w16, L SIMD (ARMv8 AArch64 + NEON Technology) L:cmp w5, w4 b.hs Exit add w16, w2, w5 ldur q0, [x16, #12] add w16, w3, w5 ldur q1, [x16, #12] urhadd v0.16b, v0.16b, v1.16b add w16, w1, w5 stur q0, [x16, #12] add w5, w5, #0x10 (16) ldrh w16, [tr] ; suspend check cbz w16, L Runs about 10x faster!
  • 20. Sequential performance SIMD performance (NEON 128-bit) ≈20fps ≈60fps
  • 22. ENGINEERS AND DEVICES WORKING TOGETHER Java code Autovectorization result void mul_add(int[] a, int[] b) { for (int i = 0; i < 512; i++) { a[i] += a[i] * b[i]; } } ● ○ ● ○ ○
  • 23. ENGINEERS AND DEVICES WORKING TOGETHER Java code Autovectorization result void mul_add(int[] a, int[] b) { for (int i = 0; i < 512; i++) { a[i] += a[i] * b[i]; } } L: cmp w0, #0x200 b.hs Exit add w16, w1, #0xc add x16, x16, x0, lsl #2 ld1 {v0.2s}, [x16] add w16, w2, #0xc add x16, x16, x0, lsl #2 ld1 {v1.2s}, [x16] mul v1.2s, v0.2s, v1.2s add v0.2s, v0.2s, v1.2s add w16, w1, #0xc add x16, x16, x0, lsl #2 st1 {v0.2s}, [x16] add w0, w0, #0x2 ldrh w16, [tr] cbz w16, L ● ○ ● ○ ○ ● ○ ○ ● ○
  • 24. ENGINEERS AND DEVICES WORKING TOGETHER Before After (68% perf boost) L: cmp w0, #0x200 b.hs Exit add w16, w1, #0xc add x16, x16, x0, lsl #2 ld1 {v0.2s}, [x16] add w16, w2, #0xc add x16, x16, x0, lsl #2 ld1 {v1.2s}, [x16] mul v1.2s, v0.2s, v1.2s add v0.2s, v0.2s, v1.2s add w16, w1, #0xc add x16, x16, x0, lsl #2 st1 {v0.2s}, [x16] add w0, w0, #0x2 ldrh w16, [tr] cbz w16, L L: cmp w0, #0x200 b.hs Exit add w16, w1, #0xc add x16, x16, x0, lsl #2 ld1 {v0.4s}, [x16] add w16, w2, #0xc add x16, x16, x0, lsl #2 ld1 {v1.4s}, [x16] mul v1.4s, v0.4s, v1.4s add v0.4s, v0.4s, v1.4s add w16, w1, #0xc add x16, x16, x0, lsl #2 st1 {v0.4s}, [x16] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L ●
  • 25. ENGINEERS AND DEVICES WORKING TOGETHER Before After (68% perf boost) L: cmp w0, #0x200 b.hs Exit add w16, w1, #0xc add x16, x16, x0, lsl #2 ld1 {v0.2s}, [x16] add w16, w2, #0xc add x16, x16, x0, lsl #2 ld1 {v1.2s}, [x16] mul v1.2s, v0.2s, v1.2s add v0.2s, v0.2s, v1.2s add w16, w1, #0xc add x16, x16, x0, lsl #2 st1 {v0.2s}, [x16] add w0, w0, #0x2 ldrh w16, [tr] cbz w16, L L: cmp w0, #0x200 b.hs Exit add w16, w1, #0xc add x16, x16, x0, lsl #2 ld1 {v0.4s}, [x16] add w16, w2, #0xc add x16, x16, x0, lsl #2 ld1 {v1.4s}, [x16] mul v1.4s, v0.4s, v1.4s add v0.4s, v0.4s, v1.4s add w16, w1, #0xc add x16, x16, x0, lsl #2 st1 {v0.4s}, [x16] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L ●
  • 26. ENGINEERS AND DEVICES WORKING TOGETHER Before After (68% perf boost) L: cmp w0, #0x200 b.hs Exit add w16, w1, #0xc add x16, x16, x0, lsl #2 ld1 {v0.2s}, [x16] add w16, w2, #0xc add x16, x16, x0, lsl #2 ld1 {v1.2s}, [x16] mul v1.2s, v0.2s, v1.2s add v0.2s, v0.2s, v1.2s add w16, w1, #0xc add x16, x16, x0, lsl #2 st1 {v0.2s}, [x16] add w0, w0, #0x2 ldrh w16, [tr] cbz w16, L L: cmp w0, #0x200 b.hs Exit add w16, w1, #0xc add x16, x16, x0, lsl #2 ld1 {v0.4s}, [x16] add w16, w2, #0xc add x16, x16, x0, lsl #2 ld1 {v1.4s}, [x16] mul v1.4s, v0.4s, v1.4s add v0.4s, v0.4s, v1.4s add w16, w1, #0xc add x16, x16, x0, lsl #2 st1 {v0.4s}, [x16] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L ● ● ○ ● ○ ● ○ ●
  • 27. ENGINEERS AND DEVICES WORKING TOGETHER Before After (11% perf boost) L: cmp w0, #0x200 b.hs Exit add w16, w1, #0xc add x16, x16, x0, lsl #2 ld1 {v0.4s}, [x16] add w16, w2, #0xc add x16, x16, x0, lsl #2 ld1 {v1.4s}, [x16] mul v1.4s, v0.4s, v1.4s add v0.4s, v0.4s, v1.4s add w16, w1, #0xc add x16, x16, x0, lsl #2 st1 {v0.4s}, [x16] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L L: cmp w0, #0x200 b.hs Exit add w16, w1, #0xc add x16, x16, x0, lsl #2 ld1 {v0.4s}, [x16] add w16, w2, #0xc add x16, x16, x0, lsl #2 ld1 {v1.4s}, [x16] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s add w16, w1, #0xc add x16, x16, x0, lsl #2 st1 {v2.4s}, [x16] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L ● ○
  • 28. ENGINEERS AND DEVICES WORKING TOGETHER Before After (11% perf boost) L: cmp w0, #0x200 b.hs Exit add w16, w1, #0xc add x16, x16, x0, lsl #2 ld1 {v0.4s}, [x16] add w16, w2, #0xc add x16, x16, x0, lsl #2 ld1 {v1.4s}, [x16] mul v1.4s, v0.4s, v1.4s add v0.4s, v0.4s, v1.4s add w16, w1, #0xc add x16, x16, x0, lsl #2 st1 {v0.4s}, [x16] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L L: cmp w0, #0x200 b.hs Exit add w16, w1, #0xc add x16, x16, x0, lsl #2 ld1 {v0.4s}, [x16] add w16, w2, #0xc add x16, x16, x0, lsl #2 ld1 {v1.4s}, [x16] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s add w16, w1, #0xc add x16, x16, x0, lsl #2 st1 {v2.4s}, [x16] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L ● ○
  • 29. ENGINEERS AND DEVICES WORKING TOGETHER Before After (11% perf boost) L: cmp w0, #0x200 b.hs Exit add w16, w1, #0xc add x16, x16, x0, lsl #2 ld1 {v0.4s}, [x16] add w16, w2, #0xc add x16, x16, x0, lsl #2 ld1 {v1.4s}, [x16] mul v1.4s, v0.4s, v1.4s add v0.4s, v0.4s, v1.4s add w16, w1, #0xc add x16, x16, x0, lsl #2 st1 {v0.4s}, [x16] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L L: cmp w0, #0x200 b.hs Exit add w16, w1, #0xc add x16, x16, x0, lsl #2 ld1 {v0.4s}, [x16] add w16, w2, #0xc add x16, x16, x0, lsl #2 ld1 {v1.4s}, [x16] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s add w16, w1, #0xc add x16, x16, x0, lsl #2 st1 {v2.4s}, [x16] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L ● ○ ● ○ ○ ○
  • 30. ENGINEERS AND DEVICES WORKING TOGETHER Before After (23% perf boost) L: cmp w0, #0x200 b.hs Exit add w16, w1, #0xc add x16, x16, x0, lsl #2 ld1 {v0.4s}, [x16] add w16, w2, #0xc add x16, x16, x0, lsl #2 ld1 {v1.4s}, [x16] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s add w16, w1, #0xc add x16, x16, x0, lsl #2 st1 {v2.4s}, [x16] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L L: cmp w0, #0x200 b.hs Exit add w16, w1, w0, lsl #2 ldur q0, [x16, #12] add w16, w2, w0, lsl #2 ldur q1, [x16, #12] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s add w16, w1, w0, lsl #2 stur q2, [x16, #12] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L ● ○ ● ○ ● ○ ○ ○ ○
  • 31. ENGINEERS AND DEVICES WORKING TOGETHER Before After (23% perf boost) L: cmp w0, #0x200 b.hs Exit add w16, w1, #0xc add x16, x16, x0, lsl #2 ld1 {v0.4s}, [x16] add w16, w2, #0xc add x16, x16, x0, lsl #2 ld1 {v1.4s}, [x16] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s add w16, w1, #0xc add x16, x16, x0, lsl #2 st1 {v2.4s}, [x16] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L L: cmp w0, #0x200 b.hs Exit add w16, w1, w0, lsl #2 ldur q0, [x16, #12] add w16, w2, w0, lsl #2 ldur q1, [x16, #12] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s add w16, w1, w0, lsl #2 stur q2, [x16, #12] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L ● ○ ● ○ ● ○ ○ ○ ○
  • 32. ENGINEERS AND DEVICES WORKING TOGETHER Before After (23% perf boost) L: cmp w0, #0x200 b.hs Exit add w16, w1, #0xc add x16, x16, x0, lsl #2 ld1 {v0.4s}, [x16] add w16, w2, #0xc add x16, x16, x0, lsl #2 ld1 {v1.4s}, [x16] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s add w16, w1, #0xc add x16, x16, x0, lsl #2 st1 {v2.4s}, [x16] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L L: cmp w0, #0x200 b.hs Exit add w16, w1, w0, lsl #2 ldur q0, [x16, #12] add w16, w2, w0, lsl #2 ldur q1, [x16, #12] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s add w16, w1, w0, lsl #2 stur q2, [x16, #12] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L ● ○ ● ○ ● ○ ○ ○ ○ ● ○ ○ ●
  • 33. ENGINEERS AND DEVICES WORKING TOGETHER Before After (10% perf boost) L: cmp w0, #0x200 b.hs Exit add w16, w1, w0, lsl #2 ldur q0, [x16, #12] add w16, w2, w0, lsl #2 ldur q1, [x16, #12] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s add w16, w1, w0, lsl #2 stur q2, [x16, #12] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L mov w3, #0xc L: cmp w0, #0x200 b.hs Exit add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L ● ○ ● ○ ○ ●
  • 34. ENGINEERS AND DEVICES WORKING TOGETHER Before After (10% perf boost) L: cmp w0, #0x200 b.hs Exit add w16, w1, w0, lsl #2 ldur q0, [x16, #12] add w16, w2, w0, lsl #2 ldur q1, [x16, #12] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s add w16, w1, w0, lsl #2 stur q2, [x16, #12] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L mov w3, #0xc L: cmp w0, #0x200 b.hs Exit add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L ● ○ ● ○ ○ ●
  • 35. ENGINEERS AND DEVICES WORKING TOGETHER Before After (10% perf boost) L: cmp w0, #0x200 b.hs Exit add w16, w1, w0, lsl #2 ldur q0, [x16, #12] add w16, w2, w0, lsl #2 ldur q1, [x16, #12] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s add w16, w1, w0, lsl #2 stur q2, [x16, #12] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L mov w3, #0xc L: cmp w0, #0x200 b.hs Exit add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L ● ○ ● ○ ○ ● ●
  • 37. ENGINEERS AND DEVICES WORKING TOGETHER Before After (2.5% perf boost) L: cmp w0, #0x200 b.hs Exit add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L L: cmp w0, #0x200 b.hs Exit add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L ●
  • 38. ENGINEERS AND DEVICES WORKING TOGETHER Before After (2.5% perf boost) L: cmp w0, #0x200 b.hs Exit add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L L: cmp w0, #0x200 b.hs Exit add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L ●
  • 39. ENGINEERS AND DEVICES WORKING TOGETHER Before After (2.5% perf boost) L: cmp w0, #0x200 b.hs Exit add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L L: cmp w0, #0x200 b.hs Exit add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L ● ● ○ ○
  • 40. ENGINEERS AND DEVICES WORKING TOGETHER Before After (12% perf boost) L: cmp w0, #0x200 b.hs Exit add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L L: cmp w0, #0x200 b.hs Exit add w4, w0, #0x4 add w0, w3, w0, lsl #2 add w5, w3, w4, lsl #2 ldr q0, [x1, x0] ldr q1, [x2, x0] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x0] ldr q0, [x1, x5] ldr q1, [x2, x5] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x5] add w0, w4, #0x4 ldrh w16, [tr] cbz w16, L ●
  • 41. ENGINEERS AND DEVICES WORKING TOGETHER Before After (12% perf boost) L: cmp w0, #0x200 b.hs Exit add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L L: cmp w0, #0x200 b.hs Exit add w4, w0, #0x4 add w0, w3, w0, lsl #2 add w5, w3, w4, lsl #2 ldr q0, [x1, x0] ldr q1, [x2, x0] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x0] ldr q0, [x1, x5] ldr q1, [x2, x5] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x5] add w0, w4, #0x4 ldrh w16, [tr] cbz w16, L ●
  • 42. ENGINEERS AND DEVICES WORKING TOGETHER Before After (12% perf boost) L: cmp w0, #0x200 b.hs Exit add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L L: cmp w0, #0x200 b.hs Exit add w4, w0, #0x4 add w0, w3, w0, lsl #2 add w5, w3, w4, lsl #2 ldr q0, [x1, x0] ldr q1, [x2, x0] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x0] ldr q0, [x1, x5] ldr q1, [x2, x5] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x5] add w0, w4, #0x4 ldrh w16, [tr] cbz w16, L ● ● ● ○ ● ○ ○ ● ○
  • 43. ENGINEERS AND DEVICES WORKING TOGETHER Before After (12% perf boost) L: cmp w0, #0x200 b.hs Exit add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 add w4, w3, w0, lsl #2 ldr q0, [x1, x4] ldr q1, [x2, x4] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x4] add w0, w0, #0x4 ldrh w16, [tr] cbz w16, L L: cmp w0, #0x200 b.hs Exit add w4, w0, #0x4 add w0, w3, w0, lsl #2 add w5, w3, w4, lsl #2 ldr q0, [x1, x0] ldr q1, [x2, x0] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x0] ldr q0, [x1, x5] ldr q1, [x2, x5] mov v2.16b, v0.16b mla v2.4s, v0.4s, v1.4s str q2, [x1, x5] add w0, w4, #0x4 ldrh w16, [tr] cbz w16, L ● ● ● ○ ● ○ ○ ● ○ ●
  • 44. ENGINEERS AND DEVICES WORKING TOGETHER for (int i = 0; i < LENGTH; i++) { c[i] = (byte)(a[i] + b[i]); } i87 Add [i80,i79] i102 IntermediateAddressIndex [i87,i98,i3] i99 IntermediateAddressIndex [i80,i98,i3] d89 VecLoad [l35,i102] d84 VecLoad [l35,i99] d83 VecLoad [l29,i99] d88 VecLoad [l29,i102] d85 VecAdd [d83,d84] d90 VecAdd [d88,d89] d86 VecStore [l27,i99,d85] d91 VecStore [l27,i102,d90] i92 Add [i87,i79] v78 Goto ● ○ ○ ●
  • 45. ENGINEERS AND DEVICES WORKING TOGETHER (gdb) x/64u 0xefc0b000 0xefc0b000: 0 28 192 18 0 0 0 0 0xefc0b008: 0 0 4 0 100 101 102 103 0xefc0b010: 104 105 106 107 108 109 110 111 0xefc0b018: 112 113 114 115 116 117 118 119 0xefc0b020: 120 121 122 123 124 125 126 127 0xefc0b028: 128 129 130 131 132 133 134 135 0xefc0b030: 136 137 138 139 140 141 142 143 0xefc0b038: 144 145 146 147 148 149 150 151 Java Code static final int LENGTH = 1024 * 256; // 256K elements, 0x40000 static byte [] a = new byte[LENGTH]; static byte [] b = new byte[LENGTH]; static byte [] c = new byte[LENGTH]; Object Header data[0]
  • 46. ENGINEERS AND DEVICES WORKING TOGETHER (gdb) x/64u 0xefc0b000 0xefc0b000: 0 28 192 18 0 0 0 0 0xefc0b008: 0 0 4 0 100 101 102 103 0xefc0b010: 104 105 106 107 108 109 110 111 0xefc0b018: 112 113 114 115 116 117 118 119 0xefc0b020: 120 121 122 123 124 125 126 127 0xefc0b028: 128 129 130 131 132 133 134 135 0xefc0b030: 136 137 138 139 140 141 142 143 0xefc0b038: 144 145 146 147 148 149 150 151 One VecLoad / VecStore Java Code static final int LENGTH = 1024 * 256; // 256K elements, 0x40000 static byte [] a = new byte[LENGTH]; static byte [] b = new byte[LENGTH]; static byte [] c = new byte[LENGTH]; Object Header
  • 47. ENGINEERS AND DEVICES WORKING TOGETHER ● ○ ● ○ ○ ○ ● 0xefc0b000: 0 28 192 18 0 0 0 0 0xefc0b008: 0 0 4 0 100 101 102 103 0xefc0b010: 104 105 106 107 108 109 110 111 0xefc0b018: 112 113 114 115 116 117 118 119 0xefc0b020: 120 121 122 123 124 125 126 127 0xefc0b028: 128 129 130 131 132 133 134 135 0xefc0b030: 136 137 138 139 140 141 142 143 0xefc0b038: 144 145 146 147 148 149 150 151 SIMD from here-> Avoid SIMD from here
  • 49. ENGINEERS AND DEVICES WORKING TOGETHER ● ○ ● ● ○ ○
  • 50. ENGINEERS AND DEVICES WORKING TOGETHER ● ○ ○ ● ● ● ● ○
  • 51. ENGINEERS AND DEVICES WORKING TOGETHER ● ● ○ ○ ○ ○ ○ ○ ○ ● ○ ○ ○ Analyzable and flexible CHECKED! Embeddable CHECKED! Stable and reproducible CHECKED! Recognized CHECKED!
  • 52. ENGINEERS AND DEVICES WORKING TOGETHER ● ● ○ ○ ○ ● ○ ○ ○
  • 57. ENGINEERS AND DEVICES WORKING TOGETHER ● ○ ● ○ ● ○ ● ○ LDR q1, [x16] + LDR q2, [x16, #16] -> LDP q1, q2, [x16] ● ○
  • 58.
  • 59. ENGINEERS AND DEVICES WORKING TOGETHER ● ● ○ ● ○ ○
  • 60. Java Scalar version Initial SIMD Version void mul_add(int[] a, int[] b, int[] c) { for (int i=0; i<512; i++) { a[i] += a[i] * b[i]; } } L: cmp w0, #0x200 b.hs Exit add w4, w1, #0xc ldr w6, [x4, x0, lsl #2] add w5, w2, #0xc ldr w5, [x5, x0, lsl #2] madd w5, w6, w5, w6 str w5, [x4, x0, lsl #2] add w0, w0, #0x1 ldrh w16, [tr] cbz w16, L L: cmp w0, #0x200 b.hs Exit add w16, w1, #0xc add x16, x16, x0, lsl #2 ld1 {v0.2s}, [x16] add w16, w2, #0xc add x16, x16, x0, lsl #2 ld1 {v1.2s}, [x16] mul v1.2s, v0.2s, v1.2s add v0.2s, v0.2s, v1.2s add w16, w1, #0xc add x16, x16, x0, lsl #2 st1 {v0.2s}, [x16] add w0, w0, #0x2 ldrh w16, [tr] cbz w16, L