From 3b3d7cc47b9c21be4d004d2a53455f0ddee968a2 Mon Sep 17 00:00:00 2001
From: Guodong Xu <guodong.xu@linaro.org>
Date: Tue, 28 Dec 2021 17:32:39 +0800
Subject: [PATCH] Enable SVE in ISA-L erasure code for aarch64

This patch adds Arm (aarch64) SVE [1] variable-length vector assembly support
to the ISA-L erasure code library. "Arm designed the Scalable Vector Extension
(SVE) as a next-generation SIMD extension to AArch64. SVE allows flexible
vector length implementations with a range of possible values in CPU
implementations. The vector length can vary from a minimum of 128 bits up to
a maximum of 2048 bits, at 128-bit increments. The SVE design guarantees
that the same application can run on different implementations that support
SVE, without the need to recompile the code." [3]
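
The vector-length-agnostic style this enables can be seen in every _sve.S
file below: a 'whilelo' predicate guards all loads and stores, and 'incb'
advances the position by one hardware vector length. As a rough scalar C
model of that loop skeleton (illustrative only; process_vla and vl are not
names from this patch):

#include <stddef.h>

/* Scalar model of the SVE loop used by the _sve.S routines:
 *   whilelo p0.b, pos, len   -> which of the next vl lanes are valid
 *   ld1b/st1b governed by p0 -> only the valid lanes are touched
 *   incb pos                 -> advance by one vector length
 * The same binary works for vl = 16 (SVE-128) up to 256 (SVE-2048). */
static void process_vla(unsigned char *dst, const unsigned char *src,
			size_t len, size_t vl)
{
	for (size_t pos = 0; pos < len; pos += vl) {		/* incb x_pos */
		size_t n = (len - pos < vl) ? len - pos : vl;	/* whilelo */
		for (size_t i = 0; i < n; i++)			/* predicated lanes */
			dst[pos + i] = src[pos + i];		/* ld1b + st1b */
	}
}

Because the tail is handled by the predicate, no scalar remainder loop is
needed, regardless of the machine's vector length.
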
Test method:
- This patch was tested on Fujitsu's A64FX [2], and it passed all erasure
code related test cases, including "make checks", "make test", and
"make perf".
- To ensure code test coverage, parameters in the files (erasure_code/
erasure_code_test.c, erasure_code_update_test.c and gf_vect_mad_test.c)
were modified to cover all _vect versions of the _mad_sve() / _dot_prod_sve()
routines.
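
For reference, the _mad_sve() / _dot_prod_sve() routines all build on the
split-nibble table lookup ISA-L uses elsewhere: each 32-byte slice of gftbls
holds a 16-entry table for the low nibble followed by a 16-entry table for
the high nibble of the source byte, so a gf(2^8) multiply becomes two 'tbl'
lookups plus an 'eor'. A scalar C sketch of one byte of that operation
(gf_mul_byte and tbl32 are illustrative names, not part of the patch):

/* One byte of the gf(2^8) multiply done per lane by the tbl/tbl/eor
 * sequences in the _sve.S files. tbl32 points at the 32-byte table
 * slice for one (dest, src) pair inside gftbls. */
static inline unsigned char gf_mul_byte(const unsigned char *tbl32,
					unsigned char b)
{
	unsigned char lo = tbl32[b & 0x0f];		/* tbl with z_src_lo */
	unsigned char hi = tbl32[16 + (b >> 4)];	/* tbl with z_src_hi */
	return lo ^ hi;					/* eor, ie. gf(2^8) add */
}
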
Performance improvements over NEON:
In general, SVE benchmarks (bandwidth in MB/s) are 40% ~ 100% higher than NEON
when running _cold style (data uncached, pulled from memory) perfs. This
includes the dot_prod, mad, and mul routines.

Optimization points:
This patch was tuned for the best performance on A64FX. Tuning points
touched in this patch include:
1) Data prefetch into the L2 cache before loading. See the _sve.S files.
2) Instruction sequence orchestration, such as interleaving every two
'ld1b/st1b' instructions with other instructions. See the _sve.S files.
3) Improved dest vector parallelism at the high level: running
gf_4vect_dot_prod_sve() twice is better than running gf_8vect_dot_prod_sve()
once, and it is also better than running _7vect + _vect, _6vect + _2vect,
or _5vect + _3vect. The same idea is applied to the 9 ~ 11 dest vector
dot product computations as well; see the sketch below. The related change
can be found in ec_encode_data_sve() in the file:
erasure_code/aarch64/ec_aarch64_highlevel_func.c
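
Condensed from the ec_encode_data_sve() added below, the splitting strategy
reads roughly as follows (a sketch, not the full function body):

/* Each dest vector consumes k * 32 bytes of g_tbls. */
while (rows > 11) {
	gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding);
	g_tbls += 6 * k * 32;
	coding += 6;
	rows -= 6;
}
if (rows == 8) {	/* 4 + 4 beats one 8vect call */
	gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
	gf_4vect_dot_prod_sve(len, k, g_tbls + 4 * k * 32, data, coding + 4);
}
/* rows == 9, 10, 11 are split as 5 + 4, 6 + 4 and 7 + 4 in the same way */
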
Notes:
1) About vector length: A64FX has a vector register length of 512 bits.
However, this patchset was written in variable-length assembly, so it works
automatically on aarch64 machines with any SVE vector length, such as
SVE-128, SVE-256, etc.
2) About optimization: Due to differences in microarchitecture and
cache/memory design, reaching optimum performance on SVE-capable CPUs other
than A64FX will likely require microarchitecture-level tuning on those CPUs.
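
At runtime, the choice between the SVE, NEON, and base implementations is
made per interface from the HWCAP bits, as in the ec_aarch64_dispatcher.c
hunks below. Outside of ISA-L's DEFINE_INTERFACE_DISPATCHER() macros, the
same check reduces to roughly this (pick_impl is an illustrative name):

#include <sys/auxv.h>	/* getauxval, AT_HWCAP */
#include <asm/hwcap.h>	/* HWCAP_SVE, HWCAP_ASIMD; Linux/aarch64 only */

/* Standalone version of the selection logic used by the dispatcher. */
static const char *pick_impl(void)
{
	unsigned long auxval = getauxval(AT_HWCAP);

	if (auxval & HWCAP_SVE)
		return "sve";	/* the routines added by this patch */
	if (auxval & HWCAP_ASIMD)
		return "neon";
	return "base";
}
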
[1] Introduction to SVE - Arm Developer.
https://developer.arm.com/documentation/102476/latest/
[2] FUJITSU Processor A64FX.
https://www.fujitsu.com/global/products/computing/servers/supercomputer/a64fx/
[3] Introducing SVE.
https://developer.arm.com/documentation/102476/0001/Introducing-SVE

Change-Id: If49eb8a956154d799dcda0ba4c9c6d979f5064a9
Signed-off-by: Guodong Xu <guodong.xu@linaro.org>
---
 erasure_code/aarch64/Makefile.am              |  15 +
 erasure_code/aarch64/ec_aarch64_dispatcher.c  |  30 +-
 .../aarch64/ec_aarch64_highlevel_func.c       | 137 ++++++++
 erasure_code/aarch64/gf_2vect_dot_prod_sve.S  | 164 ++++++++++
 erasure_code/aarch64/gf_2vect_mad_sve.S       | 148 +++++++++
 erasure_code/aarch64/gf_3vect_dot_prod_sve.S  | 185 +++++++++++
 erasure_code/aarch64/gf_3vect_mad_sve.S       | 171 ++++++++++
 erasure_code/aarch64/gf_4vect_dot_prod_sve.S  | 204 ++++++++++++
 erasure_code/aarch64/gf_4vect_mad_sve.S       | 190 +++++++++++
 erasure_code/aarch64/gf_5vect_dot_prod_sve.S  | 233 ++++++++++++++
 erasure_code/aarch64/gf_5vect_mad_sve.S       | 214 +++++++++++++
 erasure_code/aarch64/gf_6vect_dot_prod_sve.S  | 254 +++++++++++++++
 erasure_code/aarch64/gf_6vect_mad_sve.S       | 233 ++++++++++++++
 erasure_code/aarch64/gf_7vect_dot_prod_sve.S  | 277 ++++++++++++++++
 erasure_code/aarch64/gf_8vect_dot_prod_sve.S  | 303 ++++++++++++++++++
 erasure_code/aarch64/gf_vect_dot_prod_sve.S   | 128 ++++++++
 erasure_code/aarch64/gf_vect_mad_sve.S        | 123 +++++++
 erasure_code/aarch64/gf_vect_mul_sve.S        | 117 +++++++
 18 files changed, 3121 insertions(+), 5 deletions(-)
 create mode 100644 erasure_code/aarch64/gf_2vect_dot_prod_sve.S
 create mode 100644 erasure_code/aarch64/gf_2vect_mad_sve.S
 create mode 100644 erasure_code/aarch64/gf_3vect_dot_prod_sve.S
 create mode 100644 erasure_code/aarch64/gf_3vect_mad_sve.S
 create mode 100644 erasure_code/aarch64/gf_4vect_dot_prod_sve.S
 create mode 100644 erasure_code/aarch64/gf_4vect_mad_sve.S
 create mode 100644 erasure_code/aarch64/gf_5vect_dot_prod_sve.S
 create mode 100644 erasure_code/aarch64/gf_5vect_mad_sve.S
 create mode 100644 erasure_code/aarch64/gf_6vect_dot_prod_sve.S
 create mode 100644 erasure_code/aarch64/gf_6vect_mad_sve.S
 create mode 100644 erasure_code/aarch64/gf_7vect_dot_prod_sve.S
 create mode 100644 erasure_code/aarch64/gf_8vect_dot_prod_sve.S
 create mode 100644 erasure_code/aarch64/gf_vect_dot_prod_sve.S
 create mode 100644 erasure_code/aarch64/gf_vect_mad_sve.S
 create mode 100644 erasure_code/aarch64/gf_vect_mul_sve.S

diff --git a/erasure_code/aarch64/Makefile.am b/erasure_code/aarch64/Makefile.am
index 94bb5a13..47bbf12d 100644
--- a/erasure_code/aarch64/Makefile.am
+++ b/erasure_code/aarch64/Makefile.am
@@ -42,4 +42,19 @@ lsrc_aarch64 += \
 	erasure_code/aarch64/gf_5vect_mad_neon.S \
 	erasure_code/aarch64/gf_6vect_mad_neon.S \
 	erasure_code/aarch64/gf_vect_mul_neon.S \
+	erasure_code/aarch64/gf_vect_mad_sve.S \
+	erasure_code/aarch64/gf_2vect_mad_sve.S \
+	erasure_code/aarch64/gf_3vect_mad_sve.S \
+	erasure_code/aarch64/gf_4vect_mad_sve.S \
+	erasure_code/aarch64/gf_5vect_mad_sve.S \
+	erasure_code/aarch64/gf_6vect_mad_sve.S \
+	erasure_code/aarch64/gf_vect_dot_prod_sve.S \
+	erasure_code/aarch64/gf_2vect_dot_prod_sve.S \
+	erasure_code/aarch64/gf_3vect_dot_prod_sve.S \
+	erasure_code/aarch64/gf_4vect_dot_prod_sve.S \
+	erasure_code/aarch64/gf_5vect_dot_prod_sve.S \
+	erasure_code/aarch64/gf_6vect_dot_prod_sve.S \
+	erasure_code/aarch64/gf_7vect_dot_prod_sve.S \
+	erasure_code/aarch64/gf_8vect_dot_prod_sve.S \
+	erasure_code/aarch64/gf_vect_mul_sve.S \
 	erasure_code/aarch64/ec_multibinary_arm.S
diff --git a/erasure_code/aarch64/ec_aarch64_dispatcher.c b/erasure_code/aarch64/ec_aarch64_dispatcher.c
index ba663478..42bd7802 100644
--- a/erasure_code/aarch64/ec_aarch64_dispatcher.c
+++ b/erasure_code/aarch64/ec_aarch64_dispatcher.c
@@ -30,7 +30,11 @@
 
 DEFINE_INTERFACE_DISPATCHER(gf_vect_dot_prod)
 {
-	if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
+	unsigned long auxval = getauxval(AT_HWCAP);
+
+	if (auxval & HWCAP_SVE)
+		return PROVIDER_INFO(gf_vect_dot_prod_sve);
+	if (auxval & HWCAP_ASIMD)
 		return PROVIDER_INFO(gf_vect_dot_prod_neon);
 	return PROVIDER_BASIC(gf_vect_dot_prod);
 }
@@ -38,7 +42,11 @@ DEFINE_INTERFACE_DISPATCHER(gf_vect_dot_prod)
 
 DEFINE_INTERFACE_DISPATCHER(gf_vect_mad)
 {
-	if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
+	unsigned long auxval = getauxval(AT_HWCAP);
+
+	if (auxval & HWCAP_SVE)
+		return PROVIDER_INFO(gf_vect_mad_sve);
+	if (auxval & HWCAP_ASIMD)
 		return PROVIDER_INFO(gf_vect_mad_neon);
 	return PROVIDER_BASIC(gf_vect_mad);
 }
@@ -46,7 +54,11 @@ DEFINE_INTERFACE_DISPATCHER(gf_vect_mad)
 
 DEFINE_INTERFACE_DISPATCHER(ec_encode_data)
 {
-	if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
+	unsigned long auxval = getauxval(AT_HWCAP);
+
+	if (auxval & HWCAP_SVE)
+		return PROVIDER_INFO(ec_encode_data_sve);
+	if (auxval & HWCAP_ASIMD)
 		return PROVIDER_INFO(ec_encode_data_neon);
 	return PROVIDER_BASIC(ec_encode_data);
 }
@@ -54,7 +66,11 @@ DEFINE_INTERFACE_DISPATCHER(ec_encode_data)
 
 DEFINE_INTERFACE_DISPATCHER(ec_encode_data_update)
 {
-	if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
+	unsigned long auxval = getauxval(AT_HWCAP);
+
+	if (auxval & HWCAP_SVE)
+		return PROVIDER_INFO(ec_encode_data_update_sve);
+	if (auxval & HWCAP_ASIMD)
 		return PROVIDER_INFO(ec_encode_data_update_neon);
 	return PROVIDER_BASIC(ec_encode_data_update);
 }
@@ -62,7 +78,11 @@ DEFINE_INTERFACE_DISPATCHER(ec_encode_data_update)
 
 DEFINE_INTERFACE_DISPATCHER(gf_vect_mul)
 {
-	if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
+	unsigned long auxval = getauxval(AT_HWCAP);
+
+	if (auxval & HWCAP_SVE)
+		return PROVIDER_INFO(gf_vect_mul_sve);
+	if (auxval & HWCAP_ASIMD)
 		return PROVIDER_INFO(gf_vect_mul_neon);
 	return PROVIDER_BASIC(gf_vect_mul);
 }
diff --git a/erasure_code/aarch64/ec_aarch64_highlevel_func.c b/erasure_code/aarch64/ec_aarch64_highlevel_func.c
index dd23702c..e001fd72 100644
--- a/erasure_code/aarch64/ec_aarch64_highlevel_func.c
+++ b/erasure_code/aarch64/ec_aarch64_highlevel_func.c
@@ -125,3 +125,140 @@ void ec_encode_data_update_neon(int len, int k, int rows, int vec_i, unsigned ch
 		break;
 	}
 }
+
+/* SVE */
+extern void gf_vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				 unsigned char **src, unsigned char *dest);
+extern void gf_2vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				  unsigned char **src, unsigned char **dest);
+extern void gf_3vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				  unsigned char **src, unsigned char **dest);
+extern void gf_4vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				  unsigned char **src, unsigned char **dest);
+extern void gf_5vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				  unsigned char **src, unsigned char **dest);
+extern void gf_6vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				  unsigned char **src, unsigned char **dest);
+extern void gf_7vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				  unsigned char **src, unsigned char **dest);
+extern void gf_8vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				  unsigned char **src, unsigned char **dest);
+extern void gf_vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+			    unsigned char *src, unsigned char *dest);
+extern void gf_2vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+			     unsigned char *src, unsigned char **dest);
+extern void gf_3vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+			     unsigned char *src, unsigned char **dest);
+extern void gf_4vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+			     unsigned char *src, unsigned char **dest);
+extern void gf_5vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+			     unsigned char *src, unsigned char **dest);
+extern void gf_6vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+			     unsigned char *src, unsigned char **dest);
+
+void ec_encode_data_sve(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
+			unsigned char **coding)
+{
+	if (len < 16) {
+		ec_encode_data_base(len, k, rows, g_tbls, data, coding);
+		return;
+	}
+
+	while (rows > 11) {
+		gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		g_tbls += 6 * k * 32;
+		coding += 6;
+		rows -= 6;
+	}
+
+	switch (rows) {
+	case 11:
+		/* 7 + 4 */
+		gf_7vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		g_tbls += 7 * k * 32;
+		coding += 7;
+		gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 10:
+		/* 6 + 4 */
+		gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		g_tbls += 6 * k * 32;
+		coding += 6;
+		gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 9:
+		/* 5 + 4 */
+		gf_5vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		g_tbls += 5 * k * 32;
+		coding += 5;
+		gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 8:
+		/* 4 + 4 */
+		gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		g_tbls += 4 * k * 32;
+		coding += 4;
+		gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 7:
+		gf_7vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 6:
+		gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 5:
+		gf_5vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 4:
+		gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 3:
+		gf_3vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 2:
+		gf_2vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 1:
+		gf_vect_dot_prod_sve(len, k, g_tbls, data, *coding);
+		break;
+	default:
+		break;
+	}
+}
+
+void ec_encode_data_update_sve(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+			       unsigned char *data, unsigned char **coding)
+{
+	if (len < 16) {
+		ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
+		return;
+	}
+	while (rows > 6) {
+		gf_6vect_mad_sve(len, k, vec_i, g_tbls, data, coding);
+		g_tbls += 6 * k * 32;
+		coding += 6;
+		rows -= 6;
+	}
+	switch (rows) {
+	case 6:
+		gf_6vect_mad_sve(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 5:
+		gf_5vect_mad_sve(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 4:
+		gf_4vect_mad_sve(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 3:
+		gf_3vect_mad_sve(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 2:
+		gf_2vect_mad_sve(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 1:
+		gf_vect_mad_sve(len, k, vec_i, g_tbls, data, *coding);
+		break;
+	default:
+		break;
+	}
+}
diff --git a/erasure_code/aarch64/gf_2vect_dot_prod_sve.S b/erasure_code/aarch64/gf_2vect_dot_prod_sve.S
new file mode 100644
index 00000000..abe50833
--- /dev/null
+++ b/erasure_code/aarch64/gf_2vect_dot_prod_sve.S
@@ -0,0 +1,164 @@
+/*************************************************************
+  Copyright (c) 2021 Linaro Ltd.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Huawei Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+.global gf_2vect_dot_prod_sve
+.type gf_2vect_dot_prod_sve, %function
+/* void gf_2vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+			      unsigned char **src, unsigned char **dest);
+ */
+
+/* arguments */
+x_len		.req	x0	/* vector length */
+x_vec		.req	x1	/* number of source vectors (ie. data blocks) */
+x_tbl		.req	x2
+x_src		.req	x3
+x_dest		.req	x4
+
+/* returns */
+w_ret		.req	w0
+
+/* local variables */
+x_vec_i		.req	x5
+x_ptr		.req	x6
+x_pos		.req	x7
+
+x_tbl1		.req	x8
+x_tbl2		.req	x9
+x_dest1		.req	x10
+x_dest2		.req	x_dest	/* reused */
+
+/* r16,r17,r18,r29,r30: special role registers, avoided */
+/* r19..r29 and SP must be preserved */
+
+/* vectors */
+z_mask0f	.req	z0
+
+z_src		.req	z1
+z_src_lo	.req	z2
+z_src_hi	.req	z_src
+
+z_dest1		.req	z3
+
+z_gft1_lo	.req	z4
+z_gft1_hi	.req	z5
+q_gft1_lo	.req	q4
+q_gft1_hi	.req	q5
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo	.req	z17
+z_gft2_hi	.req	z18
+q_gft2_lo	.req	q17
+q_gft2_hi	.req	q18
+
+z_dest2		.req	z27
+
+gf_2vect_dot_prod_sve:
+	/* less than 16 bytes, return_fail */
+	cmp	x_len, #16
+	blt	.return_fail
+
+	mov	z_mask0f.b, #0x0f	/* z_mask0f = 0x0F0F...0F */
+	mov	x_pos, #0
+	lsl	x_vec, x_vec, #3
+	ldp	x_dest1, x_dest2, [x_dest, #8*0]
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+	whilelo	p0.b, x_pos, x_len
+	b.none	.return_pass
+
+	mov	x_vec_i, #0		/* clear x_vec_i */
+	ldr	x_ptr, [x_src, x_vec_i]	/* x_ptr: src base addr. */
+
+	mov	z_dest1.b, #0		/* clear z_dest1 */
+	mov	z_dest2.b, #0		/* clear z_dest2 */
+
+	/* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
+	mov	x_tbl1, x_tbl		/* reset x_tbl1 */
+	add	x_tbl2, x_tbl1, x_vec, LSL #2	/* reset x_tbl2 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+	/* load src data, governed by p0 */
+	ld1b	z_src.b, p0/z, [x_ptr, x_pos]	/* load from: src base + pos offset */
+	/* split 4-bit lo; 4-bit hi */
+	and	z_src_lo.d, z_src.d, z_mask0f.d
+	lsr	z_src_hi.b, z_src.b, #4
+
+
+	/* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
+	/* load gf_table's */
+	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32	/* x_tbl1 is post-added by #32 for each src vect */
+	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+	/* prefetch */
+	prfb	pldl2keep, p0, [x_tbl1]
+	prfb	pldl2keep, p0, [x_tbl2]
+
+	/* calc for next */
+	add	x_vec_i, x_vec_i, #8	/* move x_vec_i to next */
+	ldr	x_ptr, [x_src, x_vec_i]	/* x_ptr: src base addr. */
+
+	/* dest 1 */
+	/* table indexing, ie. gf(2^8) multiplication */
+	tbl	z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+	tbl	z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+	/* exclusive or, ie. gf(2^8) add */
+	eor	z_dest1.d, z_gft1_lo.d, z_dest1.d
+	eor	z_dest1.d, z_dest1.d, z_gft1_hi.d
+
+	/* dest 2 */
+	tbl	z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+	tbl	z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+	eor	z_dest2.d, z_gft2_lo.d, z_dest2.d
+	eor	z_dest2.d, z_dest2.d, z_gft2_hi.d
+
+	cmp	x_vec_i, x_vec
+	blt	.Lloopsve_vl_vects
+/* end of Loop 2 */
+
+	/* store dest data, governed by p0 */
+	st1b	z_dest1.b, p0, [x_dest1, x_pos]
+	st1b	z_dest2.b, p0, [x_dest2, x_pos]
+
+	/* increment one vector length */
+	incb	x_pos
+	b	.Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+	mov	w_ret, #0
+	ret
+
+.return_fail:
+	mov	w_ret, #1
+	ret
diff --git a/erasure_code/aarch64/gf_2vect_mad_sve.S b/erasure_code/aarch64/gf_2vect_mad_sve.S
new file mode 100644
index 00000000..5e832109
--- /dev/null
+++ b/erasure_code/aarch64/gf_2vect_mad_sve.S
@@ -0,0 +1,148 @@
+/**************************************************************
+  Copyright (c) 2021 Linaro Ltd.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Huawei Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+.global gf_2vect_mad_sve
+.type gf_2vect_mad_sve, %function
+
+/* gf_2vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+		    unsigned char *src, unsigned char **dest);
+ */
+/* arguments */
+x_len		.req	x0
+x_vec		.req	x1
+x_vec_i		.req	x2
+x_tbl		.req	x3
+x_src		.req	x4
+x_dest		.req	x5
+
+/* returns */
+w_ret		.req	w0
+
+/* local variables */
+x_pos		.req	x6
+x_dest2		.req	x7
+x_dest1		.req	x12
+
+/* vectors */
+z_mask0f	.req	z0
+
+z_src		.req	z1
+z_src_lo	.req	z2
+z_src_hi	.req	z_src
+
+z_dest1		.req	z3
+
+z_tmp_lo	.req	z4
+z_tmp_hi	.req	z5
+
+z_gft1_lo	.req	z6
+z_gft1_hi	.req	z7
+q_gft1_lo	.req	q6
+q_gft1_hi	.req	q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo	.req	z17
+z_gft2_hi	.req	z18
+q_gft2_lo	.req	q17
+q_gft2_hi	.req	q18
+
+z_dest2		.req	z27
+
+gf_2vect_mad_sve:
+	/* less than 16 bytes, return_fail */
+	cmp	x_len, #16
+	blt	.return_fail
+
+	mov	z_mask0f.b, #0x0f	/* z_mask0f = 0x0F0F...0F */
+	/* load table 1 */
+	add	x_tbl, x_tbl, x_vec_i, LSL #5	/* x_tbl += x_vec_i * 2^5 */
+
+	/* Load table 1 with NEON instruction ldp */
+	ldp	q_gft1_lo, q_gft1_hi, [x_tbl]
+	/* load table 2 */
+	add	x_tbl, x_tbl, x_vec, LSL #5	/* x_tbl += x_vec * 2^5 */
+	ldp	q_gft2_lo, q_gft2_hi, [x_tbl]
+
+	ldr	x_dest1, [x_dest, #8*0]	/* pointer to dest1 */
+	ldr	x_dest2, [x_dest, #8*1]	/* pointer to dest2 */
+
+	mov	x_pos, #0
+
+	/* vector length agnostic */
+.Lloopsve_vl:
+	whilelo	p0.b, x_pos, x_len
+	b.none	.return_pass
+
+	/* prefetch dest data */
+	prfb	pldl2strm, p0, [x_dest1, x_pos]
+	prfb	pldl2strm, p0, [x_dest2, x_pos]
+
+	/* load src data, governed by p0 */
+	ld1b	z_src.b, p0/z, [x_src, x_pos]
+
+	/* split 4-bit lo; 4-bit hi */
+	and	z_src_lo.d, z_src.d, z_mask0f.d
+	lsr	z_src_hi.b, z_src.b, #4
+
+	/* load dest data, governed by p0 */
+	ld1b	z_dest1.b, p0/z, [x_dest1, x_pos]
+	ld1b	z_dest2.b, p0/z, [x_dest2, x_pos]
+
+	/* dest1 */
+	/* table indexing, ie. gf(2^8) multiplication */
+	tbl	z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b
+	/* exclusive or, ie. gf(2^8) add */
+	eor	z_dest1.d, z_tmp_lo.d, z_dest1.d
+	eor	z_dest1.d, z_tmp_hi.d, z_dest1.d
+
+	/* dest2 */
+	tbl	z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b
+	eor	z_dest2.d, z_tmp_lo.d, z_dest2.d
+	eor	z_dest2.d, z_tmp_hi.d, z_dest2.d
+
+	/* store dest data, governed by p0 */
+	st1b	z_dest1.b, p0, [x_dest1, x_pos]
+	st1b	z_dest2.b, p0, [x_dest2, x_pos]
+	/* increment one vector length */
+	incb	x_pos
+
+	b	.Lloopsve_vl
+
+.return_pass:
+	mov	w_ret, #0
+	ret
+
+.return_fail:
+	mov	w_ret, #1
+	ret
diff --git a/erasure_code/aarch64/gf_3vect_dot_prod_sve.S b/erasure_code/aarch64/gf_3vect_dot_prod_sve.S
new file mode 100644
index 00000000..b326c72c
--- /dev/null
+++ b/erasure_code/aarch64/gf_3vect_dot_prod_sve.S
@@ -0,0 +1,185 @@
+/*************************************************************
+  Copyright (c) 2021 Linaro Ltd.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Huawei Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+.global gf_3vect_dot_prod_sve
+.type gf_3vect_dot_prod_sve, %function
+/* void gf_3vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+			      unsigned char **src, unsigned char **dest);
+ */
+
+/* arguments */
+x_len		.req	x0	/* vector length */
+x_vec		.req	x1	/* number of source vectors (ie. data blocks) */
+x_tbl		.req	x2
+x_src		.req	x3
+x_dest		.req	x4
+
+/* returns */
+w_ret		.req	w0
+
+/* local variables */
+x_vec_i		.req	x5
+x_ptr		.req	x6
+x_pos		.req	x7
+
+x_tbl1		.req	x8
+x_tbl2		.req	x9
+x_tbl3		.req	x10
+x_dest1		.req	x11
+x_dest2		.req	x12
+x_dest3		.req	x_dest	/* reused */
+
+/* r16,r17,r18,r29,r30: special role registers, avoided */
+/* r19..r29 and SP must be preserved */
+
+/* vectors */
+z_mask0f	.req	z0
+
+z_src		.req	z1
+z_src_lo	.req	z2
+z_src_hi	.req	z_src
+
+z_dest1		.req	z3
+
+z_gft1_lo	.req	z4
+z_gft1_hi	.req	z5
+q_gft1_lo	.req	q4
+q_gft1_hi	.req	q5
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo	.req	z17
+z_gft2_hi	.req	z18
+q_gft2_lo	.req	q17
+q_gft2_hi	.req	q18
+
+z_gft3_lo	.req	z19
+z_gft3_hi	.req	z20
+q_gft3_lo	.req	q19
+q_gft3_hi	.req	q20
+
+z_dest2		.req	z27
+z_dest3		.req	z28
+
+gf_3vect_dot_prod_sve:
+	/* less than 16 bytes, return_fail */
+	cmp	x_len, #16
+	blt	.return_fail
+
+	mov	z_mask0f.b, #0x0f	/* z_mask0f = 0x0F0F...0F */
+	mov	x_pos, #0
+	lsl	x_vec, x_vec, #3
+	ldp	x_dest1, x_dest2, [x_dest, #8*0]
+	ldr	x_dest3, [x_dest, #8*2]
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+	whilelo	p0.b, x_pos, x_len
+	b.none	.return_pass
+
+	mov	x_vec_i, #0		/* clear x_vec_i */
+	ldr	x_ptr, [x_src, x_vec_i]	/* x_ptr: src base addr. */
+
+	mov	z_dest1.b, #0		/* clear z_dest1 */
+	mov	z_dest2.b, #0		/* clear z_dest2 */
+	mov	z_dest3.b, #0		/* clear z_dest3 */
+
+	/* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
+	mov	x_tbl1, x_tbl		/* reset x_tbl1 */
+	add	x_tbl2, x_tbl1, x_vec, LSL #2	/* reset x_tbl2 */
+	add	x_tbl3, x_tbl2, x_vec, LSL #2	/* reset x_tbl3 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+	/* load src data, governed by p0 */
+	ld1b	z_src.b, p0/z, [x_ptr, x_pos]	/* load from: src base + pos offset */
+	/* split 4-bit lo; 4-bit hi */
+	and	z_src_lo.d, z_src.d, z_mask0f.d
+	lsr	z_src_hi.b, z_src.b, #4
+
+
+	/* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
+	/* load gf_table's */
+	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32	/* x_tbl1 is post-added by #32 for each src vect */
+	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+	/* prefetch */
+	prfb	pldl2keep, p0, [x_tbl1]
+	prfb	pldl2keep, p0, [x_tbl2]
+
+	/* calc for next */
+	add	x_vec_i, x_vec_i, #8	/* move x_vec_i to next */
+	ldr	x_ptr, [x_src, x_vec_i]	/* x_ptr: src base addr. */
+
+	/* dest 1 */
+	/* table indexing, ie. gf(2^8) multiplication */
+	tbl	z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+	tbl	z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+	/* exclusive or, ie. gf(2^8) add */
+	eor	z_dest1.d, z_gft1_lo.d, z_dest1.d
+	eor	z_dest1.d, z_dest1.d, z_gft1_hi.d
+
+	ldp	q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+	prfb	pldl2keep, p0, [x_tbl3]
+
+	/* dest 2 */
+	tbl	z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+	tbl	z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+	eor	z_dest2.d, z_gft2_lo.d, z_dest2.d
+	eor	z_dest2.d, z_dest2.d, z_gft2_hi.d
+
+	/* dest 3 */
+	tbl	z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b
+	tbl	z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b
+	eor	z_dest3.d, z_gft3_lo.d, z_dest3.d
+	eor	z_dest3.d, z_dest3.d, z_gft3_hi.d
+
+	cmp	x_vec_i, x_vec
+	blt	.Lloopsve_vl_vects
+/* end of Loop 2 */
+
+	/* store dest data, governed by p0 */
+	st1b	z_dest1.b, p0, [x_dest1, x_pos]
+	st1b	z_dest2.b, p0, [x_dest2, x_pos]
+	st1b	z_dest3.b, p0, [x_dest3, x_pos]
+
+	/* increment one vector length */
+	incb	x_pos
+	b	.Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+	mov	w_ret, #0
+	ret
+
+.return_fail:
+	mov	w_ret, #1
+	ret
diff --git a/erasure_code/aarch64/gf_3vect_mad_sve.S b/erasure_code/aarch64/gf_3vect_mad_sve.S
new file mode 100644
index 00000000..52c2ffc5
--- /dev/null
+++ b/erasure_code/aarch64/gf_3vect_mad_sve.S
@@ -0,0 +1,171 @@
+/**************************************************************
+  Copyright (c) 2021 Linaro Ltd.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Huawei Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+.global gf_3vect_mad_sve
+.type gf_3vect_mad_sve, %function
+
+/* gf_3vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+		    unsigned char *src, unsigned char **dest);
+ */
+/* arguments */
+x_len		.req	x0
+x_vec		.req	x1
+x_vec_i		.req	x2
+x_tbl		.req	x3
+x_src		.req	x4
+x_dest		.req	x5
+
+/* returns */
+w_ret		.req	w0
+
+/* local variables */
+x_pos		.req	x6
+x_dest2		.req	x7
+x_dest3		.req	x8
+x_dest1		.req	x12
+
+/* vectors */
+z_mask0f	.req	z0
+
+z_src		.req	z1
+z_src_lo	.req	z2
+z_src_hi	.req	z_src
+
+z_dest1		.req	z3
+
+z_tmp_lo	.req	z4
+z_tmp_hi	.req	z5
+
+z_gft1_lo	.req	z6
+z_gft1_hi	.req	z7
+q_gft1_lo	.req	q6
+q_gft1_hi	.req	q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo	.req	z17
+z_gft2_hi	.req	z18
+q_gft2_lo	.req	q17
+q_gft2_hi	.req	q18
+
+z_gft3_lo	.req	z19
+z_gft3_hi	.req	z20
+q_gft3_lo	.req	q19
+q_gft3_hi	.req	q20
+
+z_dest2		.req	z27
+z_dest3		.req	z28
+
+gf_3vect_mad_sve:
+	/* less than 16 bytes, return_fail */
+	cmp	x_len, #16
+	blt	.return_fail
+
+	mov	z_mask0f.b, #0x0f	/* z_mask0f = 0x0F0F...0F */
+	/* load table 1 */
+	add	x_tbl, x_tbl, x_vec_i, LSL #5	/* x_tbl += x_vec_i * 2^5 */
+
+	/* Load table 1 with NEON instruction ldp */
+	ldp	q_gft1_lo, q_gft1_hi, [x_tbl]
+	/* load table 2 */
+	add	x_tbl, x_tbl, x_vec, LSL #5	/* x_tbl += x_vec * 2^5 */
+	ldp	q_gft2_lo, q_gft2_hi, [x_tbl]
+	/* load table 3 */
+	add	x_tbl, x_tbl, x_vec, LSL #5	/* x_tbl += x_vec * 2^5 */
+	ldp	q_gft3_lo, q_gft3_hi, [x_tbl]
+
+	ldr	x_dest1, [x_dest, #8*0]	/* pointer to dest1 */
+	ldr	x_dest2, [x_dest, #8*1]	/* pointer to dest2 */
+	ldr	x_dest3, [x_dest, #8*2]	/* pointer to dest3 */
+
+	mov	x_pos, #0
+
+	/* vector length agnostic */
+.Lloopsve_vl:
+	whilelo	p0.b, x_pos, x_len
+	b.none	.return_pass
+
+	/* dest data prefetch */
+	prfb	pldl2strm, p0, [x_dest1, x_pos]
+	prfb	pldl2strm, p0, [x_dest2, x_pos]
+
+	/* load src data, governed by p0 */
+	ld1b	z_src.b, p0/z, [x_src, x_pos]
+
+	/* split 4-bit lo; 4-bit hi */
+	and	z_src_lo.d, z_src.d, z_mask0f.d
+	lsr	z_src_hi.b, z_src.b, #4
+
+	/* load dest data, governed by p0 */
+	ld1b	z_dest1.b, p0/z, [x_dest1, x_pos]
+	ld1b	z_dest2.b, p0/z, [x_dest2, x_pos]
+	prfb	pldl2strm, p0, [x_dest3, x_pos]
+
+	/* dest1 */
+	/* table indexing, ie. gf(2^8) multiplication */
+	tbl	z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b
+	/* exclusive or, ie. gf(2^8) add */
+	eor	z_dest1.d, z_tmp_lo.d, z_dest1.d
+	eor	z_dest1.d, z_tmp_hi.d, z_dest1.d
+
+	/* dest2 */
+	tbl	z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b
+
+	ld1b	z_dest3.b, p0/z, [x_dest3, x_pos]
+	/* store dest data, governed by p0 */
+	st1b	z_dest1.b, p0, [x_dest1, x_pos]
+
+	eor	z_dest2.d, z_tmp_lo.d, z_dest2.d
+	eor	z_dest2.d, z_tmp_hi.d, z_dest2.d
+
+	/* dest3 */
+	tbl	z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b
+	eor	z_dest3.d, z_tmp_lo.d, z_dest3.d
+	eor	z_dest3.d, z_tmp_hi.d, z_dest3.d
+
+	/* store dest data, governed by p0 */
+	st1b	z_dest2.b, p0, [x_dest2, x_pos]
+	st1b	z_dest3.b, p0, [x_dest3, x_pos]
+	/* increment one vector length */
+	incb	x_pos
+
+	b	.Lloopsve_vl
+
+.return_pass:
+	mov	w_ret, #0
+	ret
+
+.return_fail:
+	mov	w_ret, #1
+	ret
diff --git a/erasure_code/aarch64/gf_4vect_dot_prod_sve.S b/erasure_code/aarch64/gf_4vect_dot_prod_sve.S
new file mode 100644
index 00000000..ae7cdcbe
--- /dev/null
+++ b/erasure_code/aarch64/gf_4vect_dot_prod_sve.S
@@ -0,0 +1,204 @@
+/*************************************************************
+  Copyright (c) 2021 Linaro Ltd.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Huawei Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+.global gf_4vect_dot_prod_sve
+.type gf_4vect_dot_prod_sve, %function
+/* void gf_4vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+			      unsigned char **src, unsigned char **dest);
+ */
+
+/* arguments */
+x_len		.req	x0	/* vector length */
+x_vec		.req	x1	/* number of source vectors (ie. data blocks) */
+x_tbl		.req	x2
+x_src		.req	x3
+x_dest		.req	x4
+
+/* returns */
+w_ret		.req	w0
+
+/* local variables */
+x_vec_i		.req	x5
+x_ptr		.req	x6
+x_pos		.req	x7
+
+x_tbl1		.req	x8
+x_tbl2		.req	x9
+x_tbl3		.req	x10
+x_tbl4		.req	x11
+x_dest1		.req	x12
+x_dest2		.req	x13
+x_dest3		.req	x14
+x_dest4		.req	x_dest	/* reused */
+
+/* r16,r17,r18,r29,r30: special role registers, avoided */
+/* r19..r29 and SP must be preserved */
+
+/* vectors */
+z_mask0f	.req	z0
+
+z_src		.req	z1
+z_src_lo	.req	z2
+z_src_hi	.req	z_src
+
+z_dest1		.req	z3
+
+z_gft1_lo	.req	z4
+z_gft1_hi	.req	z5
+q_gft1_lo	.req	q4
+q_gft1_hi	.req	q5
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo	.req	z17
+z_gft2_hi	.req	z18
+q_gft2_lo	.req	q17
+q_gft2_hi	.req	q18
+
+z_gft3_lo	.req	z19
+z_gft3_hi	.req	z20
+q_gft3_lo	.req	q19
+q_gft3_hi	.req	q20
+
+z_gft4_lo	.req	z21
+z_gft4_hi	.req	z22
+q_gft4_lo	.req	q21
+q_gft4_hi	.req	q22
+
+z_dest2		.req	z27
+z_dest3		.req	z28
+z_dest4		.req	z29
+
+gf_4vect_dot_prod_sve:
+	/* less than 16 bytes, return_fail */
+	cmp	x_len, #16
+	blt	.return_fail
+
+	mov	z_mask0f.b, #0x0f	/* z_mask0f = 0x0F0F...0F */
+	mov	x_pos, #0
+	lsl	x_vec, x_vec, #3
+	ldp	x_dest1, x_dest2, [x_dest, #8*0]
+	ldp	x_dest3, x_dest4, [x_dest, #8*2]
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+	whilelo	p0.b, x_pos, x_len
+	b.none	.return_pass
+
+	mov	x_vec_i, #0		/* clear x_vec_i */
+	ldr	x_ptr, [x_src, x_vec_i]	/* x_ptr: src base addr. */
+
+	mov	z_dest1.b, #0		/* clear z_dest1 */
+	mov	z_dest2.b, #0		/* clear z_dest2 */
+	mov	z_dest3.b, #0		/* clear z_dest3 */
+	mov	z_dest4.b, #0		/* clear z_dest4 */
+
+	/* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
+	mov	x_tbl1, x_tbl		/* reset x_tbl1 */
+	add	x_tbl2, x_tbl1, x_vec, LSL #2	/* reset x_tbl2 */
+	add	x_tbl3, x_tbl2, x_vec, LSL #2	/* reset x_tbl3 */
+	add	x_tbl4, x_tbl3, x_vec, LSL #2	/* reset x_tbl4 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+	/* load src data, governed by p0 */
+	ld1b	z_src.b, p0/z, [x_ptr, x_pos]	/* load from: src base + pos offset */
+	/* split 4-bit lo; 4-bit hi */
+	and	z_src_lo.d, z_src.d, z_mask0f.d
+	lsr	z_src_hi.b, z_src.b, #4
+
+
+	/* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
+	/* load gf_table's */
+	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32	/* x_tbl1 is post-added by #32 for each src vect */
+	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+	/* prefetch */
+	prfb	pldl2keep, p0, [x_tbl1]
+	prfb	pldl2keep, p0, [x_tbl2]
+
+	/* calc for next */
+	add	x_vec_i, x_vec_i, #8	/* move x_vec_i to next */
+	ldr	x_ptr, [x_src, x_vec_i]	/* x_ptr: src base addr. */
+
+	/* dest 1 */
+	/* table indexing, ie. gf(2^8) multiplication */
+	tbl	z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+	tbl	z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+	/* exclusive or, ie. gf(2^8) add */
+	eor	z_dest1.d, z_gft1_lo.d, z_dest1.d
+	eor	z_dest1.d, z_dest1.d, z_gft1_hi.d
+
+	ldp	q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+	ldp	q_gft4_lo, q_gft4_hi, [x_tbl4], #32
+	prfb	pldl2keep, p0, [x_tbl3]
+	prfb	pldl2keep, p0, [x_tbl4]
+
+	/* dest 2 */
+	tbl	z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+	tbl	z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+	eor	z_dest2.d, z_gft2_lo.d, z_dest2.d
+	eor	z_dest2.d, z_dest2.d, z_gft2_hi.d
+
+	/* dest 3 */
+	tbl	z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b
+	tbl	z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b
+	eor	z_dest3.d, z_gft3_lo.d, z_dest3.d
+	eor	z_dest3.d, z_dest3.d, z_gft3_hi.d
+
+	/* dest 4 */
+	tbl	z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b
+	tbl	z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b
+	eor	z_dest4.d, z_gft4_lo.d, z_dest4.d
+	eor	z_dest4.d, z_dest4.d, z_gft4_hi.d
+
+	cmp	x_vec_i, x_vec
+	blt	.Lloopsve_vl_vects
+/* end of Loop 2 */
+
+	/* store dest data, governed by p0 */
+	st1b	z_dest1.b, p0, [x_dest1, x_pos]
+	st1b	z_dest2.b, p0, [x_dest2, x_pos]
+	st1b	z_dest3.b, p0, [x_dest3, x_pos]
+	st1b	z_dest4.b, p0, [x_dest4, x_pos]
+
+	/* increment one vector length */
+	incb	x_pos
+	b	.Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+	mov	w_ret, #0
+	ret
+
+.return_fail:
+	mov	w_ret, #1
+	ret
diff --git a/erasure_code/aarch64/gf_4vect_mad_sve.S b/erasure_code/aarch64/gf_4vect_mad_sve.S
new file mode 100644
index 00000000..8bf682c5
--- /dev/null
+++ b/erasure_code/aarch64/gf_4vect_mad_sve.S
@@ -0,0 +1,190 @@
+/**************************************************************
+  Copyright (c) 2021 Linaro Ltd.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Huawei Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+.global gf_4vect_mad_sve
+.type gf_4vect_mad_sve, %function
+
+/* gf_4vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+		    unsigned char *src, unsigned char **dest);
+ */
+/* arguments */
+x_len		.req	x0
+x_vec		.req	x1
+x_vec_i		.req	x2
+x_tbl		.req	x3
+x_src		.req	x4
+x_dest		.req	x5
+
+/* returns */
+w_ret		.req	w0
+
+/* local variables */
+x_pos		.req	x6
+x_dest2		.req	x7
+x_dest3		.req	x8
+x_dest4		.req	x9
+x_dest1		.req	x12
+
+/* vectors */
+z_mask0f	.req	z0
+
+z_src		.req	z1
+z_src_lo	.req	z2
+z_src_hi	.req	z_src
+
+z_dest1		.req	z3
+
+z_tmp_lo	.req	z4
+z_tmp_hi	.req	z5
+
+z_gft1_lo	.req	z6
+z_gft1_hi	.req	z7
+q_gft1_lo	.req	q6
+q_gft1_hi	.req	q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo	.req	z17
+z_gft2_hi	.req	z18
+q_gft2_lo	.req	q17
+q_gft2_hi	.req	q18
+
+z_gft3_lo	.req	z19
+z_gft3_hi	.req	z20
+q_gft3_lo	.req	q19
+q_gft3_hi	.req	q20
+
+z_gft4_lo	.req	z21
+z_gft4_hi	.req	z22
+q_gft4_lo	.req	q21
+q_gft4_hi	.req	q22
+
+z_dest2		.req	z27
+z_dest3		.req	z28
+z_dest4		.req	z29
+
+gf_4vect_mad_sve:
+	/* less than 16 bytes, return_fail */
+	cmp	x_len, #16
+	blt	.return_fail
+
+	mov	z_mask0f.b, #0x0f	/* z_mask0f = 0x0F0F...0F */
+	/* load table 1 */
+	add	x_tbl, x_tbl, x_vec_i, LSL #5	/* x_tbl += x_vec_i * 2^5 */
+
+	/* Load table 1 with NEON instruction ldp */
+	ldp	q_gft1_lo, q_gft1_hi, [x_tbl]
+	/* load table 2 */
+	add	x_tbl, x_tbl, x_vec, LSL #5	/* x_tbl += x_vec * 2^5 */
+	ldp	q_gft2_lo, q_gft2_hi, [x_tbl]
+	/* load table 3 */
+	add	x_tbl, x_tbl, x_vec, LSL #5	/* x_tbl += x_vec * 2^5 */
+	ldp	q_gft3_lo, q_gft3_hi, [x_tbl]
+	/* load table 4 */
+	add	x_tbl, x_tbl, x_vec, LSL #5	/* x_tbl += x_vec * 2^5 */
+	ldp	q_gft4_lo, q_gft4_hi, [x_tbl]
+
+	ldr	x_dest1, [x_dest, #8*0]	/* pointer to dest1 */
+	ldr	x_dest2, [x_dest, #8*1]	/* pointer to dest2 */
+	ldr	x_dest3, [x_dest, #8*2]	/* pointer to dest3 */
+	ldr	x_dest4, [x_dest, #8*3]	/* pointer to dest4 */
+
+	mov	x_pos, #0
+
+	/* vector length agnostic */
+.Lloopsve_vl:
+	whilelo	p0.b, x_pos, x_len
+	b.none	.return_pass
+
+	prfb	pldl2strm, p0, [x_dest1, x_pos]
+	prfb	pldl2strm, p0, [x_dest2, x_pos]
+
+	/* load src data, governed by p0 */
+	ld1b	z_src.b, p0/z, [x_src, x_pos]
+
+	/* split 4-bit lo; 4-bit hi */
+	and	z_src_lo.d, z_src.d, z_mask0f.d
+	lsr	z_src_hi.b, z_src.b, #4
+
+	/* load dest data, governed by p0 */
+	ld1b	z_dest1.b, p0/z, [x_dest1, x_pos]
+	ld1b	z_dest2.b, p0/z, [x_dest2, x_pos]
+
+	prfb	pldl2strm, p0, [x_dest3, x_pos]
+	prfb	pldl2strm, p0, [x_dest4, x_pos]
+
+	/* dest1 */
+	/* table indexing, ie. gf(2^8) multiplication */
+	tbl	z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b
+	/* exclusive or, ie. gf(2^8) add */
+	eor	z_dest1.d, z_tmp_lo.d, z_dest1.d
+	eor	z_dest1.d, z_tmp_hi.d, z_dest1.d
+
+	ld1b	z_dest3.b, p0/z, [x_dest3, x_pos]
+	ld1b	z_dest4.b, p0/z, [x_dest4, x_pos]
+
+	/* dest2 */
+	tbl	z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b
+	eor	z_dest2.d, z_tmp_lo.d, z_dest2.d
+	eor	z_dest2.d, z_tmp_hi.d, z_dest2.d
+
+	/* dest3 */
+	tbl	z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b
+	eor	z_dest3.d, z_tmp_lo.d, z_dest3.d
+	eor	z_dest3.d, z_tmp_hi.d, z_dest3.d
+
+	/* store dest data, governed by p0 */
+	st1b	z_dest1.b, p0, [x_dest1, x_pos]
+	st1b	z_dest2.b, p0, [x_dest2, x_pos]
+
+	/* dest4 */
+	tbl	z_tmp_lo.b, {z_gft4_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft4_hi.b}, z_src_hi.b
+	eor	z_dest4.d, z_tmp_lo.d, z_dest4.d
+	eor	z_dest4.d, z_tmp_hi.d, z_dest4.d
+
+	st1b	z_dest3.b, p0, [x_dest3, x_pos]
+	st1b	z_dest4.b, p0, [x_dest4, x_pos]
+	/* increment one vector length */
+	incb	x_pos
+
+	b	.Lloopsve_vl
+
+.return_pass:
+	mov	w_ret, #0
+	ret
+
+.return_fail:
+	mov	w_ret, #1
+	ret
diff --git a/erasure_code/aarch64/gf_5vect_dot_prod_sve.S b/erasure_code/aarch64/gf_5vect_dot_prod_sve.S
|
|
new file mode 100644
|
|
index 00000000..ae999ff4
|
|
--- /dev/null
|
|
+++ b/erasure_code/aarch64/gf_5vect_dot_prod_sve.S
|
|
@@ -0,0 +1,233 @@
|
|
+/*************************************************************
|
|
+ Copyright (c) 2021 Linaro Ltd.
|
|
+
|
|
+ Redistribution and use in source and binary forms, with or without
|
|
+ modification, are permitted provided that the following conditions
|
|
+ are met:
|
|
+ * Redistributions of source code must retain the above copyright
|
|
+ notice, this list of conditions and the following disclaimer.
|
|
+ * Redistributions in binary form must reproduce the above copyright
|
|
+ notice, this list of conditions and the following disclaimer in
|
|
+ the documentation and/or other materials provided with the
|
|
+ distribution.
|
|
+ * Neither the name of Huawei Corporation nor the names of its
|
|
+ contributors may be used to endorse or promote products derived
|
|
+ from this software without specific prior written permission.
|
|
+
|
|
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
+**********************************************************************/
|
|
+.text
|
|
+.align 6
|
|
+.arch armv8-a+sve
|
|
+
|
|
+.global gf_5vect_dot_prod_sve
|
|
+.type gf_5vect_dot_prod_sve, %function
|
|
+/* void gf_5vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
|
|
+ unsigned char **src, unsigned char **dest);
|
|
+ */
|
|
+
|
|
+/* arguments */
|
|
+x_len .req x0 /* vector length */
|
|
+x_vec .req x1 /* number of source vectors (ie. data blocks) */
|
|
+x_tbl .req x2
|
|
+x_src .req x3
|
|
+x_dest .req x4
|
|
+
|
|
+/* returns */
|
|
+w_ret .req w0
|
|
+
|
|
+/* local variables */
|
|
+x_vec_i .req x5
|
|
+x_ptr .req x6
|
|
+x_pos .req x7
|
|
+
|
|
+x_tbl1 .req x8
|
|
+x_tbl2 .req x9
|
|
+x_tbl3 .req x10
|
|
+x_tbl4 .req x11
|
|
+x_tbl5 .req x12
|
|
+x_dest1 .req x13
|
|
+x_dest2 .req x14
|
|
+x_dest4 .req x15
|
|
+x_dest5 .req x_dest /* reused */
|
|
+
|
|
+/* r16,r17,r18,r29,r30: special role registers, avoided */
|
|
+/* r19..r29 and SP must be preserved */
|
|
+x_dest3 .req x19
|
|
+
|
|
+/* vectors */
|
|
+z_mask0f .req z0
|
|
+
|
|
+z_src .req z1
|
|
+z_src_lo .req z2
|
|
+z_src_hi .req z_src
|
|
+
|
|
+z_dest1 .req z3
|
|
+
|
|
+z_gft1_lo .req z4
|
|
+z_gft1_hi .req z5
|
|
+q_gft1_lo .req q4
|
|
+q_gft1_hi .req q5
|
|
+
|
|
+/* bottom 64-bit of v8..v15 must be preserved if used */
|
|
+z_gft2_lo .req z17
|
|
+z_gft2_hi .req z18
|
|
+q_gft2_lo .req q17
|
|
+q_gft2_hi .req q18
|
|
+
|
|
+z_gft3_lo .req z19
|
|
+z_gft3_hi .req z20
|
|
+q_gft3_lo .req q19
|
|
+q_gft3_hi .req q20
|
|
+
|
|
+z_gft4_lo .req z21
|
|
+z_gft4_hi .req z22
|
|
+q_gft4_lo .req q21
|
|
+q_gft4_hi .req q22
|
|
+
|
|
+z_gft5_lo .req z23
|
|
+z_gft5_hi .req z24
|
|
+q_gft5_lo .req q23
|
|
+q_gft5_hi .req q24
|
|
+
|
|
+z_dest2 .req z27
|
|
+z_dest3 .req z28
|
|
+z_dest4 .req z29
|
|
+z_dest5 .req z30
|
|
+
|
|
+gf_5vect_dot_prod_sve:
|
|
+ /* less than 16 bytes, return_fail */
|
|
+ cmp x_len, #16
|
|
+ blt .return_fail
|
|
+
|
|
+ /* save r19..r29 */
|
|
+ sub sp, sp, #16 /* alignment */
|
|
+ str x19, [sp]
|
|
+
|
|
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
|
|
+ mov x_pos, #0
|
|
+ lsl x_vec, x_vec, #3
|
|
+ ldp x_dest1, x_dest2, [x_dest, #8*0]
|
|
+ ldp x_dest3, x_dest4, [x_dest, #8*2]
|
|
+ ldr x_dest5, [x_dest, #8*4]
|
|
+
|
|
+/* Loop 1: x_len, vector length */
|
|
+.Lloopsve_vl:
|
|
+ whilelo p0.b, x_pos, x_len
|
|
+ b.none .return_pass
|
|
+
|
|
+ mov x_vec_i, #0 /* clear x_vec_i */
|
|
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */
|
|
+
|
|
+ mov z_dest1.b, #0 /* clear z_dest1 */
|
|
+ mov z_dest2.b, #0 /* clear z_dest2 */
|
|
+ mov z_dest3.b, #0 /* clear z_dest3 */
|
|
+ mov z_dest4.b, #0 /* clear z_dest4 */
|
|
+ mov z_dest5.b, #0 /* clear z_dest5 */
|
|
+
|
|
+ /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
|
|
+ mov x_tbl1, x_tbl /* reset x_tbl1 */
|
|
+ add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */
|
|
+ add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */
|
|
+ add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */
|
|
+ add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */
|
|
+
|
|
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
|
|
+.Lloopsve_vl_vects:
|
|
+ /* load src data, governed by p0 */
|
|
+ ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */
|
|
+ /* split 4-bit lo; 4-bit hi */
|
|
+ and z_src_lo.d, z_src.d, z_mask0f.d
|
|
+ lsr z_src_hi.b, z_src.b, #4
|
|
+
|
|
+
|
|
+ /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
|
|
+ /* load gf_table's */
|
|
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */
|
|
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
|
|
+
|
|
+ /* prefetch */
|
|
+ prfb pldl2keep, p0, [x_tbl1]
|
|
+ prfb pldl2keep, p0, [x_tbl2]
+
+	/* calc for next */
+	add	x_vec_i, x_vec_i, #8	/* move x_vec_i to next */
+	ldr	x_ptr, [x_src, x_vec_i]	/* x_ptr: src base addr. */
+
+	/* dest 1 */
+	/* table indexing, ie. gf(2^8) multiplication */
+	tbl	z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+	tbl	z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+	/* exclusive or, ie. gf(2^8) add */
+	eor	z_dest1.d, z_gft1_lo.d, z_dest1.d
+	eor	z_dest1.d, z_dest1.d, z_gft1_hi.d
+
+	ldp	q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+	ldp	q_gft4_lo, q_gft4_hi, [x_tbl4], #32
+	prfb	pldl2keep, p0, [x_tbl3]
+	prfb	pldl2keep, p0, [x_tbl4]
+
+	/* dest 2 */
+	tbl	z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+	tbl	z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+	eor	z_dest2.d, z_gft2_lo.d, z_dest2.d
+	eor	z_dest2.d, z_dest2.d, z_gft2_hi.d
+
+	/* dest 3 */
+	tbl	z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b
+	tbl	z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b
+	eor	z_dest3.d, z_gft3_lo.d, z_dest3.d
+	eor	z_dest3.d, z_dest3.d, z_gft3_hi.d
+
+	ldp	q_gft5_lo, q_gft5_hi, [x_tbl5], #32
+	prfb	pldl2keep, p0, [x_tbl5]
+
+	/* dest 4 */
+	tbl	z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b
+	tbl	z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b
+	eor	z_dest4.d, z_gft4_lo.d, z_dest4.d
+	eor	z_dest4.d, z_dest4.d, z_gft4_hi.d
+
+	/* dest 5 */
+	tbl	z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b
+	tbl	z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b
+	eor	z_dest5.d, z_gft5_lo.d, z_dest5.d
+	eor	z_dest5.d, z_dest5.d, z_gft5_hi.d
+
+	cmp	x_vec_i, x_vec
+	blt	.Lloopsve_vl_vects
+/* end of Loop 2 */
+
+	/* store dest data, governed by p0 */
+	st1b	z_dest1.b, p0, [x_dest1, x_pos]
+	st1b	z_dest2.b, p0, [x_dest2, x_pos]
+	st1b	z_dest3.b, p0, [x_dest3, x_pos]
+	st1b	z_dest4.b, p0, [x_dest4, x_pos]
+	st1b	z_dest5.b, p0, [x_dest5, x_pos]
+
+	/* increment one vector length */
+	incb	x_pos
+	b	.Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+	/* restore r19..r29 */
+	ldr	x19, [sp]
+	add	sp, sp, #16
+
+	mov	w_ret, #0
+	ret
+
+.return_fail:
+	mov	w_ret, #1
+	ret
diff --git a/erasure_code/aarch64/gf_5vect_mad_sve.S b/erasure_code/aarch64/gf_5vect_mad_sve.S
new file mode 100644
index 00000000..82e88d98
--- /dev/null
+++ b/erasure_code/aarch64/gf_5vect_mad_sve.S
@@ -0,0 +1,214 @@
+/**************************************************************
+  Copyright (c) 2021 Linaro Ltd.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Huawei Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+.global gf_5vect_mad_sve
+.type gf_5vect_mad_sve, %function
+
+/* gf_5vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+		    unsigned char *src, unsigned char **dest);
+ */
+/* arguments */
+x_len	.req	x0
+x_vec	.req	x1
+x_vec_i	.req	x2
+x_tbl	.req	x3
+x_src	.req	x4
+x_dest	.req	x5
+
+/* returns */
+w_ret	.req	w0
+
+/* local variables */
+x_pos	.req	x6
+x_dest2	.req	x7
+x_dest3	.req	x8
+x_dest4	.req	x9
+x_dest5	.req	x10
+x_dest1	.req	x12
+
+/* vectors */
+z_mask0f	.req	z0
+
+z_src	.req	z1
+z_src_lo	.req	z2
+z_src_hi	.req	z_src
+
+z_dest1	.req	z3
+
+z_tmp_lo	.req	z4
+z_tmp_hi	.req	z5
+
+z_gft1_lo	.req	z6
+z_gft1_hi	.req	z7
+q_gft1_lo	.req	q6
+q_gft1_hi	.req	q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo	.req	z17
+z_gft2_hi	.req	z18
+q_gft2_lo	.req	q17
+q_gft2_hi	.req	q18
+
+z_gft3_lo	.req	z19
+z_gft3_hi	.req	z20
+q_gft3_lo	.req	q19
+q_gft3_hi	.req	q20
+
+z_gft4_lo	.req	z21
+z_gft4_hi	.req	z22
+q_gft4_lo	.req	q21
+q_gft4_hi	.req	q22
+
+z_gft5_lo	.req	z23
+z_gft5_hi	.req	z24
+q_gft5_lo	.req	q23
+q_gft5_hi	.req	q24
+
+z_dest2	.req	z27
+z_dest3	.req	z28
+z_dest4	.req	z29
+z_dest5	.req	z30
+
+gf_5vect_mad_sve:
+	/* less than 16 bytes, return_fail */
+	cmp	x_len, #16
+	blt	.return_fail
+
+	mov	z_mask0f.b, #0x0f	/* z_mask0f = 0x0F0F...0F */
+	/* load table 1 */
+	add	x_tbl, x_tbl, x_vec_i, LSL #5	/* x_tbl += x_vec_i * 2^5 */
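+	/* mad works on a single source block (vec_i), so each dest's
+	 * 32-byte table is fetched once, ahead of the loop */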
+
+	/* Load with NEON instruction ldp */
+	ldp	q_gft1_lo, q_gft1_hi, [x_tbl]
+	/* load table 2 */
+	add	x_tbl, x_tbl, x_vec, LSL #5	/* x_tbl += x_vec * 2^5 */
+	ldp	q_gft2_lo, q_gft2_hi, [x_tbl]
+	/* load table 3 */
+	add	x_tbl, x_tbl, x_vec, LSL #5	/* x_tbl += x_vec * 2^5 */
+	ldp	q_gft3_lo, q_gft3_hi, [x_tbl]
+	/* load table 4 */
+	add	x_tbl, x_tbl, x_vec, LSL #5	/* x_tbl += x_vec * 2^5 */
+	ldp	q_gft4_lo, q_gft4_hi, [x_tbl]
+	/* load table 5 */
+	add	x_tbl, x_tbl, x_vec, LSL #5	/* x_tbl += x_vec * 2^5 */
+	ldp	q_gft5_lo, q_gft5_hi, [x_tbl]
+
+	ldr	x_dest1, [x_dest, #8*0]	/* pointer to dest1 */
+	ldr	x_dest2, [x_dest, #8*1]	/* pointer to dest2 */
+	ldr	x_dest3, [x_dest, #8*2]	/* pointer to dest3 */
+	ldr	x_dest4, [x_dest, #8*3]	/* pointer to dest4 */
+	ldr	x_dest5, [x_dest, #8*4]	/* pointer to dest5 */
+
+	mov	x_pos, #0
+
+	/* vector length agnostic */
+.Lloopsve_vl:
+	whilelo	p0.b, x_pos, x_len
+	b.none	.return_pass
+
+	prfb	pldl2strm, p0, [x_dest1, x_pos]
+	prfb	pldl2strm, p0, [x_dest2, x_pos]
+
+	/* load src data, governed by p0 */
+	ld1b	z_src.b, p0/z, [x_src, x_pos]
+
+	/* split 4-bit lo; 4-bit hi */
+	and	z_src_lo.d, z_src.d, z_mask0f.d
+	lsr	z_src_hi.b, z_src.b, #4
+
+	/* load dest data, governed by p0 */
+	ld1b	z_dest1.b, p0/z, [x_dest1, x_pos]
+	ld1b	z_dest2.b, p0/z, [x_dest2, x_pos]
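+	/* unlike dot_prod, mad reads the current dest blocks and xors the
+	 * new partial products into them (read-modify-write) */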
+
+	prfb	pldl2strm, p0, [x_dest3, x_pos]
+	prfb	pldl2strm, p0, [x_dest4, x_pos]
+
+	/* dest1 */
+	/* table indexing, ie. gf(2^8) multiplication */
+	tbl	z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b
+	/* exclusive or, ie. gf(2^8) add */
+	eor	z_dest1.d, z_tmp_lo.d, z_dest1.d
+	eor	z_dest1.d, z_tmp_hi.d, z_dest1.d
+
+	ld1b	z_dest3.b, p0/z, [x_dest3, x_pos]
+	ld1b	z_dest4.b, p0/z, [x_dest4, x_pos]
+	prfb	pldl2strm, p0, [x_dest5, x_pos]
+
+	/* dest2 */
+	tbl	z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b
+	eor	z_dest2.d, z_tmp_lo.d, z_dest2.d
+	eor	z_dest2.d, z_tmp_hi.d, z_dest2.d
+
+	ld1b	z_dest5.b, p0/z, [x_dest5, x_pos]
+
+	/* store dest data, governed by p0 */
+	st1b	z_dest1.b, p0, [x_dest1, x_pos]
+
+	/* dest3 */
+	tbl	z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b
+	eor	z_dest3.d, z_tmp_lo.d, z_dest3.d
+	eor	z_dest3.d, z_tmp_hi.d, z_dest3.d
+
+	/* dest4 */
+	tbl	z_tmp_lo.b, {z_gft4_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft4_hi.b}, z_src_hi.b
+	eor	z_dest4.d, z_tmp_lo.d, z_dest4.d
+	eor	z_dest4.d, z_tmp_hi.d, z_dest4.d
+
+	/* store dest data, governed by p0 */
+	st1b	z_dest2.b, p0, [x_dest2, x_pos]
+	st1b	z_dest3.b, p0, [x_dest3, x_pos]
+
+	/* dest5 */
+	tbl	z_tmp_lo.b, {z_gft5_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft5_hi.b}, z_src_hi.b
+	eor	z_dest5.d, z_tmp_lo.d, z_dest5.d
+	eor	z_dest5.d, z_tmp_hi.d, z_dest5.d
+
+	/* store dest data, governed by p0 */
+	st1b	z_dest4.b, p0, [x_dest4, x_pos]
+	st1b	z_dest5.b, p0, [x_dest5, x_pos]
+	/* increment one vector length */
+	incb	x_pos
+
+	b	.Lloopsve_vl
+
+.return_pass:
+	mov	w_ret, #0
+	ret
+
+.return_fail:
+	mov	w_ret, #1
+	ret
diff --git a/erasure_code/aarch64/gf_6vect_dot_prod_sve.S b/erasure_code/aarch64/gf_6vect_dot_prod_sve.S
new file mode 100644
index 00000000..1196bc19
--- /dev/null
+++ b/erasure_code/aarch64/gf_6vect_dot_prod_sve.S
@@ -0,0 +1,254 @@
+/*************************************************************
+  Copyright (c) 2021 Linaro Ltd.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Huawei Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+.global gf_6vect_dot_prod_sve
+.type gf_6vect_dot_prod_sve, %function
+/* void gf_6vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				unsigned char **src, unsigned char **dest);
+ */
+
+/* arguments */
+x_len	.req	x0	/* vector length */
+x_vec	.req	x1	/* number of source vectors (ie. data blocks) */
+x_tbl	.req	x2
+x_src	.req	x3
+x_dest	.req	x4
+
+/* returns */
+w_ret	.req	w0
+
+/* local variables */
+x_vec_i	.req	x5
+x_ptr	.req	x6
+x_pos	.req	x7
+
+x_tbl1	.req	x8
+x_tbl2	.req	x9
+x_tbl3	.req	x10
+x_tbl4	.req	x11
+x_tbl5	.req	x12
+x_tbl6	.req	x13
+x_dest1	.req	x14
+x_dest2	.req	x15
+x_dest6	.req	x_dest	/* reused */
+
+/* r16,r17,r18,r29,r30: special role registers, avoided */
+/* r19..r29 and SP must be preserved */
+x_dest3	.req	x19
+x_dest4	.req	x20
+x_dest5	.req	x21
+
+/* vectors */
+z_mask0f	.req	z0
+
+z_src	.req	z1
+z_src_lo	.req	z2
+z_src_hi	.req	z_src
+
+z_dest1	.req	z3
+
+z_gft1_lo	.req	z4
+z_gft1_hi	.req	z5
+q_gft1_lo	.req	q4
+q_gft1_hi	.req	q5
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo	.req	z17
+z_gft2_hi	.req	z18
+q_gft2_lo	.req	q17
+q_gft2_hi	.req	q18
+
+z_gft3_lo	.req	z19
+z_gft3_hi	.req	z20
+q_gft3_lo	.req	q19
+q_gft3_hi	.req	q20
+
+z_gft4_lo	.req	z21
+z_gft4_hi	.req	z22
+q_gft4_lo	.req	q21
+q_gft4_hi	.req	q22
+
+z_gft5_lo	.req	z23
+z_gft5_hi	.req	z24
+q_gft5_lo	.req	q23
+q_gft5_hi	.req	q24
+
+z_gft6_lo	.req	z25
+z_gft6_hi	.req	z26
+q_gft6_lo	.req	q25
+q_gft6_hi	.req	q26
+
+z_dest2	.req	z27
+z_dest3	.req	z28
+z_dest4	.req	z29
+z_dest5	.req	z30
+z_dest6	.req	z31
+
+gf_6vect_dot_prod_sve:
+	/* less than 16 bytes, return_fail */
+	cmp	x_len, #16
+	blt	.return_fail
+
+	/* save r19..r29 */
+	sub	sp, sp, #32	/* alignment */
+	stp	x19, x20, [sp]
+	str	x21, [sp, #16]
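+	/* frame is rounded up to a multiple of 16 so SP stays aligned as
+	 * required by AAPCS64 */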
+
+	mov	z_mask0f.b, #0x0f	/* z_mask0f = 0x0F0F...0F */
+	mov	x_pos, #0
+	lsl	x_vec, x_vec, #3
+	ldp	x_dest1, x_dest2, [x_dest, #8*0]
+	ldp	x_dest3, x_dest4, [x_dest, #8*2]
+	ldp	x_dest5, x_dest6, [x_dest, #8*4]	/* x_dest6 reuses x_dest */
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+	whilelo	p0.b, x_pos, x_len
+	b.none	.return_pass
+
+	mov	x_vec_i, #0	/* clear x_vec_i */
+	ldr	x_ptr, [x_src, x_vec_i]	/* x_ptr: src base addr. */
+
+	mov	z_dest1.b, #0	/* clear z_dest1 */
+	mov	z_dest2.b, #0	/* clear z_dest2 */
+	mov	z_dest3.b, #0	/* clear z_dest3 */
+	mov	z_dest4.b, #0	/* clear z_dest4 */
+	mov	z_dest5.b, #0	/* clear z_dest5 */
+	mov	z_dest6.b, #0	/* clear z_dest6 */
+
+	/* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
+	mov	x_tbl1, x_tbl	/* reset x_tbl1 */
+	add	x_tbl2, x_tbl1, x_vec, LSL #2	/* reset x_tbl2 */
+	add	x_tbl3, x_tbl2, x_vec, LSL #2	/* reset x_tbl3 */
+	add	x_tbl4, x_tbl3, x_vec, LSL #2	/* reset x_tbl4 */
+	add	x_tbl5, x_tbl4, x_vec, LSL #2	/* reset x_tbl5 */
+	add	x_tbl6, x_tbl5, x_vec, LSL #2	/* reset x_tbl6 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+	/* load src data, governed by p0 */
+	ld1b	z_src.b, p0/z, [x_ptr, x_pos]	/* load from: src base + pos offset */
+	/* split 4-bit lo; 4-bit hi */
+	and	z_src_lo.d, z_src.d, z_mask0f.d
+	lsr	z_src_hi.b, z_src.b, #4
+
+
+	/* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
+	/* load gf_table's */
+	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32	/* x_tbl1 is post-added by #32 for each src vect */
+	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+	/* prefetch */
+	prfb	pldl2keep, p0, [x_tbl1]
+	prfb	pldl2keep, p0, [x_tbl2]
+
+	/* calc for next and prefetch */
+	add	x_vec_i, x_vec_i, #8	/* move x_vec_i to next */
+	ldr	x_ptr, [x_src, x_vec_i]	/* x_ptr: src base addr. */
+
+	/* dest 1 */
+	/* table indexing, ie. gf(2^8) multiplication */
+	tbl	z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+	tbl	z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+	/* exclusive or, ie. gf(2^8) add */
+	eor	z_dest1.d, z_gft1_lo.d, z_dest1.d
+	eor	z_dest1.d, z_dest1.d, z_gft1_hi.d
+
+	ldp	q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+	ldp	q_gft4_lo, q_gft4_hi, [x_tbl4], #32
+	prfb	pldl2keep, p0, [x_tbl3]
+	prfb	pldl2keep, p0, [x_tbl4]
+
+	/* dest 2 */
+	tbl	z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+	tbl	z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+	eor	z_dest2.d, z_gft2_lo.d, z_dest2.d
+	eor	z_dest2.d, z_dest2.d, z_gft2_hi.d
+
+	/* dest 3 */
+	tbl	z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b
+	tbl	z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b
+	eor	z_dest3.d, z_gft3_lo.d, z_dest3.d
+	eor	z_dest3.d, z_dest3.d, z_gft3_hi.d
+
+	ldp	q_gft5_lo, q_gft5_hi, [x_tbl5], #32
+	ldp	q_gft6_lo, q_gft6_hi, [x_tbl6], #32
+	prfb	pldl2keep, p0, [x_tbl5]
+	prfb	pldl2keep, p0, [x_tbl6]
+
+	/* dest 4 */
+	tbl	z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b
+	tbl	z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b
+	eor	z_dest4.d, z_gft4_lo.d, z_dest4.d
+	eor	z_dest4.d, z_dest4.d, z_gft4_hi.d
+
+	/* dest 5 */
+	tbl	z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b
+	tbl	z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b
+	eor	z_dest5.d, z_gft5_lo.d, z_dest5.d
+	eor	z_dest5.d, z_dest5.d, z_gft5_hi.d
+
+	/* dest 6 */
+	tbl	z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b
+	tbl	z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b
+	eor	z_dest6.d, z_gft6_lo.d, z_dest6.d
+	eor	z_dest6.d, z_dest6.d, z_gft6_hi.d
+
+	cmp	x_vec_i, x_vec
+	blt	.Lloopsve_vl_vects
+/* end of Loop 2 */
+
+	/* store dest data, governed by p0 */
+	st1b	z_dest1.b, p0, [x_dest1, x_pos]
+	st1b	z_dest2.b, p0, [x_dest2, x_pos]
+	st1b	z_dest3.b, p0, [x_dest3, x_pos]
+	st1b	z_dest4.b, p0, [x_dest4, x_pos]
+	st1b	z_dest5.b, p0, [x_dest5, x_pos]
+	st1b	z_dest6.b, p0, [x_dest6, x_pos]
+
+	/* increment one vector length */
+	incb	x_pos
+	b	.Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+	/* restore r19..r29 */
+	ldr	x21, [sp, #16]
+	ldp	x19, x20, [sp]
+	add	sp, sp, #32
+
+	mov	w_ret, #0
+	ret
+
+.return_fail:
+	mov	w_ret, #1
+	ret
diff --git a/erasure_code/aarch64/gf_6vect_mad_sve.S b/erasure_code/aarch64/gf_6vect_mad_sve.S
new file mode 100644
index 00000000..670e664d
--- /dev/null
+++ b/erasure_code/aarch64/gf_6vect_mad_sve.S
@@ -0,0 +1,233 @@
+/**************************************************************
+  Copyright (c) 2021 Linaro Ltd.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Huawei Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+.global gf_6vect_mad_sve
+.type gf_6vect_mad_sve, %function
+
+/* gf_6vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+		    unsigned char *src, unsigned char **dest);
+ */
+/* arguments */
+x_len	.req	x0
+x_vec	.req	x1
+x_vec_i	.req	x2
+x_tbl	.req	x3
+x_src	.req	x4
+x_dest	.req	x5
+
+/* returns */
+w_ret	.req	w0
+
+/* local variables */
+x_pos	.req	x6
+x_dest2	.req	x7
+x_dest3	.req	x8
+x_dest4	.req	x9
+x_dest5	.req	x10
+x_dest6	.req	x11
+x_dest1	.req	x12
+
+/* vectors */
+z_mask0f	.req	z0
+
+z_src	.req	z1
+z_src_lo	.req	z2
+z_src_hi	.req	z_src
+
+z_dest1	.req	z3
+
+z_tmp_lo	.req	z4
+z_tmp_hi	.req	z5
+
+z_gft1_lo	.req	z6
+z_gft1_hi	.req	z7
+q_gft1_lo	.req	q6
+q_gft1_hi	.req	q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo	.req	z17
+z_gft2_hi	.req	z18
+q_gft2_lo	.req	q17
+q_gft2_hi	.req	q18
+
+z_gft3_lo	.req	z19
+z_gft3_hi	.req	z20
+q_gft3_lo	.req	q19
+q_gft3_hi	.req	q20
+
+z_gft4_lo	.req	z21
+z_gft4_hi	.req	z22
+q_gft4_lo	.req	q21
+q_gft4_hi	.req	q22
+
+z_gft5_lo	.req	z23
+z_gft5_hi	.req	z24
+q_gft5_lo	.req	q23
+q_gft5_hi	.req	q24
+
+z_gft6_lo	.req	z25
+z_gft6_hi	.req	z26
+q_gft6_lo	.req	q25
+q_gft6_hi	.req	q26
+
+z_dest2	.req	z27
+z_dest3	.req	z28
+z_dest4	.req	z29
+z_dest5	.req	z30
+z_dest6	.req	z31
+
+gf_6vect_mad_sve:
+	/* less than 16 bytes, return_fail */
+	cmp	x_len, #16
+	blt	.return_fail
+
+	mov	z_mask0f.b, #0x0f	/* z_mask0f = 0x0F0F...0F */
+	/* load table 1 */
+	add	x_tbl, x_tbl, x_vec_i, LSL #5	/* x_tbl += x_vec_i * 2^5 */
+
+	/* Load with NEON instruction ldp */
+	ldp	q_gft1_lo, q_gft1_hi, [x_tbl]
+	/* load table 2 */
+	add	x_tbl, x_tbl, x_vec, LSL #5	/* x_tbl += x_vec * 2^5 */
+	ldp	q_gft2_lo, q_gft2_hi, [x_tbl]
+	/* load table 3 */
+	add	x_tbl, x_tbl, x_vec, LSL #5	/* x_tbl += x_vec * 2^5 */
+	ldp	q_gft3_lo, q_gft3_hi, [x_tbl]
+	/* load table 4 */
+	add	x_tbl, x_tbl, x_vec, LSL #5	/* x_tbl += x_vec * 2^5 */
+	ldp	q_gft4_lo, q_gft4_hi, [x_tbl]
+	/* load table 5 */
+	add	x_tbl, x_tbl, x_vec, LSL #5	/* x_tbl += x_vec * 2^5 */
+	ldp	q_gft5_lo, q_gft5_hi, [x_tbl]
+	/* load table 6 */
+	add	x_tbl, x_tbl, x_vec, LSL #5	/* x_tbl += x_vec * 2^5 */
+	ldp	q_gft6_lo, q_gft6_hi, [x_tbl]
+
+	ldr	x_dest1, [x_dest, #8*0]	/* pointer to dest1 */
+	ldr	x_dest2, [x_dest, #8*1]	/* pointer to dest2 */
+	ldr	x_dest3, [x_dest, #8*2]	/* pointer to dest3 */
+	ldr	x_dest4, [x_dest, #8*3]	/* pointer to dest4 */
+	ldr	x_dest5, [x_dest, #8*4]	/* pointer to dest5 */
+	ldr	x_dest6, [x_dest, #8*5]	/* pointer to dest6 */
+
+	mov	x_pos, #0
+
+	/* vector length agnostic */
+.Lloopsve_vl:
+	whilelo	p0.b, x_pos, x_len
+	b.none	.return_pass
+
+	prfb	pldl2strm, p0, [x_dest1, x_pos]
+	prfb	pldl2strm, p0, [x_dest2, x_pos]
+
+	/* load src data, governed by p0 */
+	ld1b	z_src.b, p0/z, [x_src, x_pos]
+
+	/* split 4-bit lo; 4-bit hi */
+	and	z_src_lo.d, z_src.d, z_mask0f.d
+	lsr	z_src_hi.b, z_src.b, #4
+
+	/* load dest data, governed by p0 */
+	ld1b	z_dest1.b, p0/z, [x_dest1, x_pos]
+	ld1b	z_dest2.b, p0/z, [x_dest2, x_pos]
+
+	prfb	pldl2strm, p0, [x_dest3, x_pos]
+	prfb	pldl2strm, p0, [x_dest4, x_pos]
+
+	/* dest1 */
+	/* table indexing, ie. gf(2^8) multiplication */
+	tbl	z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b
+	/* exclusive or, ie. gf(2^8) add */
+	eor	z_dest1.d, z_tmp_lo.d, z_dest1.d
+	eor	z_dest1.d, z_tmp_hi.d, z_dest1.d
+
+	ld1b	z_dest3.b, p0/z, [x_dest3, x_pos]
+	ld1b	z_dest4.b, p0/z, [x_dest4, x_pos]
+
+	prfb	pldl2strm, p0, [x_dest5, x_pos]
+	prfb	pldl2strm, p0, [x_dest6, x_pos]
+
+	/* dest2 */
+	tbl	z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b
+	eor	z_dest2.d, z_tmp_lo.d, z_dest2.d
+	eor	z_dest2.d, z_tmp_hi.d, z_dest2.d
+
+	ld1b	z_dest5.b, p0/z, [x_dest5, x_pos]
+	ld1b	z_dest6.b, p0/z, [x_dest6, x_pos]
+
+	/* dest3 */
+	tbl	z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b
+	eor	z_dest3.d, z_tmp_lo.d, z_dest3.d
+	eor	z_dest3.d, z_tmp_hi.d, z_dest3.d
+
+	/* store dest data, governed by p0 */
+	st1b	z_dest1.b, p0, [x_dest1, x_pos]
+	st1b	z_dest2.b, p0, [x_dest2, x_pos]
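+	/* dest1/dest2 are final here; issuing their stores early can
+	 * overlap the memory traffic with the remaining tbl/eor work */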
+
+	/* dest4 */
+	tbl	z_tmp_lo.b, {z_gft4_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft4_hi.b}, z_src_hi.b
+	eor	z_dest4.d, z_tmp_lo.d, z_dest4.d
+	eor	z_dest4.d, z_tmp_hi.d, z_dest4.d
+
+	/* dest5 */
+	tbl	z_tmp_lo.b, {z_gft5_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft5_hi.b}, z_src_hi.b
+	eor	z_dest5.d, z_tmp_lo.d, z_dest5.d
+	eor	z_dest5.d, z_tmp_hi.d, z_dest5.d
+
+	st1b	z_dest3.b, p0, [x_dest3, x_pos]
+	st1b	z_dest4.b, p0, [x_dest4, x_pos]
+
+	/* dest6 */
+	tbl	z_tmp_lo.b, {z_gft6_lo.b}, z_src_lo.b
+	tbl	z_tmp_hi.b, {z_gft6_hi.b}, z_src_hi.b
+	eor	z_dest6.d, z_tmp_lo.d, z_dest6.d
+	eor	z_dest6.d, z_tmp_hi.d, z_dest6.d
+
+	st1b	z_dest5.b, p0, [x_dest5, x_pos]
+	st1b	z_dest6.b, p0, [x_dest6, x_pos]
+	/* increment one vector length */
+	incb	x_pos
+
+	b	.Lloopsve_vl
+
+.return_pass:
+	mov	w_ret, #0
+	ret
+
+.return_fail:
+	mov	w_ret, #1
+	ret
diff --git a/erasure_code/aarch64/gf_7vect_dot_prod_sve.S b/erasure_code/aarch64/gf_7vect_dot_prod_sve.S
new file mode 100644
index 00000000..cccaec5c
--- /dev/null
+++ b/erasure_code/aarch64/gf_7vect_dot_prod_sve.S
@@ -0,0 +1,277 @@
+/*************************************************************
+  Copyright (c) 2021 Linaro Ltd.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Huawei Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+.global gf_7vect_dot_prod_sve
+.type gf_7vect_dot_prod_sve, %function
+/* void gf_7vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				unsigned char **src, unsigned char **dest);
+ */
+
+/* arguments */
+x_len	.req	x0	/* vector length */
+x_vec	.req	x1	/* number of source vectors (ie. data blocks) */
+x_tbl	.req	x2
+x_src	.req	x3
+x_dest	.req	x4
+
+/* returns */
+w_ret	.req	w0
+
+/* local variables */
+x_vec_i	.req	x5
+x_ptr	.req	x6
+x_pos	.req	x7
+
+x_tbl1	.req	x8
+x_tbl2	.req	x9
+x_tbl3	.req	x10
+x_tbl4	.req	x11
+x_tbl5	.req	x12
+x_tbl6	.req	x13
+x_tbl7	.req	x14
+
+x_dest1	.req	x15
+
+/* r16,r17,r18,r29,r30: special role registers, avoided */
+/* r19..r29 and SP must be preserved */
+x_dest2	.req	x19
+x_dest3	.req	x20
+x_dest4	.req	x21
+x_dest5	.req	x22
+x_dest6	.req	x23
+x_dest7	.req	x_dest	/* reused */
+
+/* vectors */
+z_mask0f	.req	z0
+
+z_src	.req	z1
+z_src_lo	.req	z2
+z_src_hi	.req	z_src
+
+z_dest1	.req	z3
+z_gft1_lo	.req	z4
+z_gft1_hi	.req	z5
+q_gft1_lo	.req	q4
+q_gft1_hi	.req	q5
+
+z_gft7_lo	.req	z6
+z_gft7_hi	.req	z7
+q_gft7_lo	.req	q6
+q_gft7_hi	.req	q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_dest7	.req	z16
+
+z_gft2_lo	.req	z17
+z_gft2_hi	.req	z18
+q_gft2_lo	.req	q17
+q_gft2_hi	.req	q18
+
+z_gft3_lo	.req	z19
+z_gft3_hi	.req	z20
+q_gft3_lo	.req	q19
+q_gft3_hi	.req	q20
+
+z_gft4_lo	.req	z21
+z_gft4_hi	.req	z22
+q_gft4_lo	.req	q21
+q_gft4_hi	.req	q22
+
+z_gft5_lo	.req	z23
+z_gft5_hi	.req	z24
+q_gft5_lo	.req	q23
+q_gft5_hi	.req	q24
+
+z_gft6_lo	.req	z25
+z_gft6_hi	.req	z26
+q_gft6_lo	.req	q25
+q_gft6_hi	.req	q26
+
+z_dest2	.req	z27
+z_dest3	.req	z28
+z_dest4	.req	z29
+z_dest5	.req	z30
+z_dest6	.req	z31
+
+gf_7vect_dot_prod_sve:
+	/* less than 16 bytes, return_fail */
+	cmp	x_len, #16
+	blt	.return_fail
+
+	/* save r19..r29 */
+	sub	sp, sp, #48	/* alignment */
+	stp	x19, x20, [sp]
+	stp	x21, x22, [sp, #16]
+	str	x23, [sp, #32]
+
+	mov	z_mask0f.b, #0x0f	/* z_mask0f = 0x0F0F...0F */
+	mov	x_pos, #0
+	lsl	x_vec, x_vec, #3
+	ldp	x_dest1, x_dest2, [x_dest, #8*0]
+	ldp	x_dest3, x_dest4, [x_dest, #8*2]
+	ldp	x_dest5, x_dest6, [x_dest, #8*4]
+	ldr	x_dest7, [x_dest, #8*6]	/* x_dest7 reuses x_dest */
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+	whilelo	p0.b, x_pos, x_len
+	b.none	.return_pass
+
+	mov	x_vec_i, #0	/* clear x_vec_i */
+	ldr	x_ptr, [x_src, x_vec_i]	/* x_ptr: src base addr. */
+
+	mov	z_dest1.b, #0	/* clear z_dest1 */
+	mov	z_dest2.b, #0	/* clear z_dest2 */
+	mov	z_dest3.b, #0	/* clear z_dest3 */
+	mov	z_dest4.b, #0	/* clear z_dest4 */
+	mov	z_dest5.b, #0	/* clear z_dest5 */
+	mov	z_dest6.b, #0	/* clear z_dest6 */
+	mov	z_dest7.b, #0	/* clear z_dest7 */
+
+	/* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
+	mov	x_tbl1, x_tbl	/* reset x_tbl1 */
+	add	x_tbl2, x_tbl1, x_vec, LSL #2	/* reset x_tbl2 */
+	add	x_tbl3, x_tbl2, x_vec, LSL #2	/* reset x_tbl3 */
+	add	x_tbl4, x_tbl3, x_vec, LSL #2	/* reset x_tbl4 */
+	add	x_tbl5, x_tbl4, x_vec, LSL #2	/* reset x_tbl5 */
+	add	x_tbl6, x_tbl5, x_vec, LSL #2	/* reset x_tbl6 */
+	add	x_tbl7, x_tbl6, x_vec, LSL #2	/* reset x_tbl7 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+	/* load src data, governed by p0 */
+	ld1b	z_src.b, p0/z, [x_ptr, x_pos]	/* load from: src base + pos offset */
+	/* split 4-bit lo; 4-bit hi */
+	and	z_src_lo.d, z_src.d, z_mask0f.d
+	lsr	z_src_hi.b, z_src.b, #4
+
+	/* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
+	/* load gf_table's */
+	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32	/* x_tbl1 is post-added by #32 for each src vect */
+	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+	/* prefetch */
+	prfb	pldl2keep, p0, [x_tbl1]
+	prfb	pldl2keep, p0, [x_tbl2]
+
+	/* calc for next and prefetch */
+	add	x_vec_i, x_vec_i, #8	/* move x_vec_i to next */
+	ldr	x_ptr, [x_src, x_vec_i]	/* x_ptr: src base addr. */
+
+	/* dest 1 */
+	/* table indexing, ie. gf(2^8) multiplication */
+	tbl	z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+	tbl	z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+	/* exclusive or, ie. gf(2^8) add */
+	eor	z_dest1.d, z_gft1_lo.d, z_dest1.d
+	eor	z_dest1.d, z_gft1_hi.d, z_dest1.d
+
+	ldp	q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+	ldp	q_gft4_lo, q_gft4_hi, [x_tbl4], #32
+	prfb	pldl2keep, p0, [x_tbl3]
+	prfb	pldl2keep, p0, [x_tbl4]
+
+	/* dest 2 */
+	tbl	z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+	tbl	z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+	eor	z_dest2.d, z_gft2_lo.d, z_dest2.d
+	eor	z_dest2.d, z_gft2_hi.d, z_dest2.d
+
+	/* dest 3 */
+	tbl	z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b
+	tbl	z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b
+	eor	z_dest3.d, z_gft3_lo.d, z_dest3.d
+	eor	z_dest3.d, z_gft3_hi.d, z_dest3.d
+
+	ldp	q_gft5_lo, q_gft5_hi, [x_tbl5], #32
+	ldp	q_gft6_lo, q_gft6_hi, [x_tbl6], #32
+	prfb	pldl2keep, p0, [x_tbl5]
+	prfb	pldl2keep, p0, [x_tbl6]
+
+	/* dest 4 */
+	tbl	z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b
+	tbl	z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b
+	eor	z_dest4.d, z_gft4_lo.d, z_dest4.d
+	eor	z_dest4.d, z_gft4_hi.d, z_dest4.d
+
+	/* dest 5 */
+	tbl	z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b
+	tbl	z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b
+	eor	z_dest5.d, z_gft5_lo.d, z_dest5.d
+	eor	z_dest5.d, z_gft5_hi.d, z_dest5.d
+
+	ldp	q_gft7_lo, q_gft7_hi, [x_tbl7], #32
+	prfb	pldl2keep, p0, [x_tbl7]
+
+	/* dest 6 */
+	tbl	z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b
+	tbl	z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b
+	eor	z_dest6.d, z_gft6_lo.d, z_dest6.d
+	eor	z_dest6.d, z_gft6_hi.d, z_dest6.d
+
+	/* dest 7 */
+	tbl	z_gft7_lo.b, {z_gft7_lo.b}, z_src_lo.b
+	tbl	z_gft7_hi.b, {z_gft7_hi.b}, z_src_hi.b
+	eor	z_dest7.d, z_gft7_lo.d, z_dest7.d
+	eor	z_dest7.d, z_gft7_hi.d, z_dest7.d
+
+	cmp	x_vec_i, x_vec
+	blt	.Lloopsve_vl_vects
+/* end of Loop 2 */
+
+	/* store dest data, governed by p0 */
+	st1b	z_dest1.b, p0, [x_dest1, x_pos]
+	st1b	z_dest2.b, p0, [x_dest2, x_pos]
+	st1b	z_dest3.b, p0, [x_dest3, x_pos]
+	st1b	z_dest4.b, p0, [x_dest4, x_pos]
+	st1b	z_dest5.b, p0, [x_dest5, x_pos]
+	st1b	z_dest6.b, p0, [x_dest6, x_pos]
+	st1b	z_dest7.b, p0, [x_dest7, x_pos]
+
+	/* increment one vector length */
+	incb	x_pos
+	b	.Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+	/* restore r19..r29 */
+	ldr	x23, [sp, #32]
+	ldp	x21, x22, [sp, #16]
+	ldp	x19, x20, [sp]
+	add	sp, sp, #48
+
+	mov	w_ret, #0
+	ret
+
+.return_fail:
+	mov	w_ret, #1
+	ret
diff --git a/erasure_code/aarch64/gf_8vect_dot_prod_sve.S b/erasure_code/aarch64/gf_8vect_dot_prod_sve.S
new file mode 100644
index 00000000..ee839a43
--- /dev/null
+++ b/erasure_code/aarch64/gf_8vect_dot_prod_sve.S
@@ -0,0 +1,303 @@
+/*************************************************************
+  Copyright (c) 2021 Linaro Ltd.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Huawei Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+.global gf_8vect_dot_prod_sve
+.type gf_8vect_dot_prod_sve, %function
+/* void gf_8vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				unsigned char **src, unsigned char **dest);
+ */
+
+/* arguments */
+x_len	.req	x0	/* vector length */
+x_vec	.req	x1	/* number of source vectors (ie. data blocks) */
+x_tbl	.req	x2
+x_src	.req	x3
+x_dest	.req	x4
+
+/* returns */
+w_ret	.req	w0
+
+/* local variables */
+x_vec_i	.req	x5
+x_ptr	.req	x6
+x_pos	.req	x7
+
+x_tbl1	.req	x8
+x_tbl2	.req	x9
+x_tbl3	.req	x10
+x_tbl4	.req	x11
+x_tbl5	.req	x12
+x_tbl6	.req	x13
+x_tbl7	.req	x14
+
+x_dest1	.req	x15
+
+/* r16,r17,r18,r29,r30: special role registers, avoided */
+/* r19..r29 and SP must be preserved */
+x_dest2	.req	x19
+x_dest3	.req	x20
+x_dest4	.req	x21
+x_dest5	.req	x22
+x_dest6	.req	x23
+x_dest7	.req	x24
+x_dest8	.req	x_dest	/* reused */
+x_tbl8	.req	x25
+
+/* vectors */
+z_mask0f	.req	z0
+
+z_src	.req	z1
+z_src_lo	.req	z2
+z_src_hi	.req	z_src
+
+z_dest1	.req	z3
+z_gft1_lo	.req	z4
+z_gft1_hi	.req	z5
+q_gft1_lo	.req	q4
+q_gft1_hi	.req	q5
+
+z_gft7_lo	.req	z6
+z_gft7_hi	.req	z7
+q_gft7_lo	.req	q6
+q_gft7_hi	.req	q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_dest7	.req	z8
+
+z_gft8_lo	.req	z9
+z_gft8_hi	.req	z10
+q_gft8_lo	.req	q9
+q_gft8_hi	.req	q10
+
+z_dest8	.req	z16
+
+z_gft2_lo	.req	z17
+z_gft2_hi	.req	z18
+q_gft2_lo	.req	q17
+q_gft2_hi	.req	q18
+
+z_gft3_lo	.req	z19
+z_gft3_hi	.req	z20
+q_gft3_lo	.req	q19
+q_gft3_hi	.req	q20
+
+z_gft4_lo	.req	z21
+z_gft4_hi	.req	z22
+q_gft4_lo	.req	q21
+q_gft4_hi	.req	q22
+
+z_gft5_lo	.req	z23
+z_gft5_hi	.req	z24
+q_gft5_lo	.req	q23
+q_gft5_hi	.req	q24
+
+z_gft6_lo	.req	z25
+z_gft6_hi	.req	z26
+q_gft6_lo	.req	q25
+q_gft6_hi	.req	q26
+
+z_dest2	.req	z27
+z_dest3	.req	z28
+z_dest4	.req	z29
+z_dest5	.req	z30
+z_dest6	.req	z31
+
+gf_8vect_dot_prod_sve:
+	/* less than 16 bytes, return_fail */
+	cmp	x_len, #16
+	blt	.return_fail
+
+	/* save r19..r29 */
+	sub	sp, sp, #80	/* alignment */
+	stp	x19, x20, [sp]
+	stp	x21, x22, [sp, #16]
+	stp	x23, x24, [sp, #32]
+	stp	d8, d9, [sp, #48]
+	str	d10, [sp, #64]
+	str	x25, [sp, #72]
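+	/* z8..z10 overlap v8..v10, whose bottom 64 bits are callee-saved
+	 * under AAPCS64, so d8..d10 are spilled along with the GPRs */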
+
+	mov	z_mask0f.b, #0x0f	/* z_mask0f = 0x0F0F...0F */
+	mov	x_pos, #0
+	lsl	x_vec, x_vec, #3
+	ldp	x_dest1, x_dest2, [x_dest, #8*0]
+	ldp	x_dest3, x_dest4, [x_dest, #8*2]
+	ldp	x_dest5, x_dest6, [x_dest, #8*4]
+	ldp	x_dest7, x_dest8, [x_dest, #8*6]	/* x_dest8 reuses x_dest */
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+	whilelo	p0.b, x_pos, x_len
+	b.none	.return_pass
+
+	mov	x_vec_i, #0	/* clear x_vec_i */
+	ldr	x_ptr, [x_src, x_vec_i]	/* x_ptr: src base addr. */
+
+	mov	z_dest1.b, #0	/* clear z_dest1 */
+	mov	z_dest2.b, #0	/* clear z_dest2 */
+	mov	z_dest3.b, #0	/* clear z_dest3 */
+	mov	z_dest4.b, #0	/* clear z_dest4 */
+	mov	z_dest5.b, #0	/* clear z_dest5 */
+	mov	z_dest6.b, #0	/* clear z_dest6 */
+	mov	z_dest7.b, #0	/* clear z_dest7 */
+	mov	z_dest8.b, #0	/* clear z_dest8 */
+
+	/* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
+	mov	x_tbl1, x_tbl	/* reset x_tbl1 */
+	add	x_tbl2, x_tbl1, x_vec, LSL #2	/* reset x_tbl2 */
+	add	x_tbl3, x_tbl2, x_vec, LSL #2	/* reset x_tbl3 */
+	add	x_tbl4, x_tbl3, x_vec, LSL #2	/* reset x_tbl4 */
+	add	x_tbl5, x_tbl4, x_vec, LSL #2	/* reset x_tbl5 */
+	add	x_tbl6, x_tbl5, x_vec, LSL #2	/* reset x_tbl6 */
+	add	x_tbl7, x_tbl6, x_vec, LSL #2	/* reset x_tbl7 */
+	add	x_tbl8, x_tbl7, x_vec, LSL #2	/* reset x_tbl8 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+	/* load src data, governed by p0 */
+	ld1b	z_src.b, p0/z, [x_ptr, x_pos]	/* load from: src base + pos offset */
+	/* split 4-bit lo; 4-bit hi */
+	and	z_src_lo.d, z_src.d, z_mask0f.d
+	lsr	z_src_hi.b, z_src.b, #4
+
+	/* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
+	/* load gf_table's */
+	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32	/* x_tbl1 is post-added by #32 for each src vect */
+	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+	/* prefetch */
+	prfb	pldl2keep, p0, [x_tbl1]
+	prfb	pldl2keep, p0, [x_tbl2]
+
+	/* calc for next and prefetch */
+	add	x_vec_i, x_vec_i, #8	/* move x_vec_i to next */
+	ldr	x_ptr, [x_src, x_vec_i]	/* x_ptr: src base addr. */
+
+	/* dest 1 */
+	/* table indexing, ie. gf(2^8) multiplication */
+	tbl	z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+	tbl	z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+	/* exclusive or, ie. gf(2^8) add */
+	eor	z_dest1.d, z_gft1_lo.d, z_dest1.d
+	eor	z_dest1.d, z_gft1_hi.d, z_dest1.d
+
+	ldp	q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+	ldp	q_gft4_lo, q_gft4_hi, [x_tbl4], #32
+	prfb	pldl2keep, p0, [x_tbl3]
+	prfb	pldl2keep, p0, [x_tbl4]
+
+	/* dest 2 */
+	tbl	z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+	tbl	z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+	eor	z_dest2.d, z_gft2_lo.d, z_dest2.d
+	eor	z_dest2.d, z_gft2_hi.d, z_dest2.d
+
+	/* dest 3 */
+	tbl	z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b
+	tbl	z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b
+	eor	z_dest3.d, z_gft3_lo.d, z_dest3.d
+	eor	z_dest3.d, z_gft3_hi.d, z_dest3.d
+
+	ldp	q_gft5_lo, q_gft5_hi, [x_tbl5], #32
+	ldp	q_gft6_lo, q_gft6_hi, [x_tbl6], #32
+	prfb	pldl2keep, p0, [x_tbl5]
+	prfb	pldl2keep, p0, [x_tbl6]
+
+	/* dest 4 */
+	tbl	z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b
+	tbl	z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b
+	eor	z_dest4.d, z_gft4_lo.d, z_dest4.d
+	eor	z_dest4.d, z_gft4_hi.d, z_dest4.d
+
+	/* dest 5 */
+	tbl	z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b
+	tbl	z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b
+	eor	z_dest5.d, z_gft5_lo.d, z_dest5.d
+	eor	z_dest5.d, z_gft5_hi.d, z_dest5.d
+
+	ldp	q_gft7_lo, q_gft7_hi, [x_tbl7], #32
+	ldp	q_gft8_lo, q_gft8_hi, [x_tbl8], #32
+	prfb	pldl2keep, p0, [x_tbl7]
+	prfb	pldl2keep, p0, [x_tbl8]
+
+	/* dest 6 */
+	tbl	z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b
+	tbl	z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b
+	eor	z_dest6.d, z_gft6_lo.d, z_dest6.d
+	eor	z_dest6.d, z_gft6_hi.d, z_dest6.d
+
+	/* dest 7 */
+	tbl	z_gft7_lo.b, {z_gft7_lo.b}, z_src_lo.b
+	tbl	z_gft7_hi.b, {z_gft7_hi.b}, z_src_hi.b
+	eor	z_dest7.d, z_gft7_lo.d, z_dest7.d
+	eor	z_dest7.d, z_gft7_hi.d, z_dest7.d
+
+	/* dest 8 */
+	tbl	z_gft8_lo.b, {z_gft8_lo.b}, z_src_lo.b
+	tbl	z_gft8_hi.b, {z_gft8_hi.b}, z_src_hi.b
+	eor	z_dest8.d, z_gft8_lo.d, z_dest8.d
+	eor	z_dest8.d, z_gft8_hi.d, z_dest8.d
+
+	cmp	x_vec_i, x_vec
+	blt	.Lloopsve_vl_vects
+/* end of Loop 2 */
+
+	/* store dest data, governed by p0 */
+	st1b	z_dest1.b, p0, [x_dest1, x_pos]
+	st1b	z_dest2.b, p0, [x_dest2, x_pos]
+	st1b	z_dest3.b, p0, [x_dest3, x_pos]
+	st1b	z_dest4.b, p0, [x_dest4, x_pos]
+	st1b	z_dest5.b, p0, [x_dest5, x_pos]
+	st1b	z_dest6.b, p0, [x_dest6, x_pos]
+	st1b	z_dest7.b, p0, [x_dest7, x_pos]
+	st1b	z_dest8.b, p0, [x_dest8, x_pos]
+
+	/* increment one vector length */
+	incb	x_pos
+	b	.Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+	/* restore r19..r29 */
+	ldr	x25, [sp, #72]
+	ldr	d10, [sp, #64]
+	ldp	d8, d9, [sp, #48]
+	ldp	x23, x24, [sp, #32]
+	ldp	x21, x22, [sp, #16]
+	ldp	x19, x20, [sp]
+	add	sp, sp, #80
+
+	mov	w_ret, #0
+	ret
+
+.return_fail:
+	mov	w_ret, #1
+	ret
diff --git a/erasure_code/aarch64/gf_vect_dot_prod_sve.S b/erasure_code/aarch64/gf_vect_dot_prod_sve.S
new file mode 100644
index 00000000..7cf3d0df
--- /dev/null
+++ b/erasure_code/aarch64/gf_vect_dot_prod_sve.S
@@ -0,0 +1,128 @@
+/**************************************************************
+  Copyright (c) 2021 Linaro Ltd.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Huawei Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+.global gf_vect_dot_prod_sve
+.type gf_vect_dot_prod_sve, %function
+/* void gf_vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				unsigned char **src, unsigned char *dest);
+ */
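+/*
+ * Rough C model of what this routine computes (gf_mul_table() is a
+ * stand-in for the two-nibble table lookup used below, not a real
+ * ISA-L API):
+ *
+ *	for (pos = 0; pos < len; pos++) {
+ *		unsigned char s = 0;
+ *		for (j = 0; j < vlen; j++)
+ *			s ^= gf_mul_table(&gftbls[j * 32], src[j][pos]);
+ *		dest[pos] = s;
+ *	}
+ */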
+
+/* arguments */
+x_len	.req	x0	/* vector length */
+x_vec	.req	x1	/* number of source vectors (ie. data blocks) */
+x_tbl	.req	x2
+x_src	.req	x3
+x_dest1	.req	x4
+
+/* returns */
+w_ret	.req	w0
+
+/* local variables */
+x_vec_i	.req	x5
+x_ptr	.req	x6
+x_pos	.req	x7
+x_tbl1	.req	x8
+
+/* vectors */
+z_mask0f	.req	z0
+
+z_src	.req	z1
+z_src_lo	.req	z2
+z_src_hi	.req	z_src
+
+z_dest	.req	z3
+
+z_gft1_lo	.req	z4
+z_gft1_hi	.req	z5
+q_gft1_lo	.req	q4
+q_gft1_hi	.req	q5
+
+gf_vect_dot_prod_sve:
+	/* less than 16 bytes, return_fail */
+	cmp	x_len, #16
+	blt	.return_fail
+
+	mov	z_mask0f.b, #0x0f	/* z_mask0f = 0x0F0F...0F */
+	mov	x_pos, #0
+	lsl	x_vec, x_vec, #3
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+	whilelo	p0.b, x_pos, x_len
+	b.none	.return_pass
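+	/* whilelo sets one predicate bit per byte lane while x_pos < x_len;
+	 * the final partial vector runs under the same predicate, and the
+	 * loop exits once no lane is active */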
+
+	mov	z_dest.b, #0	/* clear z_dest */
+	mov	x_vec_i, #0	/* clear x_vec_i */
+	mov	x_tbl1, x_tbl	/* reset x_tbl1 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+	ldr	x_ptr, [x_src, x_vec_i]	/* x_ptr: src base addr. */
+	/* load src data, governed by p0 */
+	ld1b	z_src.b, p0/z, [x_ptr, x_pos]	/* load from: src base + pos offset */
+
+	add	x_vec_i, x_vec_i, #8	/* move x_vec_i to next */
+
+	/* load gf_table */
+	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32	/* x_tbl1 is added by #32
+							for each src vect */
+
+	/* split 4-bit lo; 4-bit hi */
+	and	z_src_lo.d, z_src.d, z_mask0f.d
+	lsr	z_src_hi.b, z_src.b, #4
+
+	/* table indexing, ie. gf(2^8) multiplication */
+	tbl	z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+	tbl	z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+
+	/* exclusive or, ie. gf(2^8) add */
+	eor	z_dest.d, z_gft1_lo.d, z_dest.d
+	eor	z_dest.d, z_gft1_hi.d, z_dest.d
+
+	cmp	x_vec_i, x_vec
+	blt	.Lloopsve_vl_vects
+
+	/* end of Loop 2 */
+	/* store dest data, governed by p0 */
+	st1b	z_dest.b, p0, [x_dest1, x_pos]
+	/* increment one vector length */
+	incb	x_pos
+
+	b	.Lloopsve_vl
+
+.return_pass:
+	mov	w_ret, #0
+	ret
+
+.return_fail:
+	mov	w_ret, #1
+	ret
diff --git a/erasure_code/aarch64/gf_vect_mad_sve.S b/erasure_code/aarch64/gf_vect_mad_sve.S
new file mode 100644
index 00000000..970cf237
--- /dev/null
+++ b/erasure_code/aarch64/gf_vect_mad_sve.S
@@ -0,0 +1,123 @@
+/**************************************************************
+  Copyright (c) 2021 Linaro Ltd.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Huawei Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+
+.global gf_vect_mad_sve
+.type gf_vect_mad_sve, %function
+
+/* gf_vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+		   unsigned char *src, unsigned char *dest);
+ */
+/* arguments */
+x_len	.req	x0
+x_vec	.req	x1
+x_vec_i	.req	x2
+x_tbl	.req	x3
+x_src	.req	x4
+x_dest	.req	x5
+
+/* returns */
+w_ret	.req	w0
+
+/* local variables */
+x_pos	.req	x6
+
+/* vectors */
+z_mask0f	.req	z0
+
+z_src	.req	z1
+z_src_lo	.req	z2
+z_src_hi	.req	z_src
+
+z_dest	.req	z3
+
+z_tmp1_lo	.req	z4
+z_tmp1_hi	.req	z5
+
+z_gft1_lo	.req	z6
+z_gft1_hi	.req	z7
+q_gft1_lo	.req	q6
+q_gft1_hi	.req	q7
+
+gf_vect_mad_sve:
+	/* less than 16 bytes, return_fail */
+	cmp	x_len, #16
+	blt	.return_fail
+
+	mov	z_mask0f.b, #0x0f	/* z_mask0f = 0x0F0F...0F */
+	add	x_tbl, x_tbl, x_vec_i, LSL #5	/* x_tbl += x_vec_i * 2^5 */
+
+	/* Load with NEON instruction ldp */
+	ldp	q_gft1_lo, q_gft1_hi, [x_tbl]
+
+	mov	x_pos, #0
+
+	/* vector length agnostic */
+.Lloopsve_vl:
+	whilelo	p0.b, x_pos, x_len
+	b.none	.return_pass
+
+	/* prefetch dest data */
+	prfb	pldl2strm, p0, [x_dest, x_pos]
+
+	/* load src data, governed by p0 */
+	ld1b	z_src.b, p0/z, [x_src, x_pos]
+
+	/* split 4-bit lo; 4-bit hi */
+	and	z_src_lo.d, z_src.d, z_mask0f.d
+	lsr	z_src_hi.b, z_src.b, #4
+
+	/* load dest data, governed by p0 */
+	ld1b	z_dest.b, p0/z, [x_dest, x_pos]
+
+	/* table indexing, ie. gf(2^8) multiplication */
+	tbl	z_tmp1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+	tbl	z_tmp1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+
+	/* exclusive or, ie. gf(2^8) add */
+	eor	z_dest.d, z_tmp1_lo.d, z_dest.d
+	eor	z_dest.d, z_tmp1_hi.d, z_dest.d
+
+	/* store dest data, governed by p0 */
+	st1b	z_dest.b, p0, [x_dest, x_pos]
+	/* increment one vector length */
+	incb	x_pos
+
+	b	.Lloopsve_vl
+
+.return_pass:
+	mov	w_ret, #0
+	ret
+
+.return_fail:
+	mov	w_ret, #1
+	ret
diff --git a/erasure_code/aarch64/gf_vect_mul_sve.S b/erasure_code/aarch64/gf_vect_mul_sve.S
new file mode 100644
index 00000000..195b5973
--- /dev/null
+++ b/erasure_code/aarch64/gf_vect_mul_sve.S
@@ -0,0 +1,117 @@
+/**************************************************************
+  Copyright (c) 2021 Linaro Ltd.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Huawei Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+.global gf_vect_mul_sve
+.type gf_vect_mul_sve, %function
+
+/* Refer to include/gf_vect_mul.h
+ *
+ * @param len   Length of vector in bytes. Must be aligned to 32B.
+ * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
+ * @param src   Pointer to src data array. Must be aligned to 32B.
+ * @param dest  Pointer to destination data array. Must be aligned to 32B.
+ * @returns 0 pass, other fail
+ *
+ * int gf_vect_mul(int len, unsigned char *gftbl, void *src, void *dest);
+ */
+
+/* arguments */
+x_len	.req	x0
+x_tbl	.req	x1
+x_src	.req	x2
+x_dest	.req	x3
+
+/* returns */
+w_ret	.req	w0
+
+/* local variables */
+x_pos	.req	x4
+
+/* vectors */
+z_mask0f	.req	z0
+
+z_src	.req	z1
+z_src_lo	.req	z2
+z_src_hi	.req	z_src	/* reuse */
+
+z_dest	.req	z3
+z_tmp1_lo	.req	z4
+z_tmp1_hi	.req	z_dest	/* reuse */
+
+z_gft1_lo	.req	z6
+z_gft1_hi	.req	z7
+q_gft1_lo	.req	q6
+q_gft1_hi	.req	q7
+
+gf_vect_mul_sve:
+	/* less than 32 bytes, return_fail */
+	cmp	x_len, #32
+	blt	.return_fail
+
+	mov	z_mask0f.b, #0x0f	/* z_mask0f = 0x0F0F...0F */
+	mov	x_pos, #0
+
+	/* Load with NEON instruction ldp */
+	ldp	q_gft1_lo, q_gft1_hi, [x_tbl]
+
+	/* vector length agnostic */
+.Lloopsve_vl:
+	whilelo	p0.b, x_pos, x_len
+	b.none	.return_pass
+
+	/* load src data, governed by p0 */
+	ld1b	z_src.b, p0/z, [x_src, x_pos]
+
+	/* split 4-bit lo; 4-bit hi */
+	and	z_src_lo.d, z_src.d, z_mask0f.d
+	lsr	z_src_hi.b, z_src.b, #4
+
+	/* table indexing, ie. gf(2^8) multiplication */
+	tbl	z_tmp1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+	tbl	z_tmp1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+	/* exclusive or, ie. gf(2^8) add */
+	eor	z_dest.d, z_tmp1_hi.d, z_tmp1_lo.d
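+	/* mul has no accumulate step: the two partial products are xor-ed
+	 * straight into z_dest and the old dest contents are never loaded */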
+
+	/* store dest data, governed by p0 */
+	st1b	z_dest.b, p0, [x_dest, x_pos]
+	/* increment one vector length */
+	incb	x_pos
+
+	b	.Lloopsve_vl
+
+.return_pass:
+	mov	w_ret, #0
+	ret
+
+.return_fail:
+	mov	w_ret, #1
+	ret