Skip to content

Commit c44e10e

Browse files
Merge branch 'develop' into export-shared-only
2 parents 142e39e + d92f151 commit c44e10e

File tree

124 files changed

+6750
-510
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

124 files changed

+6750
-510
lines changed

.github/workflows/arm64_graviton.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,13 +88,14 @@ jobs:
8888
run: |
8989
case "${{ matrix.build }}" in
9090
"make")
91-
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}"
91+
make -j$(nproc) DYNAMIC_ARCH=1 BUILD_BFLOAT16=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}"
9292
;;
9393
"cmake")
9494
mkdir build && cd build
9595
cmake -DDYNAMIC_ARCH=1 \
9696
-DNOFORTRAN=0 \
9797
-DBUILD_WITHOUT_LAPACK=0 \
98+
-DBUILD_BFLOAT16=1 \
9899
-DCMAKE_VERBOSE_MAKEFILE=ON \
99100
-DCMAKE_BUILD_TYPE=Release \
100101
-DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \

.github/workflows/windows_arm64.yml

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
name: Windows ARM64 CI
2+
3+
on:
4+
push:
5+
branches:
6+
- develop
7+
- release-**
8+
pull_request:
9+
branches:
10+
- develop
11+
- release-**
12+
13+
concurrency:
14+
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
15+
cancel-in-progress: true
16+
17+
permissions:
18+
contents: read # to fetch code (actions/checkout)
19+
20+
jobs:
21+
build:
22+
if: "github.repository == 'OpenMathLib/OpenBLAS'"
23+
runs-on: windows-11-arm
24+
steps:
25+
- name: Checkout repository
26+
uses: actions/checkout@v3
27+
28+
- name: Install LLVM for Win-ARM64
29+
shell: pwsh
30+
run: |
31+
Invoke-WebRequest https://github.com/llvm/llvm-project/releases/download/llvmorg-20.1.6/LLVM-20.1.6-woa64.exe -UseBasicParsing -OutFile LLVM-woa64.exe
32+
Start-Process -FilePath ".\LLVM-woa64.exe" -ArgumentList "/S" -Wait
33+
echo "C:\Program Files\LLVM\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
34+
35+
- name: Install CMake and Ninja for Win-ARM64
36+
shell: pwsh
37+
run: |
38+
Invoke-WebRequest https://github.com/Kitware/CMake/releases/download/v3.29.4/cmake-3.29.4-windows-arm64.msi -OutFile cmake-arm64.msi
39+
Start-Process msiexec.exe -ArgumentList "/i cmake-arm64.msi /quiet /norestart" -Wait
40+
echo "C:\Program Files\CMake\bin" >> $env:GITHUB_PATH
41+
42+
Invoke-WebRequest https://github.com/ninja-build/ninja/releases/download/v1.13.1/ninja-winarm64.zip -OutFile ninja-winarm64.zip
43+
Expand-Archive ninja-winarm64.zip -DestinationPath ninja
44+
Copy-Item ninja\ninja.exe -Destination "C:\Windows\System32"
45+
46+
- name: Configure OpenBLAS
47+
shell: cmd
48+
run: |
49+
CALL "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsarm64.bat"
50+
mkdir build
51+
cd build
52+
cmake .. -G Ninja ^
53+
-DCMAKE_BUILD_TYPE=Release ^
54+
-DTARGET=ARMV8 ^
55+
-DBINARY=64 ^
56+
-DCMAKE_C_COMPILER=clang-cl ^
57+
-DCMAKE_Fortran_COMPILER=flang-new ^
58+
-DBUILD_SHARED_LIBS=ON ^
59+
-DCMAKE_SYSTEM_PROCESSOR=arm64 ^
60+
-DCMAKE_SYSTEM_NAME=Windows ^
61+
-DCMAKE_INSTALL_PREFIX=C:/opt
62+
63+
- name: Build OpenBLAS
64+
shell: cmd
65+
run: |
66+
cd build
67+
ninja -j16
68+
69+
- name: Install OpenBLAS
70+
shell: cmd
71+
run: |
72+
cd build
73+
cmake --install .
74+
75+
- name: Run ctests
76+
shell: pwsh
77+
run: |
78+
$env:PATH = "C:\opt\bin;$env:PATH"
79+
cd build
80+
ctest
81+
82+

.gitignore

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ lapack-3.4.1.tgz
1313
lapack-3.4.2
1414
lapack-3.4.2.tgz
1515
lapack-netlib/make.inc
16-
lapack-netlib/lapacke/include/lapacke_mangling.h
1716
lapack-netlib/SRC/la_constants.mod
17+
lapack-netlib/SRC/la_xisnan.mod
1818
lapack-netlib/TESTING/testing_results.txt
1919
lapack-netlib/INSTALL/test*
2020
lapack-netlib/TESTING/xeigtstc
@@ -81,7 +81,10 @@ test/ZBLAT2.SUMM
8181
test/ZBLAT3.SUMM
8282
test/ZBLAT3_3M.SUMM
8383
test/SHBLAT3.SUMM
84+
test/SBBLAT2.SUMM
8485
test/SBBLAT3.SUMM
86+
test/BBLAT2.SUMM
87+
test/BBLAT3.SUMM
8588
test/cblat1
8689
test/cblat2
8790
test/cblat3
@@ -96,6 +99,9 @@ test/sblat3
9699
test/sblat3_3m
97100
test/test_shgemm
98101
test/test_sbgemm
102+
test/test_sbgemv
103+
test/test_bgemm
104+
test/test_bgemv
99105
test/zblat1
100106
test/zblat2
101107
test/zblat3

CMakeLists.txt

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,9 @@ endif ()
152152
if (NOT DEFINED BUILD_BFLOAT16)
153153
set (BUILD_BFLOAT16 false)
154154
endif ()
155+
if (NOT DEFINED BUILD_HFLOAT16)
156+
set (BUILD_HFLOAT16 false)
157+
endif ()
155158
# set which float types we want to build for
156159
if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
157160
# if none are defined, build for all
@@ -302,8 +305,8 @@ if (USE_OPENMP)
302305
endif()
303306
endif()
304307

305-
# Fix "Argument list too long" for macOS with Intel CPUs and DYNAMIC_ARCH turned on
306-
if(APPLE AND DYNAMIC_ARCH AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64"))
308+
# Fix "Argument list too long" for macOS with POWERPC or Intel CPUs
309+
if(APPLE AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64"))
307310
# Use response files
308311
set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
309312
# Always build static library first
@@ -537,18 +540,18 @@ if (BUILD_SHARED_LIBS OR DELETE_STATIC_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFI
537540
else ()
538541
if (NOT USE_PERL)
539542
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
540-
COMMAND sh ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
541-
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.so
543+
COMMAND sh ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
544+
COMMAND objcopy --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.so
542545
COMMENT "renaming symbols"
543-
)
546+
)
544547
else()
545548
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
546549
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
547-
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
550+
COMMAND objcopy --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
548551
COMMENT "renaming symbols"
549-
)
550-
endif()
552+
)
551553
endif()
554+
endif()
552555
endif()
553556
554557
if (BUILD_BENCHMARKS)

CONTRIBUTORS.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,18 @@ In chronological order:
251251
* Ye Tao <ye.tao@arm.com>
252252
* [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1
253253
* [2025-02-27] Add sbgemv_n_neon kernel
254+
* [2025-05-17] Implement prototype of BGEMM interface
254255

255256
* Abhishek Kumar <https://github.com/abhishek-iitmadras>
256-
* [2025-04-22] Optimise dot kernel for NEOVERSE V1
257+
* [2025-04-22] Optimise dot kernel for NEOVERSE V1
258+
259+
* Sharif Inamdar <sharif.inamdar@arm.com>
260+
* [2025-06-05] Optimize gemv_n_sve_v1x3 kernel
261+
262+
* Guoyuan Li <https://github.com/guoyuanplct>
263+
* [2025-04-11] Optimise gemv kernel for RISCV64_ZVL256B
264+
* [2025-05-01] Optimise zgemv kernel for RISCV64_ZVL256B
265+
* [2025-05-17] Optimise omatcopy/zomatcopy kernel for RISCV64_ZVL256B
266+
* [2025-05-29] Optimise axpby kernel for RISCV64_ZVL256B
267+
* [2025-06-05] Optimise hbmv kernel for RISCV64_ZVL256B
268+

Makefile.arm64

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,16 @@ endif
191191
endif
192192
endif
193193

194+
# Detect Ampere AmpereOne (ampere1, ampere1a) processors.
195+
ifeq ($(CORE), AMPERE1)
196+
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG)))
197+
CCOMMON_OPT += -march=armv8.6-a+crypto+crc+fp16+sha3+rng
198+
ifneq ($(F_COMPILER), NAG)
199+
FCOMMON_OPT += -march=armv8.6-a+crypto+crc+fp16+sha3+rng
200+
endif
201+
endif
202+
endif
203+
194204
# Use a53 tunings because a55 is only available in GCC>=8.1
195205
ifeq ($(CORE), CORTEXA55)
196206
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))

Makefile.power

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,16 @@ ifeq ($(CORE), POWER10)
1313
ifneq ($(C_COMPILER), PGI)
1414
ifeq ($(C_COMPILER), GCC)
1515
ifeq ($(GCCVERSIONGTEQ10), 1)
16-
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
16+
CCOMMON_OPT += -O3 -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
1717
else ifneq ($(GCCVERSIONGT4), 1)
1818
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
19-
CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
19+
CCOMMON_OPT += -O3 -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
2020
else
2121
$(warning your compiler is too old to fully support POWER10, getting a newer version of gcc is recommended)
22-
CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
22+
CCOMMON_OPT += -O3 -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
2323
endif
2424
else
25-
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
25+
CCOMMON_OPT += -O3 -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
2626
endif
2727
ifeq ($(F_COMPILER), IBM)
2828
FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize
@@ -34,7 +34,7 @@ endif
3434

3535
ifeq ($(CORE), POWER9)
3636
ifneq ($(C_COMPILER), PGI)
37-
CCOMMON_OPT += -Ofast -mvsx -fno-fast-math
37+
CCOMMON_OPT += -O3 -mvsx -fno-fast-math
3838
ifeq ($(C_COMPILER), GCC)
3939
ifneq ($(GCCVERSIONGT4), 1)
4040
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
@@ -70,7 +70,7 @@ endif
7070

7171
ifeq ($(CORE), POWER8)
7272
ifneq ($(C_COMPILER), PGI)
73-
CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
73+
CCOMMON_OPT += -O3 -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
7474
else
7575
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
7676
endif

Makefile.prebuild

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,11 @@ TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
6464
endif
6565

6666
ifeq ($(TARGET), RISCV64_ZVL256B)
67-
TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
67+
TARGET_FLAGS = -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
6868
endif
6969

7070
ifeq ($(TARGET), RISCV64_ZVL128B)
71-
TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
71+
TARGET_FLAGS = -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
7272
endif
7373

7474
ifeq ($(TARGET), RISCV64_GENERIC)

Makefile.riscv64

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@ CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d
77
FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
88
endif
99
ifeq ($(CORE), RISCV64_ZVL256B)
10-
CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d
11-
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
10+
CCOMMON_OPT += -march=rv64imafdcv_zvl256b_zvfh_zfh -mabi=lp64d
11+
FCOMMON_OPT += -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
1212
endif
1313
ifeq ($(CORE), RISCV64_ZVL128B)
14-
CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
15-
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
14+
CCOMMON_OPT += -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
15+
FCOMMON_OPT += -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
1616
endif
1717
ifeq ($(CORE), RISCV64_GENERIC)
1818
CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d

Makefile.rule

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,8 @@ COMMON_PROF = -pg
308308
# If you want to enable the experimental BFLOAT16 support
309309
# BUILD_BFLOAT16 = 1
310310

311+
# If you want to enable the experimental HFLOAT16 support
312+
# BUILD_HFLOAT16 = 1
311313

312314
# Set the thread number threshold beyond which the job array for the threaded level3 BLAS
313315
# will be allocated on the heap rather than the stack. (This array alone requires

0 commit comments

Comments
 (0)